LUCENE-1824: add BoundaryScanner

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1166530 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-09-08 05:35:02 +00:00
parent 5ec2a2289a
commit 4ea767545b
9 changed files with 358 additions and 22 deletions

View File

@ -103,6 +103,10 @@ New Features
added support for simple numeric queries, such as <age:4>, in contrib
query parser (Vinicius Barros via Uwe Schindler)
* LUCENE-1824: Add BoundaryScanner interface and its implementation classes,
SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder
can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi)
Changes in runtime behavior:
* LUCENE-1768: StandardQueryConfigHandler now uses NumericFieldConfigListener

View File

@ -48,14 +48,24 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
};
public static final String[] COLORED_POST_TAGS = { "</b>" };
private char multiValuedSeparator = ' ';
private final BoundaryScanner boundaryScanner;
protected BaseFragmentsBuilder(){
this( new String[]{ "<b>" }, new String[]{ "</b>" } );
}
protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){
this(preTags, postTags, new SimpleBoundaryScanner());
}
protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){
this( new String[]{ "<b>" }, new String[]{ "</b>" }, boundaryScanner );
}
protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){
this.preTags = preTags;
this.postTags = postTags;
this.boundaryScanner = boundaryScanner;
}
static Object checkTagsArgument( Object tags ){
@ -135,28 +145,35 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder {
protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo,
String[] preTags, String[] postTags, Encoder encoder ){
final int s = fragInfo.startOffset;
return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s,
preTags, postTags, encoder );
}
private String makeFragment( WeightedFragInfo fragInfo, String src, int s,
String[] preTags, String[] postTags, Encoder encoder ){
StringBuilder fragment = new StringBuilder();
final int s = fragInfo.getStartOffset();
int[] modifiedStartOffset = { s };
String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
int srcIndex = 0;
for( SubInfo subInfo : fragInfo.subInfos ){
for( Toffs to : subInfo.termsOffsets ){
for( SubInfo subInfo : fragInfo.getSubInfos() ){
for( Toffs to : subInfo.getTermsOffsets() ){
fragment
.append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) )
.append( getPreTag( preTags, subInfo.seqnum ) )
.append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset - s ) ) )
.append( getPostTag( postTags, subInfo.seqnum ) );
srcIndex = to.endOffset - s;
.append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
.append( getPreTag( preTags, subInfo.getSeqnum() ) )
.append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
.append( getPostTag( postTags, subInfo.getSeqnum() ) );
srcIndex = to.getEndOffset() - modifiedStartOffset[0];
}
}
fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
return fragment.toString();
}
protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values,
int startOffset, int endOffset, int[] modifiedStartOffset ){
while( buffer.length() < endOffset && index[0] < values.length ){
buffer.append( values[index[0]++].stringValue() );
buffer.append( getMultiValuedSeparator() );
}
int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset );
modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset );
return buffer.substring( modifiedStartOffset[0], eo );
}
protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values,
int startOffset, int endOffset ){

View File

@ -0,0 +1,40 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
public interface BoundaryScanner {
/**
* Scan backward to find end offset.
* @param buffer scanned object
* @param start start offset to begin
* @return the found start offset
*/
public int findStartOffset( StringBuilder buffer, int start );
/**
* Scan forward to find start offset.
* @param buffer scanned object
* @param start start offset to begin
* @return the found end offset
*/
public int findEndOffset( StringBuilder buffer, int start );
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.BreakIterator;
/**
* A {@link BoundaryScanner} implementation that uses {@link BreakIterator} to find
* boundaries in the text. Boundary {@link Type} can be specified ({@link Type#SENTENCE} is the default).
*/
public class BreakIteratorBoundaryScanner implements BoundaryScanner {
final BreakIterator bi;
public BreakIteratorBoundaryScanner(BreakIterator bi){
this.bi = bi;
}
public int findStartOffset(StringBuilder buffer, int start) {
// avoid illegal start offset
if( start > buffer.length() || start < 1 ) return start;
bi.setText(buffer.substring(0, start));
bi.last();
return bi.previous();
}
public int findEndOffset(StringBuilder buffer, int start) {
// avoid illegal start offset
if( start > buffer.length() || start < 0 ) return start;
bi.setText(buffer.substring(start));
return bi.next() + start;
}
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
public class SimpleBoundaryScanner implements BoundaryScanner {
public static final int DEFAULT_MAX_SCAN = 20;
public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};
protected int maxScan;
protected Set<Character> boundaryChars;
public SimpleBoundaryScanner(){
this( DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS );
}
public SimpleBoundaryScanner( int maxScan ){
this( maxScan, DEFAULT_BOUNDARY_CHARS );
}
public SimpleBoundaryScanner( Character[] boundaryChars ){
this( DEFAULT_MAX_SCAN, boundaryChars );
}
public SimpleBoundaryScanner( int maxScan, Character[] boundaryChars ){
this.maxScan = maxScan;
this.boundaryChars = new HashSet<Character>();
this.boundaryChars.addAll(Arrays.asList(boundaryChars));
}
public SimpleBoundaryScanner( int maxScan, Set<Character> boundaryChars ){
this.maxScan = maxScan;
this.boundaryChars = boundaryChars;
}
public int findStartOffset(StringBuilder buffer, int start) {
// avoid illegal start offset
if( start > buffer.length() || start < 1 ) return start;
int offset, count = maxScan;
for( offset = start; offset > 0 && count > 0; count-- ){
// found?
if( boundaryChars.contains( buffer.charAt( offset - 1 ) ) ) return offset;
offset--;
}
// not found
return start;
}
public int findEndOffset(StringBuilder buffer, int start) {
// avoid illegal start offset
if( start > buffer.length() || start < 0 ) return start;
int offset, count = maxScan;
//for( offset = start; offset <= buffer.length() && count > 0; count-- ){
for( offset = start; offset < buffer.length() && count > 0; count-- ){
// found?
if( boundaryChars.contains( buffer.charAt( offset ) ) ) return offset;
offset++;
}
// not found
return start;
}
}

View File

@ -0,0 +1,91 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.BreakIterator;
import java.util.Locale;
import org.apache.lucene.util.LuceneTestCase;
public class BreakIteratorBoundaryScannerTest extends LuceneTestCase {
static final String TEXT =
"Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java." +
"\nIt is a technology suitable for nearly any application that requires\n" +
"full-text search, especially cross-platform. \nApache Lucene is an open source project available for free download.";
public void testOutOfRange() throws Exception {
StringBuilder text = new StringBuilder(TEXT);
BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH);
BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
int start = TEXT.length() + 1;
assertEquals(start, scanner.findStartOffset(text, start));
assertEquals(start, scanner.findEndOffset(text, start));
start = 0;
assertEquals(start, scanner.findStartOffset(text, start));
start = -1;
assertEquals(start, scanner.findEndOffset(text, start));
}
public void testWordBoundary() throws Exception {
StringBuilder text = new StringBuilder(TEXT);
BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH);
BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
int start = TEXT.indexOf("formance");
int expected = TEXT.indexOf("high-performance");
testFindStartOffset(text, start, expected, scanner);
expected = TEXT.indexOf(", full");
testFindEndOffset(text, start, expected, scanner);
}
public void testSentenceBoundary() throws Exception {
StringBuilder text = new StringBuilder(TEXT);
BreakIterator bi = BreakIterator.getSentenceInstance();
BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
int start = TEXT.indexOf("any application");
int expected = TEXT.indexOf("It is a");
testFindStartOffset(text, start, expected, scanner);
expected = TEXT.indexOf("Apache Lucene is an open source");
testFindEndOffset(text, start, expected, scanner);
}
public void testLineBoundary() throws Exception {
StringBuilder text = new StringBuilder(TEXT);
BreakIterator bi = BreakIterator.getLineInstance();
BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi);
int start = TEXT.indexOf("any application");
int expected = TEXT.indexOf("nearly");
testFindStartOffset(text, start, expected, scanner);
expected = TEXT.indexOf("application that requires");
testFindEndOffset(text, start, expected, scanner);
}
private void testFindStartOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception {
assertEquals(expected, scanner.findStartOffset(text, start));
}
private void testFindEndOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception {
assertEquals(expected, scanner.findEndOffset(text, start));
}
}

View File

@ -36,8 +36,8 @@ public class ScoreOrderFragmentsBuilderTest extends AbstractTestCase {
assertEquals( 3, f.length );
// check score order
assertEquals( "<b>c</b> <b>a</b> <b>a</b> b b ", f[0] );
assertEquals( "b b <b>a</b> b <b>a</b> b b b b b ", f[1] );
assertEquals( "<b>a</b> b b b b b b b b b ", f[2] );
assertEquals( "b b <b>a</b> b <b>a</b> b b b b b c", f[1] );
assertEquals( "<b>a</b> b b b b b b b b b b", f[2] );
}
private FieldFragList ffl(Query query, String indexValue ) throws Exception {

View File

@ -0,0 +1,55 @@
package org.apache.lucene.search.vectorhighlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
public class SimpleBoundaryScannerTest extends LuceneTestCase {
static final String TEXT =
"Apache Lucene(TM) is a high-performance, full-featured\ntext search engine library written entirely in Java.";
public void testFindStartOffset() throws Exception {
StringBuilder text = new StringBuilder(TEXT);
BoundaryScanner scanner = new SimpleBoundaryScanner();
// test out of range
int start = TEXT.length() + 1;
assertEquals(start, scanner.findStartOffset(text, start));
start = 0;
assertEquals(start, scanner.findStartOffset(text, start));
start = TEXT.indexOf("formance");
int expected = TEXT.indexOf("high-performance");
assertEquals(expected, scanner.findStartOffset(text, start));
}
public void testFindEndOffset() throws Exception {
StringBuilder text = new StringBuilder(TEXT);
BoundaryScanner scanner = new SimpleBoundaryScanner();
// test out of range
int start = TEXT.length() + 1;
assertEquals(start, scanner.findEndOffset(text, start));
start = -1;
assertEquals(start, scanner.findEndOffset(text, start));
start = TEXT.indexOf("full-");
int expected = TEXT.indexOf("\ntext");
assertEquals(expected, scanner.findEndOffset(text, start));
}
}

View File

@ -50,7 +50,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
// 3 snippets requested, but should be 2
assertEquals( 2, f.length );
assertEquals( "<b>a</b> b b b b b b b b b ", f[0] );
assertEquals( "<b>a</b> b b b b b b b b b b", f[0] );
assertEquals( "b b <b>a</b> b <b>a</b> b ", f[1] );
}
@ -63,8 +63,8 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
String[] f = sfb.createFragments( reader, 0, F, ffl, 3 );
assertEquals( 3, f.length );
assertEquals( "<b>a</b> b b b b b b b b b ", f[0] );
assertEquals( "b b <b>a</b> b <b>a</b> b b b b b ", f[1] );
assertEquals( "<b>a</b> b b b b b b b b b b", f[0] );
assertEquals( "b b <b>a</b> b <b>a</b> b b b b b c", f[1] );
assertEquals( "<b>c</b> <b>a</b> <b>a</b> b b ", f[2] );
}
@ -94,7 +94,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
assertEquals( " b c <b>d</b> e ", sfb.createFragment( reader, 0, F, ffl ) );
assertEquals( "a b c <b>d</b> e ", sfb.createFragment( reader, 0, F, ffl ) );
}
public void test1PhraseLongMV() throws Exception {
@ -106,7 +106,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
assertEquals( " most <b>search engines</b> use only one of these methods. Even the <b>search engines</b> that says they can use t",
assertEquals( "The most <b>search engines</b> use only one of these methods. Even the <b>search engines</b> that says they can use the",
sfb.createFragment( reader, 0, F, ffl ) );
}
@ -119,7 +119,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase {
SimpleFragListBuilder sflb = new SimpleFragListBuilder();
FieldFragList ffl = sflb.createFieldFragList( fpl, 100 );
SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder();
assertEquals( "ssing <b>speed</b>, the ", sfb.createFragment( reader, 0, F, ffl ) );
assertEquals( "processing <b>speed</b>, the ", sfb.createFragment( reader, 0, F, ffl ) );
}
public void testUnstoredField() throws Exception {