diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 53961d1a936..e9beb7ba90b 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -103,6 +103,10 @@ New Features added support for simple numeric queries, such as , in contrib query parser (Vinicius Barros via Uwe Schindler) + * LUCENE-1824: Add BoundaryScanner interface and its implementation classes, + SimpleBoundaryScanner and BreakIteratorBoundaryScanner, so that FVH's FragmentsBuilder + can find "natural" boundary to make snippets. (Robert Muir, Koji Sekiguchi) + Changes in runtime behavior: * LUCENE-1768: StandardQueryConfigHandler now uses NumericFieldConfigListener diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java index 29d5a5a9964..bbda0e28d6d 100644 --- a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragmentsBuilder.java @@ -48,14 +48,24 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { }; public static final String[] COLORED_POST_TAGS = { "" }; private char multiValuedSeparator = ' '; + private final BoundaryScanner boundaryScanner; protected BaseFragmentsBuilder(){ this( new String[]{ "" }, new String[]{ "" } ); } protected BaseFragmentsBuilder( String[] preTags, String[] postTags ){ + this(preTags, postTags, new SimpleBoundaryScanner()); + } + + protected BaseFragmentsBuilder(BoundaryScanner boundaryScanner){ + this( new String[]{ "" }, new String[]{ "" }, boundaryScanner ); + } + + protected BaseFragmentsBuilder( String[] preTags, String[] postTags, BoundaryScanner boundaryScanner ){ this.preTags = preTags; this.postTags = postTags; + this.boundaryScanner = boundaryScanner; } static Object checkTagsArgument( Object tags ){ @@ -135,28 +145,35 @@ public abstract class BaseFragmentsBuilder implements FragmentsBuilder { protected String makeFragment( StringBuilder buffer, int[] index, Field[] values, WeightedFragInfo fragInfo, String[] preTags, String[] postTags, Encoder encoder ){ - final int s = fragInfo.startOffset; - return makeFragment( fragInfo, getFragmentSource( buffer, index, values, s, fragInfo.endOffset ), s, - preTags, postTags, encoder ); - } - - private String makeFragment( WeightedFragInfo fragInfo, String src, int s, - String[] preTags, String[] postTags, Encoder encoder ){ StringBuilder fragment = new StringBuilder(); + final int s = fragInfo.getStartOffset(); + int[] modifiedStartOffset = { s }; + String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset ); int srcIndex = 0; - for( SubInfo subInfo : fragInfo.subInfos ){ - for( Toffs to : subInfo.termsOffsets ){ + for( SubInfo subInfo : fragInfo.getSubInfos() ){ + for( Toffs to : subInfo.getTermsOffsets() ){ fragment - .append( encoder.encodeText( src.substring( srcIndex, to.startOffset - s ) ) ) - .append( getPreTag( preTags, subInfo.seqnum ) ) - .append( encoder.encodeText( src.substring( to.startOffset - s, to.endOffset - s ) ) ) - .append( getPostTag( postTags, subInfo.seqnum ) ); - srcIndex = to.endOffset - s; + .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) ) + .append( getPreTag( preTags, subInfo.getSeqnum() ) ) + .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) ) + .append( getPostTag( postTags, subInfo.getSeqnum() ) ); + srcIndex = to.getEndOffset() - modifiedStartOffset[0]; } } fragment.append( encoder.encodeText( src.substring( srcIndex ) ) ); return fragment.toString(); } + + protected String getFragmentSourceMSO( StringBuilder buffer, int[] index, Field[] values, + int startOffset, int endOffset, int[] modifiedStartOffset ){ + while( buffer.length() < endOffset && index[0] < values.length ){ + buffer.append( values[index[0]++].stringValue() ); + buffer.append( getMultiValuedSeparator() ); + } + int eo = buffer.length() < endOffset ? buffer.length() : boundaryScanner.findEndOffset( buffer, endOffset ); + modifiedStartOffset[0] = boundaryScanner.findStartOffset( buffer, startOffset ); + return buffer.substring( modifiedStartOffset[0], eo ); + } protected String getFragmentSource( StringBuilder buffer, int[] index, Field[] values, int startOffset, int endOffset ){ diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java new file mode 100644 index 00000000000..5e78996b750 --- /dev/null +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BoundaryScanner.java @@ -0,0 +1,40 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * + */ +public interface BoundaryScanner { + + /** + * Scan backward to find end offset. + * @param buffer scanned object + * @param start start offset to begin + * @return the found start offset + */ + public int findStartOffset( StringBuilder buffer, int start ); + + /** + * Scan forward to find start offset. + * @param buffer scanned object + * @param start start offset to begin + * @return the found end offset + */ + public int findEndOffset( StringBuilder buffer, int start ); +} diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java new file mode 100644 index 00000000000..5e116033416 --- /dev/null +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScanner.java @@ -0,0 +1,48 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.BreakIterator; + +/** + * A {@link BoundaryScanner} implementation that uses {@link BreakIterator} to find + * boundaries in the text. Boundary {@link Type} can be specified ({@link Type#SENTENCE} is the default). + */ +public class BreakIteratorBoundaryScanner implements BoundaryScanner { + + final BreakIterator bi; + + public BreakIteratorBoundaryScanner(BreakIterator bi){ + this.bi = bi; + } + + public int findStartOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 1 ) return start; + bi.setText(buffer.substring(0, start)); + bi.last(); + return bi.previous(); + } + + public int findEndOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 0 ) return start; + bi.setText(buffer.substring(start)); + return bi.next() + start; + } +} diff --git a/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java new file mode 100644 index 00000000000..bbfb1df3505 --- /dev/null +++ b/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScanner.java @@ -0,0 +1,81 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +public class SimpleBoundaryScanner implements BoundaryScanner { + + public static final int DEFAULT_MAX_SCAN = 20; + public static final Character[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'}; + + protected int maxScan; + protected Set boundaryChars; + + public SimpleBoundaryScanner(){ + this( DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS ); + } + + public SimpleBoundaryScanner( int maxScan ){ + this( maxScan, DEFAULT_BOUNDARY_CHARS ); + } + + public SimpleBoundaryScanner( Character[] boundaryChars ){ + this( DEFAULT_MAX_SCAN, boundaryChars ); + } + + public SimpleBoundaryScanner( int maxScan, Character[] boundaryChars ){ + this.maxScan = maxScan; + this.boundaryChars = new HashSet(); + this.boundaryChars.addAll(Arrays.asList(boundaryChars)); + } + + public SimpleBoundaryScanner( int maxScan, Set boundaryChars ){ + this.maxScan = maxScan; + this.boundaryChars = boundaryChars; + } + + public int findStartOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 1 ) return start; + int offset, count = maxScan; + for( offset = start; offset > 0 && count > 0; count-- ){ + // found? + if( boundaryChars.contains( buffer.charAt( offset - 1 ) ) ) return offset; + offset--; + } + // not found + return start; + } + + public int findEndOffset(StringBuilder buffer, int start) { + // avoid illegal start offset + if( start > buffer.length() || start < 0 ) return start; + int offset, count = maxScan; + //for( offset = start; offset <= buffer.length() && count > 0; count-- ){ + for( offset = start; offset < buffer.length() && count > 0; count-- ){ + // found? + if( boundaryChars.contains( buffer.charAt( offset ) ) ) return offset; + offset++; + } + // not found + return start; + } +} diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java new file mode 100644 index 00000000000..28447a12f49 --- /dev/null +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/BreakIteratorBoundaryScannerTest.java @@ -0,0 +1,91 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.text.BreakIterator; +import java.util.Locale; + +import org.apache.lucene.util.LuceneTestCase; + +public class BreakIteratorBoundaryScannerTest extends LuceneTestCase { + static final String TEXT = + "Apache Lucene(TM) is a high-performance, full-featured text search engine library written entirely in Java." + + "\nIt is a technology suitable for nearly any application that requires\n" + + "full-text search, especially cross-platform. \nApache Lucene is an open source project available for free download."; + + public void testOutOfRange() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.length() + 1; + assertEquals(start, scanner.findStartOffset(text, start)); + assertEquals(start, scanner.findEndOffset(text, start)); + start = 0; + assertEquals(start, scanner.findStartOffset(text, start)); + start = -1; + assertEquals(start, scanner.findEndOffset(text, start)); + } + + public void testWordBoundary() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getWordInstance(Locale.ENGLISH); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.indexOf("formance"); + int expected = TEXT.indexOf("high-performance"); + testFindStartOffset(text, start, expected, scanner); + + expected = TEXT.indexOf(", full"); + testFindEndOffset(text, start, expected, scanner); + } + + public void testSentenceBoundary() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getSentenceInstance(); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.indexOf("any application"); + int expected = TEXT.indexOf("It is a"); + testFindStartOffset(text, start, expected, scanner); + + expected = TEXT.indexOf("Apache Lucene is an open source"); + testFindEndOffset(text, start, expected, scanner); + } + + public void testLineBoundary() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BreakIterator bi = BreakIterator.getLineInstance(); + BoundaryScanner scanner = new BreakIteratorBoundaryScanner(bi); + + int start = TEXT.indexOf("any application"); + int expected = TEXT.indexOf("nearly"); + testFindStartOffset(text, start, expected, scanner); + + expected = TEXT.indexOf("application that requires"); + testFindEndOffset(text, start, expected, scanner); + } + + private void testFindStartOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception { + assertEquals(expected, scanner.findStartOffset(text, start)); + } + + private void testFindEndOffset(StringBuilder text, int start, int expected, BoundaryScanner scanner) throws Exception { + assertEquals(expected, scanner.findEndOffset(text, start)); + } +} diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java index 961bd4cbe8d..84dfe4a2060 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/ScoreOrderFragmentsBuilderTest.java @@ -36,8 +36,8 @@ public class ScoreOrderFragmentsBuilderTest extends AbstractTestCase { assertEquals( 3, f.length ); // check score order assertEquals( "c a a b b ", f[0] ); - assertEquals( "b b a b a b b b b b ", f[1] ); - assertEquals( "a b b b b b b b b b ", f[2] ); + assertEquals( "b b a b a b b b b b c", f[1] ); + assertEquals( "a b b b b b b b b b b", f[2] ); } private FieldFragList ffl(Query query, String indexValue ) throws Exception { diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java new file mode 100644 index 00000000000..1728ceb78a3 --- /dev/null +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleBoundaryScannerTest.java @@ -0,0 +1,55 @@ +package org.apache.lucene.search.vectorhighlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.util.LuceneTestCase; + +public class SimpleBoundaryScannerTest extends LuceneTestCase { + static final String TEXT = + "Apache Lucene(TM) is a high-performance, full-featured\ntext search engine library written entirely in Java."; + + public void testFindStartOffset() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BoundaryScanner scanner = new SimpleBoundaryScanner(); + + // test out of range + int start = TEXT.length() + 1; + assertEquals(start, scanner.findStartOffset(text, start)); + start = 0; + assertEquals(start, scanner.findStartOffset(text, start)); + + start = TEXT.indexOf("formance"); + int expected = TEXT.indexOf("high-performance"); + assertEquals(expected, scanner.findStartOffset(text, start)); + } + + public void testFindEndOffset() throws Exception { + StringBuilder text = new StringBuilder(TEXT); + BoundaryScanner scanner = new SimpleBoundaryScanner(); + + // test out of range + int start = TEXT.length() + 1; + assertEquals(start, scanner.findEndOffset(text, start)); + start = -1; + assertEquals(start, scanner.findEndOffset(text, start)); + + start = TEXT.indexOf("full-"); + int expected = TEXT.indexOf("\ntext"); + assertEquals(expected, scanner.findEndOffset(text, start)); + } +} diff --git a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java index 7171b68007f..f9236345634 100644 --- a/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java +++ b/lucene/contrib/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragmentsBuilderTest.java @@ -50,7 +50,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); // 3 snippets requested, but should be 2 assertEquals( 2, f.length ); - assertEquals( "a b b b b b b b b b ", f[0] ); + assertEquals( "a b b b b b b b b b b", f[0] ); assertEquals( "b b a b a b ", f[1] ); } @@ -63,8 +63,8 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); String[] f = sfb.createFragments( reader, 0, F, ffl, 3 ); assertEquals( 3, f.length ); - assertEquals( "a b b b b b b b b b ", f[0] ); - assertEquals( "b b a b a b b b b b ", f[1] ); + assertEquals( "a b b b b b b b b b b", f[0] ); + assertEquals( "b b a b a b b b b b c", f[1] ); assertEquals( "c a a b b ", f[2] ); } @@ -94,7 +94,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - assertEquals( " b c d e ", sfb.createFragment( reader, 0, F, ffl ) ); + assertEquals( "a b c d e ", sfb.createFragment( reader, 0, F, ffl ) ); } public void test1PhraseLongMV() throws Exception { @@ -106,7 +106,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - assertEquals( " most search engines use only one of these methods. Even the search engines that says they can use t", + assertEquals( "The most search engines use only one of these methods. Even the search engines that says they can use the", sfb.createFragment( reader, 0, F, ffl ) ); } @@ -119,7 +119,7 @@ public class SimpleFragmentsBuilderTest extends AbstractTestCase { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl, 100 ); SimpleFragmentsBuilder sfb = new SimpleFragmentsBuilder(); - assertEquals( "ssing speed, the ", sfb.createFragment( reader, 0, F, ffl ) ); + assertEquals( "processing speed, the ", sfb.createFragment( reader, 0, F, ffl ) ); } public void testUnstoredField() throws Exception {