From 66580ef07490006a398d99a8a2f0e027f4ac7f50 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 5 Apr 2013 16:39:46 +0000 Subject: [PATCH] LUCENE-4899: FastVectorHighlihgter failed with StringIndexOutOfBoundsException if a single highlight phrase or term was greater than the fragCharSize producing negative string offsets git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465032 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../vectorhighlight/BaseFragListBuilder.java | 127 +++++++----- .../vectorhighlight/FieldPhraseList.java | 2 +- .../FastVectorHighlighterTest.java | 180 +++++++++++++++++- .../SimpleFragListBuilderTest.java | 4 +- 5 files changed, 265 insertions(+), 52 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 742e1cdbe3b..cf52ebc0460 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -220,6 +220,10 @@ Bug Fixes with target<=current (in this case the behavior of advance is undefined). (Adrien Grand) +* LUCENE-4899: FastVectorHighlihgter failed with StringIndexOutOfBoundsException + if a single highlight phrase or term was greater than the fragCharSize producing + negative string offsets. (Simon Willnauer) + Documentation * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java index 82f17cc64e9..6699f6e9713 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java @@ -46,63 +46,98 @@ public abstract class BaseFragListBuilder implements FragListBuilder { this( MARGIN_DEFAULT ); } - protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){ - + protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){ if( fragCharSize < minFragCharSize ) throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." ); List wpil = new ArrayList(); - Iterator ite = fieldPhraseList.getPhraseList().iterator(); + IteratorQueue queue = new IteratorQueue(fieldPhraseList.getPhraseList().iterator()); WeightedPhraseInfo phraseInfo = null; int startOffset = 0; - boolean taken = false; - while( true ){ - if( !taken ){ - if( !ite.hasNext() ) break; - phraseInfo = ite.next(); - } - taken = false; - if( phraseInfo == null ) break; - + while((phraseInfo = queue.top()) != null){ // if the phrase violates the border of previous fragment, discard it and try next phrase - if( phraseInfo.getStartOffset() < startOffset ) continue; - + if( phraseInfo.getStartOffset() < startOffset ) { + queue.removeTop(); + continue; + } + wpil.clear(); - wpil.add( phraseInfo ); - int firstOffset = phraseInfo.getStartOffset(); - int st = phraseInfo.getStartOffset() - margin < startOffset ? - startOffset : phraseInfo.getStartOffset() - margin; - int en = st + fragCharSize; - if( phraseInfo.getEndOffset() > en ) - en = phraseInfo.getEndOffset(); - - int lastEndOffset = phraseInfo.getEndOffset(); - while( true ){ - if( ite.hasNext() ){ - phraseInfo = ite.next(); - taken = true; - if( phraseInfo == null ) break; - } - else - break; - if( phraseInfo.getEndOffset() <= en ){ - wpil.add( phraseInfo ); - lastEndOffset = phraseInfo.getEndOffset(); - } - else - break; + final int currentPhraseStartOffset = phraseInfo.getStartOffset(); + int currentPhraseEndOffset = phraseInfo.getEndOffset(); + int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset); + int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize); + if (acceptPhrase(queue.removeTop(), currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) { + wpil.add(phraseInfo); } - int matchLen = lastEndOffset - firstOffset; - //now recalculate the start and end position to "center" the result - int newMargin = (fragCharSize-matchLen)/2; - st = firstOffset - newMargin; - if(st fragCharSize prevent IAOOB here + spanStart = currentPhraseStartOffset - newMargin; + if (spanStart < startOffset) { + spanStart = startOffset; + } + // whatever is bigger here we grow this out + spanEnd = spanStart + Math.max(matchLen, fragCharSize); + startOffset = spanEnd; + fieldFragList.add(spanStart, spanEnd, wpil); } return fieldFragList; } + + /** + * A predicate to decide if the given {@link WeightedPhraseInfo} should be + * accepted as a highlighted phrase or if it should be discarded. + *

+ * The default implementation discards phrases that are composed of more than one term + * and where the matchLength exceeds the fragment character size. + * + * @param info the phrase info to accept + * @param matchLength the match length of the current phrase + * @param fragCharSize the configured fragment character size + * @return true if this phrase info should be accepted as a highligh phrase + */ + protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) { + return info.getTermsOffsets().size() <= 1 || matchLength <= fragCharSize; + } + + private static final class IteratorQueue { + private final Iterator iter; + private T top; + + public IteratorQueue(Iterator iter) { + this.iter = iter; + T removeTop = removeTop(); + assert removeTop == null; + } + + public T top() { + return top; + } + + public T removeTop() { + T currentTop = top; + if (iter.hasNext()) { + top = iter.next(); + } else { + top = null; + } + return currentTop; + } + + } + } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java index 4b564d227b6..74cceb471a3 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldPhraseList.java @@ -81,7 +81,7 @@ public class FieldPhraseList { if( ti != null ) nextMap = currMap.getTermMap( ti.getText() ); if( ti == null || nextMap == null ){ - if( ti != null ) + if( ti != null ) fieldTermStack.push( ti ); if( currMap.isValidTermOrPhrase( phraseCandidate ) ){ addIfNoOverlap( new WeightedPhraseInfo( phraseCandidate, currMap.getBoost(), currMap.getTermOrPhraseNumber() ) ); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java index de6b60e7f70..0208ab4b4e9 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java @@ -29,18 +29,19 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.queries.CommonTermsQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.highlight.SimpleSpanFragmenter; -import org.apache.lucene.search.highlight.TokenSources; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; public class FastVectorHighlighterTest extends LuceneTestCase { + public void testSimpleHighlightTest() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); @@ -71,6 +72,179 @@ public class FastVectorHighlighterTest extends LuceneTestCase { dir.close(); } + public void testPhraseHighlightLongTextTest() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + Document doc = new Document(); + FieldType type = new FieldType(TextField.TYPE_STORED); + type.setStoreTermVectorOffsets(true); + type.setStoreTermVectorPositions(true); + type.setStoreTermVectors(true); + type.freeze(); + Field text = new Field("text", + "Netscape was the general name for a series of web browsers originally produced by Netscape Communications Corporation, now a subsidiary of AOL The original browser was once the dominant browser in terms of usage share, but as a result of the first browser war it lost virtually all of its share to Internet Explorer Netscape was discontinued and support for all Netscape browsers and client products was terminated on March 1, 2008 Netscape Navigator was the name of Netscape\u0027s web browser from versions 1.0 through 4.8 The first beta release versions of the browser were released in 1994 and known as Mosaic and then Mosaic Netscape until a legal challenge from the National Center for Supercomputing Applications (makers of NCSA Mosaic, which many of Netscape\u0027s founders used to develop), led to the name change to Netscape Navigator The company\u0027s name also changed from Mosaic Communications Corporation to Netscape Communications Corporation The browser was easily the most advanced...", type); + doc.add(text); + writer.addDocument(doc); + FastVectorHighlighter highlighter = new FastVectorHighlighter(); + IndexReader reader = DirectoryReader.open(writer, true); + int docId = 0; + String field = "text"; + { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(field, "internet")), Occur.MUST); + query.add(new TermQuery(new Term(field, "explorer")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 128, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("first browser war it lost virtually all of its share to Internet Explorer Netscape was discontinued and support for all Netscape browsers", bestFragments[0]); + } + + { + PhraseQuery query = new PhraseQuery(); + query.add(new Term(field, "internet")); + query.add(new Term(field, "explorer")); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 128, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("first browser war it lost virtually all of its share to Internet Explorer Netscape was discontinued and support for all Netscape browsers", bestFragments[0]); + } + reader.close(); + writer.close(); + dir.close(); + } + + // see LUCENE-4899 + public void testPhraseHighlightTest() throws IOException { + Directory dir = newDirectory(); + IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()))); + Document doc = new Document(); + FieldType type = new FieldType(TextField.TYPE_STORED); + type.setStoreTermVectorOffsets(true); + type.setStoreTermVectorPositions(true); + type.setStoreTermVectors(true); + type.freeze(); + Field longTermField = new Field("long_term", "This is a test thisisaverylongwordandmakessurethisfails where foo is highlighed and should be highlighted", type); + Field noLongTermField = new Field("no_long_term", "This is a test where foo is highlighed and should be highlighted", type); + + doc.add(longTermField); + doc.add(noLongTermField); + writer.addDocument(doc); + FastVectorHighlighter highlighter = new FastVectorHighlighter(); + IndexReader reader = DirectoryReader.open(writer, true); + int docId = 0; + String field = "no_long_term"; + { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(field, "test")), Occur.MUST); + query.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("foo is highlighed and", bestFragments[0]); + } + { + BooleanQuery query = new BooleanQuery(); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term(field, "test")); + pq.add(new Term(field, "foo")); + pq.add(new Term(field, "highlighed")); + pq.setSlop(5); + query.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(pq, Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(0, bestFragments.length); + bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 30, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("a test where foo is highlighed and", bestFragments[0]); + + } + { + PhraseQuery query = new PhraseQuery(); + query.add(new Term(field, "test")); + query.add(new Term(field, "foo")); + query.add(new Term(field, "highlighed")); + query.setSlop(3); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(0, bestFragments.length); + bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 30, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("a test where foo is highlighed and", bestFragments[0]); + + } + { + PhraseQuery query = new PhraseQuery(); + query.add(new Term(field, "test")); + query.add(new Term(field, "foo")); + query.add(new Term(field, "highlighted")); + query.setSlop(30); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + assertEquals(0, bestFragments.length); + } + { + BooleanQuery query = new BooleanQuery(); + PhraseQuery pq = new PhraseQuery(); + pq.add(new Term(field, "test")); + pq.add(new Term(field, "foo")); + pq.add(new Term(field, "highlighed")); + pq.setSlop(5); + BooleanQuery inner = new BooleanQuery(); + inner.add(pq, Occur.MUST); + inner.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(inner, Occur.MUST); + query.add(pq, Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + assertEquals(0, bestFragments.length); + + bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 30, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("a test where foo is highlighed and", bestFragments[0]); + } + + field = "long_term"; + { + BooleanQuery query = new BooleanQuery(); + query.add(new TermQuery(new Term(field, + "thisisaverylongwordandmakessurethisfails")), Occur.MUST); + query.add(new TermQuery(new Term(field, "foo")), Occur.MUST); + query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST); + FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader); + String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader, + docId, field, 18, 1); + // highlighted results are centered + assertEquals(1, bestFragments.length); + assertEquals("thisisaverylongwordandmakessurethisfails", + bestFragments[0]); + } + reader.close(); + writer.close(); + dir.close(); + } + public void testCommonTermsQueryHighlightTest() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true))); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java index a1b83966c53..1833472f638 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java @@ -42,7 +42,7 @@ public class SimpleFragListBuilderTest extends AbstractTestCase { SimpleFragListBuilder sflb = new SimpleFragListBuilder(); FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "abcdefghijklmnopqrs")), "abcdefghijklmnopqrs" ), sflb.minFragCharSize ); assertEquals( 1, ffl.getFragInfos().size() ); - assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,18)", ffl.getFragInfos().get( 0 ).toString() ); + assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,19)", ffl.getFragInfos().get( 0 ).toString() ); } public void testSmallerFragSizeThanPhraseQuery() throws Exception { @@ -55,7 +55,7 @@ public class SimpleFragListBuilderTest extends AbstractTestCase { FieldFragList ffl = sflb.createFieldFragList( fpl(phraseQuery, "abcdefgh jklmnopqrs" ), sflb.minFragCharSize ); assertEquals( 1, ffl.getFragInfos().size() ); if (VERBOSE) System.out.println( ffl.getFragInfos().get( 0 ).toString() ); - assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(1,19)", ffl.getFragInfos().get( 0 ).toString() ); + assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(0,21)", ffl.getFragInfos().get( 0 ).toString() ); } public void test1TermIndex() throws Exception {