LUCENE-4899: FastVectorHighlihgter failed with StringIndexOutOfBoundsException if a single highlight phrase or term was greater than the fragCharSize producing negative string offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1465032 13f79535-47bb-0310-9956-ffa450edef68
2013-04-05 16:39:46 +00:00 · 2013-04-05 16:39:46 +00:00 · 66580ef074
parent 0b999ebb46
commit 66580ef074
5 changed files with 265 additions and 52 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -220,6 +220,10 @@ Bug Fixes
  with target<=current (in this case the behavior of advance is undefined).
  (Adrien Grand)

+* LUCENE-4899: FastVectorHighlihgter failed with StringIndexOutOfBoundsException
+  if a single highlight phrase or term was greater than the fragCharSize producing
+  negative string offsets. (Simon Willnauer)
+
 Documentation

 * LUCENE-4841: Added example SimpleSortedSetFacetsExample to show how
--- a/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/vectorhighlight/BaseFragListBuilder.java
@ -47,62 +47,97 @@ public abstract class BaseFragListBuilder implements FragListBuilder {
  }
  
 protected FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, FieldFragList fieldFragList, int fragCharSize ){
-    
    if( fragCharSize < minFragCharSize )
      throw new IllegalArgumentException( "fragCharSize(" + fragCharSize + ") is too small. It must be " + minFragCharSize + " or higher." );
    
    List<WeightedPhraseInfo> wpil = new ArrayList<WeightedPhraseInfo>();
-    Iterator<WeightedPhraseInfo> ite = fieldPhraseList.getPhraseList().iterator();
+    IteratorQueue<WeightedPhraseInfo> queue = new IteratorQueue<WeightedPhraseInfo>(fieldPhraseList.getPhraseList().iterator());
    WeightedPhraseInfo phraseInfo = null;
    int startOffset = 0;
-    boolean taken = false;
-    while( true ){
-      if( !taken ){
-        if( !ite.hasNext() ) break;
-        phraseInfo = ite.next();
-      }
-      taken = false;
-      if( phraseInfo == null ) break;
-
+    while((phraseInfo = queue.top()) != null){
      // if the phrase violates the border of previous fragment, discard it and try next phrase
-      if( phraseInfo.getStartOffset() < startOffset ) continue;
+      if( phraseInfo.getStartOffset() < startOffset )  {
+        queue.removeTop();
+        continue;
+      }
      
      wpil.clear();
-      wpil.add( phraseInfo );
-      int firstOffset = phraseInfo.getStartOffset();
-      int st = phraseInfo.getStartOffset() - margin < startOffset ?
-          startOffset : phraseInfo.getStartOffset() - margin;
-      int en = st + fragCharSize;
-      if( phraseInfo.getEndOffset() > en )
-        en = phraseInfo.getEndOffset();
+      final int currentPhraseStartOffset = phraseInfo.getStartOffset();
+      int currentPhraseEndOffset = phraseInfo.getEndOffset();
+      int spanStart = Math.max(currentPhraseStartOffset - margin, startOffset);
+      int spanEnd = Math.max(currentPhraseEndOffset, spanStart + fragCharSize);
+      if (acceptPhrase(queue.removeTop(),  currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
+        wpil.add(phraseInfo);
+      }
+      while((phraseInfo = queue.top()) != null) { // pull until we crossed the current spanEnd
+        if (phraseInfo.getEndOffset() <= spanEnd) {
+          currentPhraseEndOffset = phraseInfo.getEndOffset();
+          if (acceptPhrase(queue.removeTop(),  currentPhraseEndOffset - currentPhraseStartOffset, fragCharSize)) {
+            wpil.add(phraseInfo);
+          }
+        } else {
+          break;
+        }
+      }
+      if (wpil.isEmpty()) {
+        continue;
+      }
      
-      int lastEndOffset = phraseInfo.getEndOffset();
-      while( true ){
-        if( ite.hasNext() ){
-          phraseInfo = ite.next();
-          taken = true;
-          if( phraseInfo == null ) break;
+      final int matchLen = currentPhraseEndOffset - currentPhraseStartOffset;
+      // now recalculate the start and end position to "center" the result
+      final int newMargin = Math.max(0, (fragCharSize-matchLen)/2); // matchLen can be > fragCharSize prevent IAOOB here
+      spanStart = currentPhraseStartOffset - newMargin;
+      if (spanStart < startOffset) {
+        spanStart = startOffset;
      }
-        else
-          break;
-        if( phraseInfo.getEndOffset() <= en ){
-          wpil.add( phraseInfo );
-          lastEndOffset = phraseInfo.getEndOffset();
-        }
-        else
-          break;
-      }
-      int matchLen = lastEndOffset - firstOffset;
-      //now recalculate the start and end position to "center" the result
-      int newMargin = (fragCharSize-matchLen)/2;
-      st = firstOffset - newMargin;
-      if(st<startOffset){
-        st = startOffset;
-      }
-      en = st+fragCharSize;
-      startOffset = en;
-      fieldFragList.add( st, en, wpil );
+      // whatever is bigger here we grow this out
+      spanEnd = spanStart + Math.max(matchLen, fragCharSize);  
+      startOffset = spanEnd;
+      fieldFragList.add(spanStart, spanEnd, wpil);
    }
    return fieldFragList;
  }
+ 
+  /**
+   * A predicate to decide if the given {@link WeightedPhraseInfo} should be
+   * accepted as a highlighted phrase or if it should be discarded.
+   * <p>
+   * The default implementation discards phrases that are composed of more than one term
+   * and where the matchLength exceeds the fragment character size.
+   * 
+   * @param info the phrase info to accept
+   * @param matchLength the match length of the current phrase
+   * @param fragCharSize the configured fragment character size
+   * @return <code>true</code> if this phrase info should be accepted as a highligh phrase
+   */
+ protected boolean acceptPhrase(WeightedPhraseInfo info, int matchLength, int fragCharSize) {
+   return info.getTermsOffsets().size() <= 1 ||  matchLength <= fragCharSize;
+ }
+ 
+ private static final class IteratorQueue<T> {
+   private final Iterator<T> iter;
+   private T top;
+   
+   public IteratorQueue(Iterator<T> iter) {
+     this.iter = iter;
+     T removeTop = removeTop();
+     assert removeTop == null;
+   }
+   
+   public T top() {
+     return top;
+   }
+   
+   public T removeTop() {
+     T currentTop = top;
+     if (iter.hasNext()) {
+       top = iter.next();
+     } else {
+       top = null;
+     }
+     return currentTop;
+   }
+   
+ }
+ 
 }
--- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/FastVectorHighlighterTest.java
@ -29,18 +29,19 @@ import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.CommonTermsQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.BooleanClause.Occur;
-import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
-import org.apache.lucene.search.highlight.TokenSources;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;


 public class FastVectorHighlighterTest extends LuceneTestCase {
  
+  
  public void testSimpleHighlightTest() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
@ -71,6 +72,179 @@ public class FastVectorHighlighterTest extends LuceneTestCase {
    dir.close();
  }
  
+  public void testPhraseHighlightLongTextTest() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+    Document doc = new Document();
+    FieldType type = new FieldType(TextField.TYPE_STORED);
+    type.setStoreTermVectorOffsets(true);
+    type.setStoreTermVectorPositions(true);
+    type.setStoreTermVectors(true);
+    type.freeze();
+    Field text = new Field("text", 
+        "Netscape was the general name for a series of web browsers originally produced by Netscape Communications Corporation, now a subsidiary of AOL The original browser was once the dominant browser in terms of usage share, but as a result of the first browser war it lost virtually all of its share to Internet Explorer Netscape was discontinued and support for all Netscape browsers and client products was terminated on March 1, 2008 Netscape Navigator was the name of Netscape\u0027s web browser from versions 1.0 through 4.8 The first beta release versions of the browser were released in 1994 and known as Mosaic and then Mosaic Netscape until a legal challenge from the National Center for Supercomputing Applications (makers of NCSA Mosaic, which many of Netscape\u0027s founders used to develop), led to the name change to Netscape Navigator The company\u0027s name also changed from Mosaic Communications Corporation to Netscape Communications Corporation The browser was easily the most advanced...", type);
+    doc.add(text);
+    writer.addDocument(doc);
+    FastVectorHighlighter highlighter = new FastVectorHighlighter();
+    IndexReader reader = DirectoryReader.open(writer, true);
+    int docId = 0;
+    String field = "text";
+    {
+      BooleanQuery query = new BooleanQuery();
+      query.add(new TermQuery(new Term(field, "internet")), Occur.MUST);
+      query.add(new TermQuery(new Term(field, "explorer")), Occur.MUST);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 128, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("first browser war it lost virtually all of its share to <b>Internet</b> <b>Explorer</b> Netscape was discontinued and support for all Netscape browsers", bestFragments[0]);
+    }
+    
+    {
+      PhraseQuery query = new PhraseQuery();
+      query.add(new Term(field, "internet"));
+      query.add(new Term(field, "explorer"));
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 128, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("first browser war it lost virtually all of its share to <b>Internet Explorer</b> Netscape was discontinued and support for all Netscape browsers", bestFragments[0]);
+    }
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+  
+  // see LUCENE-4899
+  public void testPhraseHighlightTest() throws IOException {
+    Directory dir = newDirectory();
+    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
+    Document doc = new Document();
+    FieldType type = new FieldType(TextField.TYPE_STORED);
+    type.setStoreTermVectorOffsets(true);
+    type.setStoreTermVectorPositions(true);
+    type.setStoreTermVectors(true);
+    type.freeze();
+    Field longTermField = new Field("long_term", "This is a test thisisaverylongwordandmakessurethisfails where foo is highlighed and should be highlighted", type);
+    Field noLongTermField = new Field("no_long_term", "This is a test where foo is highlighed and should be highlighted", type);
+
+    doc.add(longTermField);
+    doc.add(noLongTermField);
+    writer.addDocument(doc);
+    FastVectorHighlighter highlighter = new FastVectorHighlighter();
+    IndexReader reader = DirectoryReader.open(writer, true);
+    int docId = 0;
+    String field = "no_long_term";
+    {
+      BooleanQuery query = new BooleanQuery();
+      query.add(new TermQuery(new Term(field, "test")), Occur.MUST);
+      query.add(new TermQuery(new Term(field, "foo")), Occur.MUST);
+      query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 18, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("<b>foo</b> is <b>highlighed</b> and", bestFragments[0]);
+    }
+    {
+      BooleanQuery query = new BooleanQuery();
+      PhraseQuery pq = new PhraseQuery();
+      pq.add(new Term(field, "test"));
+      pq.add(new Term(field, "foo"));
+      pq.add(new Term(field, "highlighed"));
+      pq.setSlop(5);
+      query.add(new TermQuery(new Term(field, "foo")), Occur.MUST);
+      query.add(pq, Occur.MUST);
+      query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 18, 1);
+      // highlighted results are centered
+      assertEquals(0, bestFragments.length);
+      bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 30, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("a <b>test</b> where <b>foo</b> is <b>highlighed</b> and", bestFragments[0]);
+      
+    }
+    {
+      PhraseQuery query = new PhraseQuery();
+      query.add(new Term(field, "test"));
+      query.add(new Term(field, "foo"));
+      query.add(new Term(field, "highlighed"));
+      query.setSlop(3);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 18, 1);
+      // highlighted results are centered
+      assertEquals(0, bestFragments.length);
+      bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 30, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("a <b>test</b> where <b>foo</b> is <b>highlighed</b> and", bestFragments[0]);
+      
+    }
+    {
+      PhraseQuery query = new PhraseQuery();
+      query.add(new Term(field, "test"));
+      query.add(new Term(field, "foo"));
+      query.add(new Term(field, "highlighted"));
+      query.setSlop(30);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 18, 1);
+      assertEquals(0, bestFragments.length);
+    }
+    {
+      BooleanQuery query = new BooleanQuery();
+      PhraseQuery pq = new PhraseQuery();
+      pq.add(new Term(field, "test"));
+      pq.add(new Term(field, "foo"));
+      pq.add(new Term(field, "highlighed"));
+      pq.setSlop(5);
+      BooleanQuery inner = new BooleanQuery();
+      inner.add(pq, Occur.MUST);
+      inner.add(new TermQuery(new Term(field, "foo")), Occur.MUST);
+      query.add(inner, Occur.MUST);
+      query.add(pq, Occur.MUST);
+      query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 18, 1);
+      assertEquals(0, bestFragments.length);
+      
+      bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 30, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("a <b>test</b> where <b>foo</b> is <b>highlighed</b> and", bestFragments[0]);
+    }
+    
+    field = "long_term";
+    {
+      BooleanQuery query = new BooleanQuery();
+      query.add(new TermQuery(new Term(field,
+          "thisisaverylongwordandmakessurethisfails")), Occur.MUST);
+      query.add(new TermQuery(new Term(field, "foo")), Occur.MUST);
+      query.add(new TermQuery(new Term(field, "highlighed")), Occur.MUST);
+      FieldQuery fieldQuery = highlighter.getFieldQuery(query, reader);
+      String[] bestFragments = highlighter.getBestFragments(fieldQuery, reader,
+          docId, field, 18, 1);
+      // highlighted results are centered
+      assertEquals(1, bestFragments.length);
+      assertEquals("<b>thisisaverylongwordandmakessurethisfails</b>",
+          bestFragments[0]);
+    }
+    reader.close();
+    writer.close();
+    dir.close();
+  }
+  
  public void testCommonTermsQueryHighlightTest() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(TEST_VERSION_CURRENT,  new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET, true)));
--- a/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/vectorhighlight/SimpleFragListBuilderTest.java
@ -42,7 +42,7 @@ public class SimpleFragListBuilderTest extends AbstractTestCase {
    SimpleFragListBuilder sflb = new SimpleFragListBuilder();
    FieldFragList ffl = sflb.createFieldFragList( fpl(new TermQuery(new Term(F, "abcdefghijklmnopqrs")), "abcdefghijklmnopqrs" ), sflb.minFragCharSize );
    assertEquals( 1, ffl.getFragInfos().size() );
-    assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,18)", ffl.getFragInfos().get( 0 ).toString() );
+    assertEquals( "subInfos=(abcdefghijklmnopqrs((0,19)))/1.0(0,19)", ffl.getFragInfos().get( 0 ).toString() );
  }
  
  public void testSmallerFragSizeThanPhraseQuery() throws Exception {
@ -55,7 +55,7 @@ public class SimpleFragListBuilderTest extends AbstractTestCase {
    FieldFragList ffl = sflb.createFieldFragList( fpl(phraseQuery, "abcdefgh   jklmnopqrs" ), sflb.minFragCharSize );
    assertEquals( 1, ffl.getFragInfos().size() );
    if (VERBOSE) System.out.println( ffl.getFragInfos().get( 0 ).toString() );
-    assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(1,19)", ffl.getFragInfos().get( 0 ).toString() );
+    assertEquals( "subInfos=(abcdefghjklmnopqrs((0,21)))/1.0(0,21)", ffl.getFragInfos().get( 0 ).toString() );
  }
  
  public void test1TermIndex() throws Exception {