From 1216f64e2569830d7d6ac05fbcfdce7baa2dadc3 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Mon, 12 Aug 2013 17:47:09 +0000
Subject: [PATCH] LUCENE-5166: PostingsHighlighter fails with
 IndexOutOfBoundsException

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1513207 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |  4 +
 .../PostingsHighlighter.java                  |  7 ++
 .../TestPostingsHighlighter.java              | 75 +++++++++++++++++++
 .../TestPostingsHighlighterRanking.java       |  2 +
 4 files changed, 88 insertions(+)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5859bb75f2a..e7c4c97abb2 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -111,6 +111,10 @@ Bug Fixes
   time we start the read loop (where we check the length) and when we actually do
   the read. (gsingers, yonik, Robert Muir, Uwe Schindler)
 
+* LUCENE-5166: PostingsHighlighter would throw IOOBE if a term spanned the maxLength
+  boundary, made it into the top-N and went to the formatter.
+  (Manuel Amoabeng, Michael McCandless, Robert Muir)
+
 API Changes
 
 * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap.
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
index 622a9f28d61..3d7b95d2aaf 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
@@ -506,6 +506,13 @@ public class PostingsHighlighter {
         throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
       }
       int end = dp.endOffset();
+      // LUCENE-5166: this hit would span the content limit... however more valid 
+      // hits may exist (they are sorted by start). so we pretend like we never 
+      // saw this term, it won't cause a passage to be added to passageQueue or anything.
+      assert EMPTY.startOffset() == Integer.MAX_VALUE;
+      if (start < contentLength && end > contentLength) {
+        continue;
+      }
       if (start >= current.endOffset) {
         if (current.startOffset >= 0) {
           // finalize current
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
index f72c59c8486..10e2c5fa2c5 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java
@@ -87,6 +87,81 @@ public class TestPostingsHighlighter extends LuceneTestCase {
     dir.close();
   }
   
+  public void testFormatWithMatchExceedingContentLength() throws Exception {
+          
+    int maxLength = 17;
+    String bodyText = "123 5678 01234 TEST";
+    
+    final Analyzer analyzer = new MockAnalyzer(random());
+    
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer);
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    final FieldType fieldType = new FieldType(TextField.TYPE_STORED);
+    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    final Field body = new Field("body", bodyText, fieldType);
+    
+    Document doc = new Document();
+    doc.add(body);
+    
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    
+    Query query = new TermQuery(new Term("body", "test"));
+    
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    
+    PostingsHighlighter highlighter = new PostingsHighlighter(maxLength);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+    
+    
+    assertEquals(1, snippets.length);
+    // LUCENE-5166: no snippet
+    assertEquals("123 5678 01234 TE", snippets[0]);
+    
+    ir.close();
+    dir.close();
+  }
+  
+  // simple test highlighting last word.
+  public void testHighlightLastWord() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    iwc.setMergePolicy(newLogMergePolicy());
+    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
+    
+    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
+    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
+    Field body = new Field("body", "", offsetsType);
+    Document doc = new Document();
+    doc.add(body);
+    
+    body.setStringValue("This is a test");
+    iw.addDocument(doc);
+    
+    IndexReader ir = iw.getReader();
+    iw.close();
+    
+    IndexSearcher searcher = newSearcher(ir);
+    PostingsHighlighter highlighter = new PostingsHighlighter();
+    Query query = new TermQuery(new Term("body", "test"));
+    TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER);
+    assertEquals(1, topDocs.totalHits);
+    String snippets[] = highlighter.highlight("body", query, searcher, topDocs);
+    assertEquals(1, snippets.length);
+    assertEquals("This is a <b>test</b>", snippets[0]);
+    
+    ir.close();
+    dir.close();
+  }
+  
   // simple test with one sentence documents.
   public void testOneSentence() throws Exception {
     Directory dir = newDirectory();
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
index 32fbf505166..0050e73ee85 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java
@@ -173,6 +173,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase {
         assertTrue(p.getNumMatches() > 0);
         assertTrue(p.getStartOffset() >= 0);
         assertTrue(p.getStartOffset() <= content.length());
+        assertTrue(p.getEndOffset() >= p.getStartOffset());
+        assertTrue(p.getEndOffset() <= content.length());
         // we use a very simple analyzer. so we can assert the matches are correct
         int lastMatchStart = -1;
         for (int i = 0; i < p.getNumMatches(); i++) {