From 1216f64e2569830d7d6ac05fbcfdce7baa2dadc3 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 12 Aug 2013 17:47:09 +0000 Subject: [PATCH] LUCENE-5166: PostingsHighlighter fails with IndexOutOfBoundsException git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1513207 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../PostingsHighlighter.java | 7 ++ .../TestPostingsHighlighter.java | 75 +++++++++++++++++++ .../TestPostingsHighlighterRanking.java | 2 + 4 files changed, 88 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5859bb75f2a..e7c4c97abb2 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -111,6 +111,10 @@ Bug Fixes time we start the read loop (where we check the length) and when we actually do the read. (gsingers, yonik, Robert Muir, Uwe Schindler) +* LUCENE-5166: PostingsHighlighter would throw IOOBE if a term spanned the maxLength + boundary, made it into the top-N and went to the formatter. + (Manuel Amoabeng, Michael McCandless, Robert Muir) + API Changes * LUCENE-5094: Add ramBytesUsed() to MultiDocValues.OrdinalMap. diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java index 622a9f28d61..3d7b95d2aaf 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java @@ -506,6 +506,13 @@ public class PostingsHighlighter { throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight"); } int end = dp.endOffset(); + // LUCENE-5166: this hit would span the content limit... however more valid + // hits may exist (they are sorted by start). so we pretend like we never + // saw this term, it won't cause a passage to be added to passageQueue or anything. + assert EMPTY.startOffset() == Integer.MAX_VALUE; + if (start < contentLength && end > contentLength) { + continue; + } if (start >= current.endOffset) { if (current.startOffset >= 0) { // finalize current diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java index f72c59c8486..10e2c5fa2c5 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighter.java @@ -87,6 +87,81 @@ public class TestPostingsHighlighter extends LuceneTestCase { dir.close(); } + public void testFormatWithMatchExceedingContentLength() throws Exception { + + int maxLength = 17; + String bodyText = "123 5678 01234 TEST"; + + final Analyzer analyzer = new MockAnalyzer(random()); + + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, analyzer); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + final FieldType fieldType = new FieldType(TextField.TYPE_STORED); + fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + final Field body = new Field("body", bodyText, fieldType); + + Document doc = new Document(); + doc.add(body); + + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + + Query query = new TermQuery(new Term("body", "test")); + + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + + PostingsHighlighter highlighter = new PostingsHighlighter(maxLength); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + + + assertEquals(1, snippets.length); + // LUCENE-5166: no snippet + assertEquals("123 5678 01234 TE", snippets[0]); + + ir.close(); + dir.close(); + } + + // simple test highlighting last word. + public void testHighlightLastWord() throws Exception { + Directory dir = newDirectory(); + IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())); + iwc.setMergePolicy(newLogMergePolicy()); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc); + + FieldType offsetsType = new FieldType(TextField.TYPE_STORED); + offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); + Field body = new Field("body", "", offsetsType); + Document doc = new Document(); + doc.add(body); + + body.setStringValue("This is a test"); + iw.addDocument(doc); + + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher searcher = newSearcher(ir); + PostingsHighlighter highlighter = new PostingsHighlighter(); + Query query = new TermQuery(new Term("body", "test")); + TopDocs topDocs = searcher.search(query, null, 10, Sort.INDEXORDER); + assertEquals(1, topDocs.totalHits); + String snippets[] = highlighter.highlight("body", query, searcher, topDocs); + assertEquals(1, snippets.length); + assertEquals("This is a test", snippets[0]); + + ir.close(); + dir.close(); + } + // simple test with one sentence documents. public void testOneSentence() throws Exception { Directory dir = newDirectory(); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java index 32fbf505166..0050e73ee85 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/postingshighlight/TestPostingsHighlighterRanking.java @@ -173,6 +173,8 @@ public class TestPostingsHighlighterRanking extends LuceneTestCase { assertTrue(p.getNumMatches() > 0); assertTrue(p.getStartOffset() >= 0); assertTrue(p.getStartOffset() <= content.length()); + assertTrue(p.getEndOffset() >= p.getStartOffset()); + assertTrue(p.getEndOffset() <= content.length()); // we use a very simple analyzer. so we can assert the matches are correct int lastMatchStart = -1; for (int i = 0; i < p.getNumMatches(); i++) {