From d453832bb8d5318a444053240f93bff17dc680d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Campinas?= Date: Thu, 20 Jun 2024 16:36:51 +0200 Subject: [PATCH] Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter by unordered matches (#13315) --- lucene/CHANGES.txt | 3 ++ .../search/DisjunctionMatchesIterator.java | 9 ++++ .../apache/lucene/search/MatchesIterator.java | 4 +- .../TestUnifiedHighlighterTermVec.java | 47 +++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index baa53ed5d90..13c7c2dcb3b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -179,6 +179,9 @@ Bug Fixes * GITHUB#12878: Fix the declared Exceptions of Expression#evaluate() to match those of DoubleValues#doubleValue(). (Uwe Schindler) +* GITHUB#12431: Fix IndexOutOfBoundsException thrown in DefaultPassageFormatter + by unordered matches. (Stephane Campinas) + Changes in Runtime Behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java index fa86505088d..13852b1b9ee 100644 --- a/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/DisjunctionMatchesIterator.java @@ -194,6 +194,15 @@ final class DisjunctionMatchesIterator implements MatchesIterator { new PriorityQueue(matches.size()) { @Override protected boolean lessThan(MatchesIterator a, MatchesIterator b) { + if (a.startPosition() == -1 && b.startPosition() == -1) { + try { + return a.startOffset() < b.startOffset() + || (a.startOffset() == b.startOffset() && a.endOffset() < b.endOffset()) + || (a.startOffset() == b.startOffset() && a.endOffset() == b.endOffset()); + } catch (IOException e) { + throw new IllegalArgumentException("Failed to retrieve term offset", e); + } + } return a.startPosition() < b.startPosition() || (a.startPosition() == b.startPosition() && a.endPosition() < b.endPosition()) || (a.startPosition() == b.startPosition() && a.endPosition() == b.endPosition()); diff --git a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java index 371ba0ee695..24da0246c6d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java +++ b/lucene/core/src/java/org/apache/lucene/search/MatchesIterator.java @@ -45,14 +45,14 @@ public interface MatchesIterator { boolean next() throws IOException; /** - * The start position of the current match + * The start position of the current match, or {@code -1} if positions are not available * *

Should only be called after {@link #next()} has returned {@code true} */ int startPosition(); /** - * The end position of the current match + * The end position of the current match, or {@code -1} if positions are not available * *

Should only be called after {@link #next()} has returned {@code true} */ diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java index 35a638e2aa1..84ea233affe 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterTermVec.java @@ -27,6 +27,7 @@ import java.util.Set; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FilterDirectoryReader; @@ -57,6 +58,52 @@ public class TestUnifiedHighlighterTermVec extends UnifiedHighlighterTestBase { super(randomFieldType(random())); } + public void testTermVecButNoPositions1() throws Exception { + testTermVecButNoPositions("x", "y", "y x", "y x"); + } + + public void testTermVecButNoPositions2() throws Exception { + testTermVecButNoPositions("y", "x", "y x", "y x"); + } + + public void testTermVecButNoPositions3() throws Exception { + testTermVecButNoPositions("zzz", "yyy", "zzz yyy", "zzz yyy"); + } + + public void testTermVecButNoPositions4() throws Exception { + testTermVecButNoPositions("zzz", "yyy", "yyy zzz", "yyy zzz"); + } + + public void testTermVecButNoPositions(String aaa, String bbb, String indexed, String expected) + throws Exception { + final FieldType tvNoPosType = new FieldType(TextField.TYPE_STORED); + tvNoPosType.setStoreTermVectors(true); + tvNoPosType.setStoreTermVectorOffsets(true); + tvNoPosType.freeze(); + + RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer); + + Field body = new Field("body", indexed, tvNoPosType); + Document document = new Document(); + document.add(body); + iw.addDocument(document); + try (IndexReader ir = iw.getReader()) { + iw.close(); + IndexSearcher searcher = newSearcher(ir); + BooleanQuery query = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("body", aaa)), BooleanClause.Occur.MUST) + .add(new TermQuery(new Term("body", bbb)), BooleanClause.Occur.MUST) + .build(); + TopDocs topDocs = searcher.search(query, 10); + assertEquals(1, topDocs.totalHits.value); + UnifiedHighlighter highlighter = UnifiedHighlighter.builder(searcher, indexAnalyzer).build(); + String[] snippets = highlighter.highlight("body", query, topDocs, 2); + assertEquals(1, snippets.length); + assertTrue(snippets[0], snippets[0].contains(expected)); + } + } + public void testFetchTermVecsOncePerDoc() throws IOException { RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);