diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6812d2d729a..a60346293d1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -48,6 +48,9 @@ Improvements * LUCENE-9538: Detect polygon self-intersections in the Tessellator. (Ignacio Vera) * LUCENE-10275: Speed up MultiRangeQuery by using an interval tree. (Ignacio Vera) + +* LUCENE-10229: Unify behaviour of match offsets for interval queries on fields + with or without offsets enabled. (Patrick Zhai) Optimizations --------------------- diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java index fa6eb1dc504..3ac4ec6a60c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchHighlighter.java @@ -332,8 +332,6 @@ public class TestMatchHighlighter extends LuceneTestCase { } }; - // TODO: the highlights are different when the field is indexed with offsets. Weird. - // String field = FLD_TEXT1; String field = FLD_TEXT2; new IndexBuilder(this::toField) // Just one document and multiple interval queries. @@ -541,6 +539,235 @@ public class TestMatchHighlighter extends LuceneTestCase { }); } + /** + * Almost the same as the one above, make sure the fields indexed with offsets are also + * highlighted correctly + */ + @Test + public void testIntervalFunctionsWithOffsetField() throws Exception { + Analyzer analyzer = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new StandardTokenizer(); + TokenStream ts = tokenizer; + ts = new LowerCaseFilter(ts); + return new TokenStreamComponents(tokenizer, ts); + } + }; + + String field = FLD_TEXT1; + new IndexBuilder(this::toField) + // Just one document and multiple interval queries. + .doc(field, "The quick brown fox jumps over the lazy dog") + .build( + analyzer, + reader -> { + IndexSearcher searcher = new IndexSearcher(reader); + Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered. + + MatchHighlighter highlighter = + new MatchHighlighter(searcher, analyzer) + .appendFieldHighlighter( + FieldValueHighlighters.highlighted( + 80 * 3, 1, new PassageFormatter("...", ">", "<"), fld -> true)) + .appendFieldHighlighter(FieldValueHighlighters.skipRemaining()); + + StandardQueryParser qp = new StandardQueryParser(analyzer); + + // Run all pairs of query-expected highlight. + List errors = new ArrayList<>(); + for (var queryHighlightPair : + new String[][] { + { + "fn:ordered(brown dog)", + "0. %s: The quick >brown fox jumps over the lazy dog<" + }, + { + "fn:within(fn:or(lazy quick) 1 fn:or(dog fox))", + "0. %s: The quick brown fox jumps over the >lazy< dog" + }, + { + "fn:containedBy(fox fn:ordered(brown fox dog))", + "0. %s: The quick brown >fox< jumps over the lazy dog" + }, + { + "fn:atLeast(2 quick fox \"furry dog\")", + "0. %s: The >quick brown fox< jumps over the lazy dog" + }, + { + "fn:maxgaps(0 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))", + "0. %s: The quick brown fox jumps over the >lazy dog<" + }, + { + "fn:maxgaps(1 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))", + "0. %s: The >quick brown fox< jumps over the >lazy dog<" + }, + { + "fn:maxwidth(2 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))", + "0. %s: The quick brown fox jumps over the >lazy dog<" + }, + { + "fn:maxwidth(3 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))", + "0. %s: The >quick brown fox< jumps over the >lazy dog<" + }, + { + "fn:or(quick \"fox\")", + "0. %s: The >quick< brown >fox< jumps over the lazy dog" + }, + {"fn:or(\"quick fox\")"}, + { + "fn:phrase(quick brown fox)", + "0. %s: The >quick brown fox< jumps over the lazy dog" + }, + {"fn:wildcard(jump*)", "0. %s: The quick brown fox >jumps< over the lazy dog"}, + {"fn:wildcard(br*n)", "0. %s: The quick >brown< fox jumps over the lazy dog"}, + {"fn:or(dog fox)", "0. %s: The quick brown >fox< jumps over the lazy >dog<"}, + { + "fn:phrase(fn:ordered(quick fox) jumps)", + "0. %s: The >quick brown fox jumps< over the lazy dog" + }, + { + "fn:ordered(quick jumps dog)", + "0. %s: The >quick brown fox jumps over the lazy dog<" + }, + { + "fn:ordered(quick fn:or(fox dog))", + "0. %s: The >quick brown fox< jumps over the lazy dog" + }, + { + "fn:ordered(quick jumps fn:or(fox dog))", + "0. %s: The >quick brown fox jumps over the lazy dog<" + }, + { + "fn:unordered(dog jumps quick)", + "0. %s: The >quick brown fox jumps over the lazy dog<" + }, + { + "fn:unordered(fn:or(fox dog) quick)", + "0. %s: The >quick brown fox< jumps over the lazy dog" + }, + { + "fn:unordered(fn:phrase(brown fox) fn:phrase(fox jumps))", + "0. %s: The quick >brown fox jumps< over the lazy dog" + }, + {"fn:ordered(fn:phrase(brown fox) fn:phrase(fox jumps))"}, + {"fn:unorderedNoOverlaps(fn:phrase(brown fox) fn:phrase(fox jumps))"}, + { + "fn:before(fn:or(brown lazy) fox)", + "0. %s: The quick >brown< fox jumps over the lazy dog" + }, + { + "fn:before(fn:or(brown lazy) fn:or(dog fox))", + "0. %s: The quick >brown< fox jumps over the >lazy< dog" + }, + { + "fn:after(fn:or(brown lazy) fox)", + "0. %s: The quick brown fox jumps over the >lazy< dog" + }, + { + "fn:after(fn:or(brown lazy) fn:or(dog fox))", + "0. %s: The quick brown fox jumps over the >lazy< dog" + }, + { + "fn:within(fn:or(fox dog) 1 fn:or(quick lazy))", + "0. %s: The quick brown fox jumps over the lazy >dog<" + }, + { + "fn:within(fn:or(fox dog) 2 fn:or(quick lazy))", + "0. %s: The quick brown >fox< jumps over the lazy >dog<" + }, + { + "fn:notWithin(fn:or(fox dog) 1 fn:or(quick lazy))", + "0. %s: The quick brown >fox< jumps over the lazy dog" + }, + { + "fn:containedBy(fn:or(fox dog) fn:ordered(quick lazy))", + "0. %s: The quick brown >fox< jumps over the lazy dog" + }, + { + "fn:notContainedBy(fn:or(fox dog) fn:ordered(quick lazy))", + "0. %s: The quick brown fox jumps over the lazy >dog<" + }, + { + "fn:containing(fn:atLeast(2 quick fox dog) jumps)", + "0. %s: The quick brown >fox jumps over the lazy dog<" + }, + { + "fn:notContaining(fn:ordered(fn:or(the The) fn:or(fox dog)) brown)", + "0. %s: The quick brown fox jumps over >the lazy dog<" + }, + { + "fn:overlapping(fn:phrase(brown fox) fn:phrase(fox jumps))", + "0. %s: The quick >brown fox< jumps over the lazy dog" + }, + { + "fn:overlapping(fn:or(fox dog) fn:extend(lazy 2 2))", + "0. %s: The quick brown fox jumps over the lazy >dog<" + }, + { + "fn:nonOverlapping(fn:phrase(brown fox) fn:phrase(lazy dog))", + "0. %s: The quick >brown fox< jumps over the lazy dog" + }, + { + "fn:nonOverlapping(fn:or(fox dog) fn:extend(lazy 2 2))", + "0. %s: The quick brown >fox< jumps over the lazy dog" + }, + { + "fn:atLeast(2 fn:unordered(furry dog) fn:unordered(brown dog) lazy quick)", + "0. %s: The >quick >brown fox jumps over the lazy<<> dog<" + }, + /* + The test cases below do not work for fields enabled with offset yet: + mainly "extend" + TODO: Fix them! + + {"fn:extend(fox 1 2)", "0. %s: The quick >brown fox jumps over< the lazy dog"}, + { + "fn:extend(fn:or(dog fox) 2 0)", + "0. %s: The >quick brown fox< jumps over >the lazy dog<" + }, + { + "fn:containedBy(fn:or(fox dog) fn:extend(lazy 3 3))", + "0. %s: The quick brown fox jumps over the lazy >dog<" + }, + { + "fn:notContainedBy(fn:or(fox dog) fn:extend(lazy 3 3))", + "0. %s: The quick brown >fox< jumps over the lazy dog" + }, + { + "fn:containing(fn:extend(fn:or(lazy brown) 1 1) fn:or(fox dog))", + "0. %s: The >quick brown fox< jumps over >the lazy dog<" + }, + { + "fn:notContaining(fn:extend(fn:or(fox dog) 1 0) fn:or(brown yellow))", + "0. %s: The quick brown fox jumps over the >lazy dog<" + } */ + }) { + assert queryHighlightPair.length >= 1; + String queryString = queryHighlightPair[0]; + var query = qp.parse(queryString, field); + var expected = + Arrays.stream(queryHighlightPair) + .skip(1) + .map(v -> String.format(Locale.ROOT, v, field)) + .toArray(String[]::new); + + try { + assertHighlights( + toDocList( + highlighter.highlight(searcher.search(query, 10, sortOrder), query)), + expected); + } catch (AssertionError e) { + errors.add("MISMATCH: query: " + queryString + "\n" + e.getMessage()); + } + } + if (errors.size() > 0) { + throw new AssertionError(String.join("\n\n", errors)); + } + }); + } + @Test public void testCustomFieldHighlightHandling() throws Exception { // Match highlighter is a showcase of individual components in this package, suitable diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java index 4d8d4f29d53..dac77c141b6 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java @@ -379,7 +379,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase { Intervals.containedBy( Intervals.term("foo"), Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))), - containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field))); + containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field))); MatcherAssert.assertThat( highlights( @@ -387,9 +387,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase { new IntervalQuery( field, Intervals.overlapping( - Intervals.unordered(Intervals.term("foo"), Intervals.term("bar")), - Intervals.term("foo")))), - containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field))); + Intervals.term("foo"), + Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))), + containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field))); }); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java index 56cd0a6074d..90e15a04775 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java @@ -59,6 +59,15 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource { protected abstract IntervalIterator combine(List iterators); + /** + * Create matches iterator from an advanced and validated interval iterator and a list of matches + * iterator of all the sub-sources + */ + protected IntervalMatchesIterator createMatchesIterator( + IntervalIterator it, List subs) { + return new ConjunctionMatchesIterator(it, subs); + } + @Override public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { @@ -81,6 +90,6 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource { if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { return null; } - return new ConjunctionMatchesIterator(it, subs); + return createMatchesIterator(it, subs); } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java index f8c50b1b2d4..d862114d4dc 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ContainedByIntervalsSource.java @@ -66,6 +66,14 @@ class ContainedByIntervalsSource extends ConjunctionIntervalsSource { }; } + @Override + protected IntervalMatchesIterator createMatchesIterator( + IntervalIterator it, List subs) { + assert subs.size() == 2; + // the only sub source we care is the "small" source + return new ConjunctionMatchesIterator(it, List.of(subs.get(0))); + } + @Override public int minExtent() { return small.minExtent(); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java index 51dfe48ad8b..c2346ee8e49 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/Intervals.java @@ -271,7 +271,10 @@ public final class Intervals { } /** - * Create an unordered {@link IntervalsSource} + * Create an unordered {@link IntervalsSource}. Note that if there are multiple intervals ends at + * the same position are eligible, only the narrowest one will be returned. For example if asking + * for unordered(term("apple"), term("banana")) on field of "apple wolf apple orange + * banana", only the "apple orange banana" will be returned. * *

Returns intervals in which all the subsources appear. The subsources may overlap * diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java index 6a1db8618f5..ea3181ef389 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java @@ -215,6 +215,7 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource { @Override public int nextInterval() throws IOException { + lead = null; // first, find a matching interval beyond the current start while (this.proximityQueue.size() == minShouldMatch && proximityQueue.top().start() == start) { @@ -258,7 +259,9 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource { Collection getCurrentIterators() { currentIterators.clear(); - currentIterators.add(lead); + if (lead != null) { + currentIterators.add(lead); + } for (IntervalIterator it : this.proximityQueue) { if (it.end() <= end) { currentIterators.add(it); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java index bb8c5bbfbcb..81efb350891 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OverlappingIntervalsSource.java @@ -62,6 +62,14 @@ class OverlappingIntervalsSource extends ConjunctionIntervalsSource { }; } + @Override + protected IntervalMatchesIterator createMatchesIterator( + IntervalIterator it, List subs) { + assert subs.size() == 2; + // the only sub source we care is the "real" source + return new ConjunctionMatchesIterator(it, List.of(subs.get(0))); + } + @Override public int minExtent() { return source.minExtent(); diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index abee3d06a4c..fe34c0c963e 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -619,17 +619,13 @@ public class TestIntervals extends LuceneTestCase { checkIntervals( source, "field1", 3, new int[][] {{}, {4, 4, 7, 7}, {1, 1, 7, 7}, {}, {4, 4}, {}}); MatchesIterator mi = getMatches(source, 1, "field1"); - assertMatch(mi, 4, 4, 20, 39); + assertMatch(mi, 4, 4, 26, 34); MatchesIterator subs = mi.getSubMatches(); - assertMatch(subs, 3, 3, 20, 25); assertMatch(subs, 4, 4, 26, 34); - assertMatch(subs, 5, 5, 35, 39); assertFalse(subs.next()); - assertMatch(mi, 7, 7, 41, 118); + assertMatch(mi, 7, 7, 47, 55); subs = mi.getSubMatches(); - assertMatch(subs, 6, 6, 41, 46); assertMatch(subs, 7, 7, 47, 55); - assertMatch(subs, 21, 21, 114, 118); assertFalse(subs.next()); assertFalse(mi.next()); assertEquals(1, source.minExtent()); @@ -786,6 +782,30 @@ public class TestIntervals extends LuceneTestCase { assertEquals(3, source.minExtent()); } + public void testMinShouldMatch2() throws IOException { + IntervalsSource source = + Intervals.atLeast( + 2, + Intervals.unordered(Intervals.term("alph"), Intervals.term("ran")), + Intervals.term("where"), + Intervals.term("river")); + MatchesIterator mi = getMatches(source, 1, "field2"); + assertMatch(mi, 0, 4, 0, 27); // contains "where" and "river" + MatchesIterator subs = mi.getSubMatches(); + assertNotNull(subs); + assertMatch(subs, 0, 0, 0, 5); + assertMatch(subs, 4, 4, 22, 27); + assertFalse(subs.next()); + assertMatch(mi, 1, 5, 6, 31); // contains "river" and unordered("alph", "run") + subs = mi.getSubMatches(); + assertNotNull(subs); + assertMatch(subs, 1, 1, 6, 10); + assertMatch(subs, 4, 4, 22, 27); + assertMatch(subs, 5, 5, 28, 31); + assertFalse(subs.next()); + assertFalse(mi.next()); + } + public void testDegenerateMinShouldMatch() throws IOException { IntervalsSource source = Intervals.ordered( @@ -849,11 +869,9 @@ public class TestIntervals extends LuceneTestCase { checkIntervals(source, "field1", 3, new int[][] {{}, {7, 7}, {4, 4, 7, 7}, {}, {7, 7}, {}}); MatchesIterator mi = getMatches(source, 1, "field1"); - assertMatch(mi, 7, 7, 20, 55); + assertMatch(mi, 7, 7, 47, 55); MatchesIterator sub = mi.getSubMatches(); assertNotNull(sub); - assertMatch(sub, 3, 3, 20, 25); - assertMatch(sub, 5, 5, 35, 39); assertMatch(sub, 7, 7, 47, 55); assertFalse(sub.next()); @@ -888,15 +906,13 @@ public class TestIntervals extends LuceneTestCase { MatchesIterator mi = getMatches(source, 1, "field1"); assertNotNull(mi); - assertMatch(mi, 2, 4, 15, 39); + assertMatch(mi, 2, 4, 15, 34); MatchesIterator sub = mi.getSubMatches(); assertNotNull(sub); assertMatch(sub, 2, 2, 15, 18); - assertMatch(sub, 3, 3, 20, 25); assertMatch(sub, 4, 4, 26, 34); - assertMatch(sub, 5, 5, 35, 39); assertFalse(sub.next()); - assertMatch(mi, 7, 17, 41, 118); + assertMatch(mi, 7, 17, 47, 99); assertEquals(2, source.minExtent()); }