LUCENE-10229: Unify behaviour of match offsets for interval queries (#521)

This commit is contained in:
Patrick Zhai 2021-12-09 08:19:18 -08:00 committed by GitHub
parent 40c213d873
commit 53099e01de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 299 additions and 22 deletions

View File

@ -79,6 +79,9 @@ Improvements
* LUCENE-9538: Detect polygon self-intersections in the Tessellator. (Ignacio Vera)
* LUCENE-10275: Speed up MultiRangeQuery by using an interval tree. (Ignacio Vera)
* LUCENE-10229: Unify behaviour of match offsets for interval queries on fields
with or without offsets enabled. (Patrick Zhai)
Optimizations
---------------------

View File

@ -332,8 +332,6 @@ public class TestMatchHighlighter extends LuceneTestCase {
}
};
// TODO: the highlights are different when the field is indexed with offsets. Weird.
// String field = FLD_TEXT1;
String field = FLD_TEXT2;
new IndexBuilder(this::toField)
// Just one document and multiple interval queries.
@ -541,6 +539,235 @@ public class TestMatchHighlighter extends LuceneTestCase {
});
}
/**
* Almost the same as the one above, make sure the fields indexed with offsets are also
* highlighted correctly
*/
@Test
public void testIntervalFunctionsWithOffsetField() throws Exception {
Analyzer analyzer =
new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new StandardTokenizer();
TokenStream ts = tokenizer;
ts = new LowerCaseFilter(ts);
return new TokenStreamComponents(tokenizer, ts);
}
};
String field = FLD_TEXT1;
new IndexBuilder(this::toField)
// Just one document and multiple interval queries.
.doc(field, "The quick brown fox jumps over the lazy dog")
.build(
analyzer,
reader -> {
IndexSearcher searcher = new IndexSearcher(reader);
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
MatchHighlighter highlighter =
new MatchHighlighter(searcher, analyzer)
.appendFieldHighlighter(
FieldValueHighlighters.highlighted(
80 * 3, 1, new PassageFormatter("...", ">", "<"), fld -> true))
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
StandardQueryParser qp = new StandardQueryParser(analyzer);
// Run all pairs of query-expected highlight.
List<String> errors = new ArrayList<>();
for (var queryHighlightPair :
new String[][] {
{
"fn:ordered(brown dog)",
"0. %s: The quick >brown fox jumps over the lazy dog<"
},
{
"fn:within(fn:or(lazy quick) 1 fn:or(dog fox))",
"0. %s: The quick brown fox jumps over the >lazy< dog"
},
{
"fn:containedBy(fox fn:ordered(brown fox dog))",
"0. %s: The quick brown >fox< jumps over the lazy dog"
},
{
"fn:atLeast(2 quick fox \"furry dog\")",
"0. %s: The >quick brown fox< jumps over the lazy dog"
},
{
"fn:maxgaps(0 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
"0. %s: The quick brown fox jumps over the >lazy dog<"
},
{
"fn:maxgaps(1 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
"0. %s: The >quick brown fox< jumps over the >lazy dog<"
},
{
"fn:maxwidth(2 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
"0. %s: The quick brown fox jumps over the >lazy dog<"
},
{
"fn:maxwidth(3 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
"0. %s: The >quick brown fox< jumps over the >lazy dog<"
},
{
"fn:or(quick \"fox\")",
"0. %s: The >quick< brown >fox< jumps over the lazy dog"
},
{"fn:or(\"quick fox\")"},
{
"fn:phrase(quick brown fox)",
"0. %s: The >quick brown fox< jumps over the lazy dog"
},
{"fn:wildcard(jump*)", "0. %s: The quick brown fox >jumps< over the lazy dog"},
{"fn:wildcard(br*n)", "0. %s: The quick >brown< fox jumps over the lazy dog"},
{"fn:or(dog fox)", "0. %s: The quick brown >fox< jumps over the lazy >dog<"},
{
"fn:phrase(fn:ordered(quick fox) jumps)",
"0. %s: The >quick brown fox jumps< over the lazy dog"
},
{
"fn:ordered(quick jumps dog)",
"0. %s: The >quick brown fox jumps over the lazy dog<"
},
{
"fn:ordered(quick fn:or(fox dog))",
"0. %s: The >quick brown fox< jumps over the lazy dog"
},
{
"fn:ordered(quick jumps fn:or(fox dog))",
"0. %s: The >quick brown fox jumps over the lazy dog<"
},
{
"fn:unordered(dog jumps quick)",
"0. %s: The >quick brown fox jumps over the lazy dog<"
},
{
"fn:unordered(fn:or(fox dog) quick)",
"0. %s: The >quick brown fox< jumps over the lazy dog"
},
{
"fn:unordered(fn:phrase(brown fox) fn:phrase(fox jumps))",
"0. %s: The quick >brown fox jumps< over the lazy dog"
},
{"fn:ordered(fn:phrase(brown fox) fn:phrase(fox jumps))"},
{"fn:unorderedNoOverlaps(fn:phrase(brown fox) fn:phrase(fox jumps))"},
{
"fn:before(fn:or(brown lazy) fox)",
"0. %s: The quick >brown< fox jumps over the lazy dog"
},
{
"fn:before(fn:or(brown lazy) fn:or(dog fox))",
"0. %s: The quick >brown< fox jumps over the >lazy< dog"
},
{
"fn:after(fn:or(brown lazy) fox)",
"0. %s: The quick brown fox jumps over the >lazy< dog"
},
{
"fn:after(fn:or(brown lazy) fn:or(dog fox))",
"0. %s: The quick brown fox jumps over the >lazy< dog"
},
{
"fn:within(fn:or(fox dog) 1 fn:or(quick lazy))",
"0. %s: The quick brown fox jumps over the lazy >dog<"
},
{
"fn:within(fn:or(fox dog) 2 fn:or(quick lazy))",
"0. %s: The quick brown >fox< jumps over the lazy >dog<"
},
{
"fn:notWithin(fn:or(fox dog) 1 fn:or(quick lazy))",
"0. %s: The quick brown >fox< jumps over the lazy dog"
},
{
"fn:containedBy(fn:or(fox dog) fn:ordered(quick lazy))",
"0. %s: The quick brown >fox< jumps over the lazy dog"
},
{
"fn:notContainedBy(fn:or(fox dog) fn:ordered(quick lazy))",
"0. %s: The quick brown fox jumps over the lazy >dog<"
},
{
"fn:containing(fn:atLeast(2 quick fox dog) jumps)",
"0. %s: The quick brown >fox jumps over the lazy dog<"
},
{
"fn:notContaining(fn:ordered(fn:or(the The) fn:or(fox dog)) brown)",
"0. %s: The quick brown fox jumps over >the lazy dog<"
},
{
"fn:overlapping(fn:phrase(brown fox) fn:phrase(fox jumps))",
"0. %s: The quick >brown fox< jumps over the lazy dog"
},
{
"fn:overlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
"0. %s: The quick brown fox jumps over the lazy >dog<"
},
{
"fn:nonOverlapping(fn:phrase(brown fox) fn:phrase(lazy dog))",
"0. %s: The quick >brown fox< jumps over the lazy dog"
},
{
"fn:nonOverlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
"0. %s: The quick brown >fox< jumps over the lazy dog"
},
{
"fn:atLeast(2 fn:unordered(furry dog) fn:unordered(brown dog) lazy quick)",
"0. %s: The >quick >brown fox jumps over the lazy<<> dog<"
},
/*
The test cases below do not work for fields enabled with offset yet:
mainly "extend"
TODO: Fix them!
{"fn:extend(fox 1 2)", "0. %s: The quick >brown fox jumps over< the lazy dog"},
{
"fn:extend(fn:or(dog fox) 2 0)",
"0. %s: The >quick brown fox< jumps over >the lazy dog<"
},
{
"fn:containedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
"0. %s: The quick brown fox jumps over the lazy >dog<"
},
{
"fn:notContainedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
"0. %s: The quick brown >fox< jumps over the lazy dog"
},
{
"fn:containing(fn:extend(fn:or(lazy brown) 1 1) fn:or(fox dog))",
"0. %s: The >quick brown fox< jumps over >the lazy dog<"
},
{
"fn:notContaining(fn:extend(fn:or(fox dog) 1 0) fn:or(brown yellow))",
"0. %s: The quick brown fox jumps over the >lazy dog<"
} */
}) {
assert queryHighlightPair.length >= 1;
String queryString = queryHighlightPair[0];
var query = qp.parse(queryString, field);
var expected =
Arrays.stream(queryHighlightPair)
.skip(1)
.map(v -> String.format(Locale.ROOT, v, field))
.toArray(String[]::new);
try {
assertHighlights(
toDocList(
highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
expected);
} catch (AssertionError e) {
errors.add("MISMATCH: query: " + queryString + "\n" + e.getMessage());
}
}
if (errors.size() > 0) {
throw new AssertionError(String.join("\n\n", errors));
}
});
}
@Test
public void testCustomFieldHighlightHandling() throws Exception {
// Match highlighter is a showcase of individual components in this package, suitable

View File

@ -379,7 +379,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
Intervals.containedBy(
Intervals.term("foo"),
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
MatcherAssert.assertThat(
highlights(
@ -387,9 +387,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
new IntervalQuery(
field,
Intervals.overlapping(
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar")),
Intervals.term("foo")))),
containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
Intervals.term("foo"),
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
});
}

View File

@ -59,6 +59,15 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
protected abstract IntervalIterator combine(List<IntervalIterator> iterators);
/**
* Create matches iterator from an advanced and validated interval iterator and a list of matches
* iterator of all the sub-sources
*/
protected IntervalMatchesIterator createMatchesIterator(
IntervalIterator it, List<IntervalMatchesIterator> subs) {
return new ConjunctionMatchesIterator(it, subs);
}
@Override
public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc)
throws IOException {
@ -81,6 +90,6 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
return null;
}
return new ConjunctionMatchesIterator(it, subs);
return createMatchesIterator(it, subs);
}
}

View File

@ -66,6 +66,14 @@ class ContainedByIntervalsSource extends ConjunctionIntervalsSource {
};
}
@Override
protected IntervalMatchesIterator createMatchesIterator(
IntervalIterator it, List<IntervalMatchesIterator> subs) {
assert subs.size() == 2;
// the only sub source we care is the "small" source
return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
}
@Override
public int minExtent() {
return small.minExtent();

View File

@ -275,7 +275,10 @@ public final class Intervals {
}
/**
* Create an unordered {@link IntervalsSource}
* Create an unordered {@link IntervalsSource}. Note that if there are multiple intervals ends at
* the same position are eligible, only the narrowest one will be returned. For example if asking
* for <code>unordered(term("apple"), term("banana"))</code> on field of "apple wolf apple orange
* banana", only the "apple orange banana" will be returned.
*
* <p>Returns intervals in which all the subsources appear. The subsources may overlap
*

View File

@ -215,6 +215,7 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
@Override
public int nextInterval() throws IOException {
lead = null;
// first, find a matching interval beyond the current start
while (this.proximityQueue.size() == minShouldMatch
&& proximityQueue.top().start() == start) {
@ -258,7 +259,9 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
Collection<IntervalIterator> getCurrentIterators() {
currentIterators.clear();
currentIterators.add(lead);
if (lead != null) {
currentIterators.add(lead);
}
for (IntervalIterator it : this.proximityQueue) {
if (it.end() <= end) {
currentIterators.add(it);

View File

@ -62,6 +62,14 @@ class OverlappingIntervalsSource extends ConjunctionIntervalsSource {
};
}
@Override
protected IntervalMatchesIterator createMatchesIterator(
IntervalIterator it, List<IntervalMatchesIterator> subs) {
assert subs.size() == 2;
// the only sub source we care is the "real" source
return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
}
@Override
public int minExtent() {
return source.minExtent();

View File

@ -621,17 +621,13 @@ public class TestIntervals extends LuceneTestCase {
checkIntervals(
source, "field1", 3, new int[][] {{}, {4, 4, 7, 7}, {1, 1, 7, 7}, {}, {4, 4}, {}});
MatchesIterator mi = getMatches(source, 1, "field1");
assertMatch(mi, 4, 4, 20, 39);
assertMatch(mi, 4, 4, 26, 34);
MatchesIterator subs = mi.getSubMatches();
assertMatch(subs, 3, 3, 20, 25);
assertMatch(subs, 4, 4, 26, 34);
assertMatch(subs, 5, 5, 35, 39);
assertFalse(subs.next());
assertMatch(mi, 7, 7, 41, 118);
assertMatch(mi, 7, 7, 47, 55);
subs = mi.getSubMatches();
assertMatch(subs, 6, 6, 41, 46);
assertMatch(subs, 7, 7, 47, 55);
assertMatch(subs, 21, 21, 114, 118);
assertFalse(subs.next());
assertFalse(mi.next());
assertEquals(1, source.minExtent());
@ -788,6 +784,30 @@ public class TestIntervals extends LuceneTestCase {
assertEquals(3, source.minExtent());
}
public void testMinShouldMatch2() throws IOException {
IntervalsSource source =
Intervals.atLeast(
2,
Intervals.unordered(Intervals.term("alph"), Intervals.term("ran")),
Intervals.term("where"),
Intervals.term("river"));
MatchesIterator mi = getMatches(source, 1, "field2");
assertMatch(mi, 0, 4, 0, 27); // contains "where" and "river"
MatchesIterator subs = mi.getSubMatches();
assertNotNull(subs);
assertMatch(subs, 0, 0, 0, 5);
assertMatch(subs, 4, 4, 22, 27);
assertFalse(subs.next());
assertMatch(mi, 1, 5, 6, 31); // contains "river" and unordered("alph", "run")
subs = mi.getSubMatches();
assertNotNull(subs);
assertMatch(subs, 1, 1, 6, 10);
assertMatch(subs, 4, 4, 22, 27);
assertMatch(subs, 5, 5, 28, 31);
assertFalse(subs.next());
assertFalse(mi.next());
}
public void testDegenerateMinShouldMatch() throws IOException {
IntervalsSource source =
Intervals.ordered(
@ -851,11 +871,9 @@ public class TestIntervals extends LuceneTestCase {
checkIntervals(source, "field1", 3, new int[][] {{}, {7, 7}, {4, 4, 7, 7}, {}, {7, 7}, {}});
MatchesIterator mi = getMatches(source, 1, "field1");
assertMatch(mi, 7, 7, 20, 55);
assertMatch(mi, 7, 7, 47, 55);
MatchesIterator sub = mi.getSubMatches();
assertNotNull(sub);
assertMatch(sub, 3, 3, 20, 25);
assertMatch(sub, 5, 5, 35, 39);
assertMatch(sub, 7, 7, 47, 55);
assertFalse(sub.next());
@ -890,15 +908,13 @@ public class TestIntervals extends LuceneTestCase {
MatchesIterator mi = getMatches(source, 1, "field1");
assertNotNull(mi);
assertMatch(mi, 2, 4, 15, 39);
assertMatch(mi, 2, 4, 15, 34);
MatchesIterator sub = mi.getSubMatches();
assertNotNull(sub);
assertMatch(sub, 2, 2, 15, 18);
assertMatch(sub, 3, 3, 20, 25);
assertMatch(sub, 4, 4, 26, 34);
assertMatch(sub, 5, 5, 35, 39);
assertFalse(sub.next());
assertMatch(mi, 7, 17, 41, 118);
assertMatch(mi, 7, 17, 47, 99);
assertEquals(2, source.minExtent());
}