mirror of https://github.com/apache/lucene.git
LUCENE-10229: Unify behaviour of match offsets for interval queries (#521)
This commit is contained in:
parent
40c213d873
commit
53099e01de
|
@ -80,6 +80,9 @@ Improvements
|
|||
|
||||
* LUCENE-10275: Speed up MultiRangeQuery by using an interval tree. (Ignacio Vera)
|
||||
|
||||
* LUCENE-10229: Unify behaviour of match offsets for interval queries on fields
|
||||
with or without offsets enabled. (Patrick Zhai)
|
||||
|
||||
Optimizations
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -332,8 +332,6 @@ public class TestMatchHighlighter extends LuceneTestCase {
|
|||
}
|
||||
};
|
||||
|
||||
// TODO: the highlights are different when the field is indexed with offsets. Weird.
|
||||
// String field = FLD_TEXT1;
|
||||
String field = FLD_TEXT2;
|
||||
new IndexBuilder(this::toField)
|
||||
// Just one document and multiple interval queries.
|
||||
|
@ -541,6 +539,235 @@ public class TestMatchHighlighter extends LuceneTestCase {
|
|||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Almost the same as the one above, make sure the fields indexed with offsets are also
|
||||
* highlighted correctly
|
||||
*/
|
||||
@Test
|
||||
public void testIntervalFunctionsWithOffsetField() throws Exception {
|
||||
Analyzer analyzer =
|
||||
new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
Tokenizer tokenizer = new StandardTokenizer();
|
||||
TokenStream ts = tokenizer;
|
||||
ts = new LowerCaseFilter(ts);
|
||||
return new TokenStreamComponents(tokenizer, ts);
|
||||
}
|
||||
};
|
||||
|
||||
String field = FLD_TEXT1;
|
||||
new IndexBuilder(this::toField)
|
||||
// Just one document and multiple interval queries.
|
||||
.doc(field, "The quick brown fox jumps over the lazy dog")
|
||||
.build(
|
||||
analyzer,
|
||||
reader -> {
|
||||
IndexSearcher searcher = new IndexSearcher(reader);
|
||||
Sort sortOrder = Sort.INDEXORDER; // So that results are consistently ordered.
|
||||
|
||||
MatchHighlighter highlighter =
|
||||
new MatchHighlighter(searcher, analyzer)
|
||||
.appendFieldHighlighter(
|
||||
FieldValueHighlighters.highlighted(
|
||||
80 * 3, 1, new PassageFormatter("...", ">", "<"), fld -> true))
|
||||
.appendFieldHighlighter(FieldValueHighlighters.skipRemaining());
|
||||
|
||||
StandardQueryParser qp = new StandardQueryParser(analyzer);
|
||||
|
||||
// Run all pairs of query-expected highlight.
|
||||
List<String> errors = new ArrayList<>();
|
||||
for (var queryHighlightPair :
|
||||
new String[][] {
|
||||
{
|
||||
"fn:ordered(brown dog)",
|
||||
"0. %s: The quick >brown fox jumps over the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:within(fn:or(lazy quick) 1 fn:or(dog fox))",
|
||||
"0. %s: The quick brown fox jumps over the >lazy< dog"
|
||||
},
|
||||
{
|
||||
"fn:containedBy(fox fn:ordered(brown fox dog))",
|
||||
"0. %s: The quick brown >fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:atLeast(2 quick fox \"furry dog\")",
|
||||
"0. %s: The >quick brown fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:maxgaps(0 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
|
||||
"0. %s: The quick brown fox jumps over the >lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:maxgaps(1 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
|
||||
"0. %s: The >quick brown fox< jumps over the >lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:maxwidth(2 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
|
||||
"0. %s: The quick brown fox jumps over the >lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:maxwidth(3 fn:ordered(fn:or(quick lazy) fn:or(fox dog)))",
|
||||
"0. %s: The >quick brown fox< jumps over the >lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:or(quick \"fox\")",
|
||||
"0. %s: The >quick< brown >fox< jumps over the lazy dog"
|
||||
},
|
||||
{"fn:or(\"quick fox\")"},
|
||||
{
|
||||
"fn:phrase(quick brown fox)",
|
||||
"0. %s: The >quick brown fox< jumps over the lazy dog"
|
||||
},
|
||||
{"fn:wildcard(jump*)", "0. %s: The quick brown fox >jumps< over the lazy dog"},
|
||||
{"fn:wildcard(br*n)", "0. %s: The quick >brown< fox jumps over the lazy dog"},
|
||||
{"fn:or(dog fox)", "0. %s: The quick brown >fox< jumps over the lazy >dog<"},
|
||||
{
|
||||
"fn:phrase(fn:ordered(quick fox) jumps)",
|
||||
"0. %s: The >quick brown fox jumps< over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:ordered(quick jumps dog)",
|
||||
"0. %s: The >quick brown fox jumps over the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:ordered(quick fn:or(fox dog))",
|
||||
"0. %s: The >quick brown fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:ordered(quick jumps fn:or(fox dog))",
|
||||
"0. %s: The >quick brown fox jumps over the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:unordered(dog jumps quick)",
|
||||
"0. %s: The >quick brown fox jumps over the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:unordered(fn:or(fox dog) quick)",
|
||||
"0. %s: The >quick brown fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:unordered(fn:phrase(brown fox) fn:phrase(fox jumps))",
|
||||
"0. %s: The quick >brown fox jumps< over the lazy dog"
|
||||
},
|
||||
{"fn:ordered(fn:phrase(brown fox) fn:phrase(fox jumps))"},
|
||||
{"fn:unorderedNoOverlaps(fn:phrase(brown fox) fn:phrase(fox jumps))"},
|
||||
{
|
||||
"fn:before(fn:or(brown lazy) fox)",
|
||||
"0. %s: The quick >brown< fox jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:before(fn:or(brown lazy) fn:or(dog fox))",
|
||||
"0. %s: The quick >brown< fox jumps over the >lazy< dog"
|
||||
},
|
||||
{
|
||||
"fn:after(fn:or(brown lazy) fox)",
|
||||
"0. %s: The quick brown fox jumps over the >lazy< dog"
|
||||
},
|
||||
{
|
||||
"fn:after(fn:or(brown lazy) fn:or(dog fox))",
|
||||
"0. %s: The quick brown fox jumps over the >lazy< dog"
|
||||
},
|
||||
{
|
||||
"fn:within(fn:or(fox dog) 1 fn:or(quick lazy))",
|
||||
"0. %s: The quick brown fox jumps over the lazy >dog<"
|
||||
},
|
||||
{
|
||||
"fn:within(fn:or(fox dog) 2 fn:or(quick lazy))",
|
||||
"0. %s: The quick brown >fox< jumps over the lazy >dog<"
|
||||
},
|
||||
{
|
||||
"fn:notWithin(fn:or(fox dog) 1 fn:or(quick lazy))",
|
||||
"0. %s: The quick brown >fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:containedBy(fn:or(fox dog) fn:ordered(quick lazy))",
|
||||
"0. %s: The quick brown >fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:notContainedBy(fn:or(fox dog) fn:ordered(quick lazy))",
|
||||
"0. %s: The quick brown fox jumps over the lazy >dog<"
|
||||
},
|
||||
{
|
||||
"fn:containing(fn:atLeast(2 quick fox dog) jumps)",
|
||||
"0. %s: The quick brown >fox jumps over the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:notContaining(fn:ordered(fn:or(the The) fn:or(fox dog)) brown)",
|
||||
"0. %s: The quick brown fox jumps over >the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:overlapping(fn:phrase(brown fox) fn:phrase(fox jumps))",
|
||||
"0. %s: The quick >brown fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:overlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
|
||||
"0. %s: The quick brown fox jumps over the lazy >dog<"
|
||||
},
|
||||
{
|
||||
"fn:nonOverlapping(fn:phrase(brown fox) fn:phrase(lazy dog))",
|
||||
"0. %s: The quick >brown fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:nonOverlapping(fn:or(fox dog) fn:extend(lazy 2 2))",
|
||||
"0. %s: The quick brown >fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:atLeast(2 fn:unordered(furry dog) fn:unordered(brown dog) lazy quick)",
|
||||
"0. %s: The >quick >brown fox jumps over the lazy<<> dog<"
|
||||
},
|
||||
/*
|
||||
The test cases below do not work for fields enabled with offset yet:
|
||||
mainly "extend"
|
||||
TODO: Fix them!
|
||||
|
||||
{"fn:extend(fox 1 2)", "0. %s: The quick >brown fox jumps over< the lazy dog"},
|
||||
{
|
||||
"fn:extend(fn:or(dog fox) 2 0)",
|
||||
"0. %s: The >quick brown fox< jumps over >the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:containedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
|
||||
"0. %s: The quick brown fox jumps over the lazy >dog<"
|
||||
},
|
||||
{
|
||||
"fn:notContainedBy(fn:or(fox dog) fn:extend(lazy 3 3))",
|
||||
"0. %s: The quick brown >fox< jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"fn:containing(fn:extend(fn:or(lazy brown) 1 1) fn:or(fox dog))",
|
||||
"0. %s: The >quick brown fox< jumps over >the lazy dog<"
|
||||
},
|
||||
{
|
||||
"fn:notContaining(fn:extend(fn:or(fox dog) 1 0) fn:or(brown yellow))",
|
||||
"0. %s: The quick brown fox jumps over the >lazy dog<"
|
||||
} */
|
||||
}) {
|
||||
assert queryHighlightPair.length >= 1;
|
||||
String queryString = queryHighlightPair[0];
|
||||
var query = qp.parse(queryString, field);
|
||||
var expected =
|
||||
Arrays.stream(queryHighlightPair)
|
||||
.skip(1)
|
||||
.map(v -> String.format(Locale.ROOT, v, field))
|
||||
.toArray(String[]::new);
|
||||
|
||||
try {
|
||||
assertHighlights(
|
||||
toDocList(
|
||||
highlighter.highlight(searcher.search(query, 10, sortOrder), query)),
|
||||
expected);
|
||||
} catch (AssertionError e) {
|
||||
errors.add("MISMATCH: query: " + queryString + "\n" + e.getMessage());
|
||||
}
|
||||
}
|
||||
if (errors.size() > 0) {
|
||||
throw new AssertionError(String.join("\n\n", errors));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCustomFieldHighlightHandling() throws Exception {
|
||||
// Match highlighter is a showcase of individual components in this package, suitable
|
||||
|
|
|
@ -379,7 +379,7 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
Intervals.containedBy(
|
||||
Intervals.term("foo"),
|
||||
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
|
||||
containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
|
||||
containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
|
||||
|
||||
MatcherAssert.assertThat(
|
||||
highlights(
|
||||
|
@ -387,9 +387,9 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
new IntervalQuery(
|
||||
field,
|
||||
Intervals.overlapping(
|
||||
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar")),
|
||||
Intervals.term("foo")))),
|
||||
containsInAnyOrder(fmt("2: (field_text_offs: '>bar baz foo< xyz')", field)));
|
||||
Intervals.term("foo"),
|
||||
Intervals.unordered(Intervals.term("foo"), Intervals.term("bar"))))),
|
||||
containsInAnyOrder(fmt("2: (field_text_offs: 'bar baz >foo< xyz')", field)));
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -59,6 +59,15 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
|
|||
|
||||
protected abstract IntervalIterator combine(List<IntervalIterator> iterators);
|
||||
|
||||
/**
|
||||
* Create matches iterator from an advanced and validated interval iterator and a list of matches
|
||||
* iterator of all the sub-sources
|
||||
*/
|
||||
protected IntervalMatchesIterator createMatchesIterator(
|
||||
IntervalIterator it, List<IntervalMatchesIterator> subs) {
|
||||
return new ConjunctionMatchesIterator(it, subs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc)
|
||||
throws IOException {
|
||||
|
@ -81,6 +90,6 @@ abstract class ConjunctionIntervalsSource extends IntervalsSource {
|
|||
if (it.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) {
|
||||
return null;
|
||||
}
|
||||
return new ConjunctionMatchesIterator(it, subs);
|
||||
return createMatchesIterator(it, subs);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -66,6 +66,14 @@ class ContainedByIntervalsSource extends ConjunctionIntervalsSource {
|
|||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected IntervalMatchesIterator createMatchesIterator(
|
||||
IntervalIterator it, List<IntervalMatchesIterator> subs) {
|
||||
assert subs.size() == 2;
|
||||
// the only sub source we care is the "small" source
|
||||
return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int minExtent() {
|
||||
return small.minExtent();
|
||||
|
|
|
@ -275,7 +275,10 @@ public final class Intervals {
|
|||
}
|
||||
|
||||
/**
|
||||
* Create an unordered {@link IntervalsSource}
|
||||
* Create an unordered {@link IntervalsSource}. Note that if there are multiple intervals ends at
|
||||
* the same position are eligible, only the narrowest one will be returned. For example if asking
|
||||
* for <code>unordered(term("apple"), term("banana"))</code> on field of "apple wolf apple orange
|
||||
* banana", only the "apple orange banana" will be returned.
|
||||
*
|
||||
* <p>Returns intervals in which all the subsources appear. The subsources may overlap
|
||||
*
|
||||
|
|
|
@ -215,6 +215,7 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
|
|||
|
||||
@Override
|
||||
public int nextInterval() throws IOException {
|
||||
lead = null;
|
||||
// first, find a matching interval beyond the current start
|
||||
while (this.proximityQueue.size() == minShouldMatch
|
||||
&& proximityQueue.top().start() == start) {
|
||||
|
@ -258,7 +259,9 @@ class MinimumShouldMatchIntervalsSource extends IntervalsSource {
|
|||
|
||||
Collection<IntervalIterator> getCurrentIterators() {
|
||||
currentIterators.clear();
|
||||
if (lead != null) {
|
||||
currentIterators.add(lead);
|
||||
}
|
||||
for (IntervalIterator it : this.proximityQueue) {
|
||||
if (it.end() <= end) {
|
||||
currentIterators.add(it);
|
||||
|
|
|
@ -62,6 +62,14 @@ class OverlappingIntervalsSource extends ConjunctionIntervalsSource {
|
|||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
protected IntervalMatchesIterator createMatchesIterator(
|
||||
IntervalIterator it, List<IntervalMatchesIterator> subs) {
|
||||
assert subs.size() == 2;
|
||||
// the only sub source we care is the "real" source
|
||||
return new ConjunctionMatchesIterator(it, List.of(subs.get(0)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int minExtent() {
|
||||
return source.minExtent();
|
||||
|
|
|
@ -621,17 +621,13 @@ public class TestIntervals extends LuceneTestCase {
|
|||
checkIntervals(
|
||||
source, "field1", 3, new int[][] {{}, {4, 4, 7, 7}, {1, 1, 7, 7}, {}, {4, 4}, {}});
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertMatch(mi, 4, 4, 20, 39);
|
||||
assertMatch(mi, 4, 4, 26, 34);
|
||||
MatchesIterator subs = mi.getSubMatches();
|
||||
assertMatch(subs, 3, 3, 20, 25);
|
||||
assertMatch(subs, 4, 4, 26, 34);
|
||||
assertMatch(subs, 5, 5, 35, 39);
|
||||
assertFalse(subs.next());
|
||||
assertMatch(mi, 7, 7, 41, 118);
|
||||
assertMatch(mi, 7, 7, 47, 55);
|
||||
subs = mi.getSubMatches();
|
||||
assertMatch(subs, 6, 6, 41, 46);
|
||||
assertMatch(subs, 7, 7, 47, 55);
|
||||
assertMatch(subs, 21, 21, 114, 118);
|
||||
assertFalse(subs.next());
|
||||
assertFalse(mi.next());
|
||||
assertEquals(1, source.minExtent());
|
||||
|
@ -788,6 +784,30 @@ public class TestIntervals extends LuceneTestCase {
|
|||
assertEquals(3, source.minExtent());
|
||||
}
|
||||
|
||||
public void testMinShouldMatch2() throws IOException {
|
||||
IntervalsSource source =
|
||||
Intervals.atLeast(
|
||||
2,
|
||||
Intervals.unordered(Intervals.term("alph"), Intervals.term("ran")),
|
||||
Intervals.term("where"),
|
||||
Intervals.term("river"));
|
||||
MatchesIterator mi = getMatches(source, 1, "field2");
|
||||
assertMatch(mi, 0, 4, 0, 27); // contains "where" and "river"
|
||||
MatchesIterator subs = mi.getSubMatches();
|
||||
assertNotNull(subs);
|
||||
assertMatch(subs, 0, 0, 0, 5);
|
||||
assertMatch(subs, 4, 4, 22, 27);
|
||||
assertFalse(subs.next());
|
||||
assertMatch(mi, 1, 5, 6, 31); // contains "river" and unordered("alph", "run")
|
||||
subs = mi.getSubMatches();
|
||||
assertNotNull(subs);
|
||||
assertMatch(subs, 1, 1, 6, 10);
|
||||
assertMatch(subs, 4, 4, 22, 27);
|
||||
assertMatch(subs, 5, 5, 28, 31);
|
||||
assertFalse(subs.next());
|
||||
assertFalse(mi.next());
|
||||
}
|
||||
|
||||
public void testDegenerateMinShouldMatch() throws IOException {
|
||||
IntervalsSource source =
|
||||
Intervals.ordered(
|
||||
|
@ -851,11 +871,9 @@ public class TestIntervals extends LuceneTestCase {
|
|||
checkIntervals(source, "field1", 3, new int[][] {{}, {7, 7}, {4, 4, 7, 7}, {}, {7, 7}, {}});
|
||||
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertMatch(mi, 7, 7, 20, 55);
|
||||
assertMatch(mi, 7, 7, 47, 55);
|
||||
MatchesIterator sub = mi.getSubMatches();
|
||||
assertNotNull(sub);
|
||||
assertMatch(sub, 3, 3, 20, 25);
|
||||
assertMatch(sub, 5, 5, 35, 39);
|
||||
assertMatch(sub, 7, 7, 47, 55);
|
||||
assertFalse(sub.next());
|
||||
|
||||
|
@ -890,15 +908,13 @@ public class TestIntervals extends LuceneTestCase {
|
|||
|
||||
MatchesIterator mi = getMatches(source, 1, "field1");
|
||||
assertNotNull(mi);
|
||||
assertMatch(mi, 2, 4, 15, 39);
|
||||
assertMatch(mi, 2, 4, 15, 34);
|
||||
MatchesIterator sub = mi.getSubMatches();
|
||||
assertNotNull(sub);
|
||||
assertMatch(sub, 2, 2, 15, 18);
|
||||
assertMatch(sub, 3, 3, 20, 25);
|
||||
assertMatch(sub, 4, 4, 26, 34);
|
||||
assertMatch(sub, 5, 5, 35, 39);
|
||||
assertFalse(sub.next());
|
||||
assertMatch(mi, 7, 17, 41, 118);
|
||||
assertMatch(mi, 7, 17, 47, 99);
|
||||
|
||||
assertEquals(2, source.minExtent());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue