LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (#2127)

This commit is contained in:
Dawid Weiss 2020-12-11 20:07:26 +01:00 committed by GitHub
parent a95ce0d422
commit a648143955
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 102 additions and 43 deletions

View File

@ -84,6 +84,9 @@ API Changes
Improvements Improvements
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai) * LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai)
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O. * LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.

View File

@ -269,7 +269,7 @@ public class MatchRegionRetriever {
// only those terms that the query hinted at when passed // only those terms that the query hinted at when passed
// a QueryVisitor. // a QueryVisitor.
// //
// Alternative straties are also possible and may make sense // Alternative strategies are also possible and may make sense
// depending on the use case (OffsetsFromValues, for example). // depending on the use case (OffsetsFromValues, for example).
return new OffsetsFromTokens(field, analyzer); return new OffsetsFromTokens(field, analyzer);

View File

@ -24,16 +24,14 @@ import org.apache.lucene.search.MatchesIterator;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map;
/** /**
* This strategy applies to fields with stored positions but no offsets. We re-analyze * This strategy applies to fields with stored positions but no offsets. We re-analyze the field's
* the field's value to find out offsets of match positions. * value to find out offsets of match positions.
* <p> *
* Note that this may fail if index data (positions stored in the index) is out of sync * <p>Note that this may fail if index data (positions stored in the index) is out of sync with the
* with the field values or the analyzer. This strategy assumes it'll never happen. * field values or the analyzer. This strategy assumes it'll never happen.
*/ */
public final class OffsetsFromPositions implements OffsetsRetrievalStrategy { public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
private final String field; private final String field;
@ -45,22 +43,21 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
} }
@Override @Override
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) public List<OffsetRange> get(
MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
throws IOException { throws IOException {
ArrayList<OffsetRange> ranges = new ArrayList<>(); ArrayList<OffsetRange> positionRanges = new ArrayList<>();
while (matchesIterator.next()) { while (matchesIterator.next()) {
int from = matchesIterator.startPosition(); int from = matchesIterator.startPosition();
int to = matchesIterator.endPosition(); int to = matchesIterator.endPosition();
if (from < 0 || to < 0) { if (from < 0 || to < 0) {
throw new IOException("Matches API returned negative positions for field: " + field); throw new IOException("Matches API returned negative positions for field: " + field);
} }
ranges.add(new OffsetRange(from, to)); positionRanges.add(new OffsetRange(from, to));
} }
// Convert from positions to offsets. // Convert from positions to offsets.
ranges = convertPositionsToOffsets(ranges, analyzer, field, doc.getValues(field)); return convertPositionsToOffsets(positionRanges, analyzer, field, doc.getValues(field));
return ranges;
} }
@Override @Override
@ -68,37 +65,42 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
return true; return true;
} }
private static ArrayList<OffsetRange> convertPositionsToOffsets( private static List<OffsetRange> convertPositionsToOffsets(
ArrayList<OffsetRange> ranges, ArrayList<OffsetRange> positionRanges,
Analyzer analyzer, Analyzer analyzer,
String fieldName, String fieldName,
List<CharSequence> values) List<CharSequence> values)
throws IOException { throws IOException {
if (ranges.isEmpty()) { if (positionRanges.isEmpty()) {
return ranges; return positionRanges;
} }
class LeftRight { class PositionSpan extends OffsetRange {
int left = Integer.MAX_VALUE; int leftOffset = Integer.MAX_VALUE;
int right = Integer.MIN_VALUE; int rightOffset = Integer.MIN_VALUE;
PositionSpan(int from, int to) {
super(from, to);
}
@Override @Override
public String toString() { public String toString() {
return "[" + "L: " + left + ", R: " + right + ']'; return "[from=" + from + ", to=" + to + ", L: " + leftOffset + ", R: " + rightOffset + ']';
} }
} }
Map<Integer, LeftRight> requiredPositionSpans = new HashMap<>(); ArrayList<PositionSpan> spans = new ArrayList<>();
int minPosition = Integer.MAX_VALUE; int minPosition = Integer.MAX_VALUE;
int maxPosition = Integer.MIN_VALUE; int maxPosition = Integer.MIN_VALUE;
for (OffsetRange range : ranges) { for (OffsetRange range : positionRanges) {
requiredPositionSpans.computeIfAbsent(range.from, (key) -> new LeftRight()); spans.add(new PositionSpan(range.from, range.to));
requiredPositionSpans.computeIfAbsent(range.to, (key) -> new LeftRight());
minPosition = Math.min(minPosition, range.from); minPosition = Math.min(minPosition, range.from);
maxPosition = Math.max(maxPosition, range.to); maxPosition = Math.max(maxPosition, range.to);
} }
PositionSpan[] spansTable = spans.toArray(PositionSpan[]::new);
int spanCount = spansTable.length;
int position = -1; int position = -1;
int valueOffset = 0; int valueOffset = 0;
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) { for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
@ -113,14 +115,26 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
position += posAttr.getPositionIncrement(); position += posAttr.getPositionIncrement();
if (position >= minPosition) { if (position >= minPosition) {
LeftRight leftRight = requiredPositionSpans.get(position); // Correct left and right offsets for each span this position applies to.
if (leftRight != null) {
int startOffset = valueOffset + offsetAttr.startOffset(); int startOffset = valueOffset + offsetAttr.startOffset();
int endOffset = valueOffset + offsetAttr.endOffset(); int endOffset = valueOffset + offsetAttr.endOffset();
leftRight.left = Math.min(leftRight.left, startOffset); int j = 0;
leftRight.right = Math.max(leftRight.right, endOffset); for (int i = 0; i < spanCount; i++) {
PositionSpan span = spansTable[j] = spansTable[i];
if (position >= span.from) {
if (position <= span.to) {
span.leftOffset = Math.min(span.leftOffset, startOffset);
span.rightOffset = Math.max(span.rightOffset, endOffset);
} else {
// this span can't intersect with any following position
// so omit it by skipping j++.
continue;
} }
}
j++;
}
spanCount = j;
// Only short-circuit if we're on the last value (which should be the common // Only short-circuit if we're on the last value (which should be the common
// case since most fields would only have a single value anyway). We need // case since most fields would only have a single value anyway). We need
@ -136,19 +150,13 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
ts.close(); ts.close();
} }
ArrayList<OffsetRange> converted = new ArrayList<>(); ArrayList<OffsetRange> converted = new ArrayList<>(spans.size());
for (OffsetRange range : ranges) { for (PositionSpan span : spans) {
LeftRight left = requiredPositionSpans.get(range.from); if (span.leftOffset == Integer.MAX_VALUE || span.rightOffset == Integer.MIN_VALUE) {
LeftRight right = requiredPositionSpans.get(range.to); throw new RuntimeException("One of the offsets missing for position range: " + span);
if (left == null
|| right == null
|| left.left == Integer.MAX_VALUE
|| right.right == Integer.MIN_VALUE) {
throw new RuntimeException("Position not properly initialized for range: " + range);
} }
converted.add(new OffsetRange(left.left, right.right)); converted.add(new OffsetRange(span.leftOffset, span.rightOffset));
} }
return converted; return converted;
} }
} }

View File

@ -18,13 +18,15 @@ package org.apache.lucene.search.matchhighlight;
import com.carrotsearch.randomizedtesting.RandomizedTest; import com.carrotsearch.randomizedtesting.RandomizedTest;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.CharTokenizer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType; import org.apache.lucene.document.FieldType;
@ -59,6 +61,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.function.BiFunction; import java.util.function.BiFunction;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -87,6 +90,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
private Analyzer analyzer; private Analyzer analyzer;
private static final String STOPWORD1 = "stopword";
@Before @Before
public void setup() throws IOException { public void setup() throws IOException {
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED); TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
@ -101,7 +106,15 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100}); final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
Analyzer whitespaceAnalyzer = Analyzer whitespaceAnalyzer =
new AnalyzerWithGaps(offsetGap, positionGap, new AnalyzerWithGaps(offsetGap, positionGap,
new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN)); new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
TokenStream tokenStream;
tokenStream = new StopFilter(tokenizer, new CharArraySet(Set.of(STOPWORD1), true));
return new TokenStreamComponents(tokenizer, tokenStream);
}
});
SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] { SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
{"foo\u0000bar", "syn1"}, {"foo\u0000bar", "syn1"},
@ -361,6 +374,41 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
); );
} }
@Test
public void testDegenerateIntervalsWithPositions() throws IOException {
testDegenerateIntervals(FLD_TEXT_POS);
}
@Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9634: " +
"Highlighting of degenerate spans on fields with offsets doesn't work properly")
public void testDegenerateIntervalsWithOffsets() throws IOException {
testDegenerateIntervals(FLD_TEXT_POS_OFFS);
}
public void testDegenerateIntervals(String field) throws IOException {
new IndexBuilder(this::toField)
.doc(field, fmt("foo %s bar", STOPWORD1))
.build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(
field,
Intervals.extend(
Intervals.term("bar"), 1, 3))),
containsInAnyOrder(
fmt("0: (%s: 'foo %s >bar<')", field, STOPWORD1)
));
assertThat(
highlights(reader, new IntervalQuery(
field,
Intervals.extend(
Intervals.term("bar"), 5, 100))),
containsInAnyOrder(
fmt("0: (%s: '>foo %s bar<')", field, STOPWORD1)
));
});
}
@Test @Test
public void testMultivaluedFieldsWithOffsets() throws IOException { public void testMultivaluedFieldsWithOffsets() throws IOException {
checkMultivaluedFields(FLD_TEXT_POS_OFFS); checkMultivaluedFields(FLD_TEXT_POS_OFFS);