mirror of https://github.com/apache/lucene.git
LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (#2127)
This commit is contained in:
parent
a95ce0d422
commit
a648143955
|
@ -84,6 +84,9 @@ API Changes
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
|
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
||||||
|
(Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai)
|
* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai)
|
||||||
|
|
||||||
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
|
* LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
|
||||||
|
|
|
@ -269,7 +269,7 @@ public class MatchRegionRetriever {
|
||||||
// only those terms that the query hinted at when passed
|
// only those terms that the query hinted at when passed
|
||||||
// a QueryVisitor.
|
// a QueryVisitor.
|
||||||
//
|
//
|
||||||
// Alternative straties are also possible and may make sense
|
// Alternative strategies are also possible and may make sense
|
||||||
// depending on the use case (OffsetsFromValues, for example).
|
// depending on the use case (OffsetsFromValues, for example).
|
||||||
return new OffsetsFromTokens(field, analyzer);
|
return new OffsetsFromTokens(field, analyzer);
|
||||||
|
|
||||||
|
|
|
@ -24,16 +24,14 @@ import org.apache.lucene.search.MatchesIterator;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This strategy applies to fields with stored positions but no offsets. We re-analyze
|
* This strategy applies to fields with stored positions but no offsets. We re-analyze the field's
|
||||||
* the field's value to find out offsets of match positions.
|
* value to find out offsets of match positions.
|
||||||
* <p>
|
*
|
||||||
* Note that this may fail if index data (positions stored in the index) is out of sync
|
* <p>Note that this may fail if index data (positions stored in the index) is out of sync with the
|
||||||
* with the field values or the analyzer. This strategy assumes it'll never happen.
|
* field values or the analyzer. This strategy assumes it'll never happen.
|
||||||
*/
|
*/
|
||||||
public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
|
public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
|
||||||
private final String field;
|
private final String field;
|
||||||
|
@ -45,22 +43,21 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<OffsetRange> get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
|
public List<OffsetRange> get(
|
||||||
|
MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
ArrayList<OffsetRange> ranges = new ArrayList<>();
|
ArrayList<OffsetRange> positionRanges = new ArrayList<>();
|
||||||
while (matchesIterator.next()) {
|
while (matchesIterator.next()) {
|
||||||
int from = matchesIterator.startPosition();
|
int from = matchesIterator.startPosition();
|
||||||
int to = matchesIterator.endPosition();
|
int to = matchesIterator.endPosition();
|
||||||
if (from < 0 || to < 0) {
|
if (from < 0 || to < 0) {
|
||||||
throw new IOException("Matches API returned negative positions for field: " + field);
|
throw new IOException("Matches API returned negative positions for field: " + field);
|
||||||
}
|
}
|
||||||
ranges.add(new OffsetRange(from, to));
|
positionRanges.add(new OffsetRange(from, to));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Convert from positions to offsets.
|
// Convert from positions to offsets.
|
||||||
ranges = convertPositionsToOffsets(ranges, analyzer, field, doc.getValues(field));
|
return convertPositionsToOffsets(positionRanges, analyzer, field, doc.getValues(field));
|
||||||
|
|
||||||
return ranges;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -68,37 +65,42 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static ArrayList<OffsetRange> convertPositionsToOffsets(
|
private static List<OffsetRange> convertPositionsToOffsets(
|
||||||
ArrayList<OffsetRange> ranges,
|
ArrayList<OffsetRange> positionRanges,
|
||||||
Analyzer analyzer,
|
Analyzer analyzer,
|
||||||
String fieldName,
|
String fieldName,
|
||||||
List<CharSequence> values)
|
List<CharSequence> values)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
if (ranges.isEmpty()) {
|
if (positionRanges.isEmpty()) {
|
||||||
return ranges;
|
return positionRanges;
|
||||||
}
|
}
|
||||||
|
|
||||||
class LeftRight {
|
class PositionSpan extends OffsetRange {
|
||||||
int left = Integer.MAX_VALUE;
|
int leftOffset = Integer.MAX_VALUE;
|
||||||
int right = Integer.MIN_VALUE;
|
int rightOffset = Integer.MIN_VALUE;
|
||||||
|
|
||||||
|
PositionSpan(int from, int to) {
|
||||||
|
super(from, to);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "[" + "L: " + left + ", R: " + right + ']';
|
return "[from=" + from + ", to=" + to + ", L: " + leftOffset + ", R: " + rightOffset + ']';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<Integer, LeftRight> requiredPositionSpans = new HashMap<>();
|
ArrayList<PositionSpan> spans = new ArrayList<>();
|
||||||
int minPosition = Integer.MAX_VALUE;
|
int minPosition = Integer.MAX_VALUE;
|
||||||
int maxPosition = Integer.MIN_VALUE;
|
int maxPosition = Integer.MIN_VALUE;
|
||||||
for (OffsetRange range : ranges) {
|
for (OffsetRange range : positionRanges) {
|
||||||
requiredPositionSpans.computeIfAbsent(range.from, (key) -> new LeftRight());
|
spans.add(new PositionSpan(range.from, range.to));
|
||||||
requiredPositionSpans.computeIfAbsent(range.to, (key) -> new LeftRight());
|
|
||||||
minPosition = Math.min(minPosition, range.from);
|
minPosition = Math.min(minPosition, range.from);
|
||||||
maxPosition = Math.max(maxPosition, range.to);
|
maxPosition = Math.max(maxPosition, range.to);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PositionSpan[] spansTable = spans.toArray(PositionSpan[]::new);
|
||||||
|
int spanCount = spansTable.length;
|
||||||
int position = -1;
|
int position = -1;
|
||||||
int valueOffset = 0;
|
int valueOffset = 0;
|
||||||
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
|
for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) {
|
||||||
|
@ -113,14 +115,26 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
|
||||||
position += posAttr.getPositionIncrement();
|
position += posAttr.getPositionIncrement();
|
||||||
|
|
||||||
if (position >= minPosition) {
|
if (position >= minPosition) {
|
||||||
LeftRight leftRight = requiredPositionSpans.get(position);
|
// Correct left and right offsets for each span this position applies to.
|
||||||
if (leftRight != null) {
|
|
||||||
int startOffset = valueOffset + offsetAttr.startOffset();
|
int startOffset = valueOffset + offsetAttr.startOffset();
|
||||||
int endOffset = valueOffset + offsetAttr.endOffset();
|
int endOffset = valueOffset + offsetAttr.endOffset();
|
||||||
|
|
||||||
leftRight.left = Math.min(leftRight.left, startOffset);
|
int j = 0;
|
||||||
leftRight.right = Math.max(leftRight.right, endOffset);
|
for (int i = 0; i < spanCount; i++) {
|
||||||
|
PositionSpan span = spansTable[j] = spansTable[i];
|
||||||
|
if (position >= span.from) {
|
||||||
|
if (position <= span.to) {
|
||||||
|
span.leftOffset = Math.min(span.leftOffset, startOffset);
|
||||||
|
span.rightOffset = Math.max(span.rightOffset, endOffset);
|
||||||
|
} else {
|
||||||
|
// this span can't intersect with any following position
|
||||||
|
// so omit it by skipping j++.
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
spanCount = j;
|
||||||
|
|
||||||
// Only short-circuit if we're on the last value (which should be the common
|
// Only short-circuit if we're on the last value (which should be the common
|
||||||
// case since most fields would only have a single value anyway). We need
|
// case since most fields would only have a single value anyway). We need
|
||||||
|
@ -136,19 +150,13 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy {
|
||||||
ts.close();
|
ts.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
ArrayList<OffsetRange> converted = new ArrayList<>();
|
ArrayList<OffsetRange> converted = new ArrayList<>(spans.size());
|
||||||
for (OffsetRange range : ranges) {
|
for (PositionSpan span : spans) {
|
||||||
LeftRight left = requiredPositionSpans.get(range.from);
|
if (span.leftOffset == Integer.MAX_VALUE || span.rightOffset == Integer.MIN_VALUE) {
|
||||||
LeftRight right = requiredPositionSpans.get(range.to);
|
throw new RuntimeException("One of the offsets missing for position range: " + span);
|
||||||
if (left == null
|
|
||||||
|| right == null
|
|
||||||
|| left.left == Integer.MAX_VALUE
|
|
||||||
|| right.right == Integer.MIN_VALUE) {
|
|
||||||
throw new RuntimeException("Position not properly initialized for range: " + range);
|
|
||||||
}
|
}
|
||||||
converted.add(new OffsetRange(left.left, right.right));
|
converted.add(new OffsetRange(span.leftOffset, span.rightOffset));
|
||||||
}
|
}
|
||||||
|
|
||||||
return converted;
|
return converted;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,13 +18,15 @@ package org.apache.lucene.search.matchhighlight;
|
||||||
|
|
||||||
import com.carrotsearch.randomizedtesting.RandomizedTest;
|
import com.carrotsearch.randomizedtesting.RandomizedTest;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
|
||||||
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
|
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
|
||||||
import org.apache.lucene.analysis.synonym.SynonymMap;
|
import org.apache.lucene.analysis.synonym.SynonymMap;
|
||||||
|
import org.apache.lucene.analysis.util.CharTokenizer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.FieldType;
|
import org.apache.lucene.document.FieldType;
|
||||||
|
@ -59,6 +61,7 @@ import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.function.BiFunction;
|
import java.util.function.BiFunction;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -87,6 +90,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
||||||
|
|
||||||
private Analyzer analyzer;
|
private Analyzer analyzer;
|
||||||
|
|
||||||
|
private static final String STOPWORD1 = "stopword";
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() throws IOException {
|
public void setup() throws IOException {
|
||||||
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
|
TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED);
|
||||||
|
@ -101,7 +106,15 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
||||||
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
|
final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100});
|
||||||
Analyzer whitespaceAnalyzer =
|
Analyzer whitespaceAnalyzer =
|
||||||
new AnalyzerWithGaps(offsetGap, positionGap,
|
new AnalyzerWithGaps(offsetGap, positionGap,
|
||||||
new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN));
|
new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
Tokenizer tokenizer = new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN);
|
||||||
|
TokenStream tokenStream;
|
||||||
|
tokenStream = new StopFilter(tokenizer, new CharArraySet(Set.of(STOPWORD1), true));
|
||||||
|
return new TokenStreamComponents(tokenizer, tokenStream);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
|
SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] {
|
||||||
{"foo\u0000bar", "syn1"},
|
{"foo\u0000bar", "syn1"},
|
||||||
|
@ -361,6 +374,41 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDegenerateIntervalsWithPositions() throws IOException {
|
||||||
|
testDegenerateIntervals(FLD_TEXT_POS);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9634: " +
|
||||||
|
"Highlighting of degenerate spans on fields with offsets doesn't work properly")
|
||||||
|
public void testDegenerateIntervalsWithOffsets() throws IOException {
|
||||||
|
testDegenerateIntervals(FLD_TEXT_POS_OFFS);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testDegenerateIntervals(String field) throws IOException {
|
||||||
|
new IndexBuilder(this::toField)
|
||||||
|
.doc(field, fmt("foo %s bar", STOPWORD1))
|
||||||
|
.build(analyzer, reader -> {
|
||||||
|
assertThat(
|
||||||
|
highlights(reader, new IntervalQuery(
|
||||||
|
field,
|
||||||
|
Intervals.extend(
|
||||||
|
Intervals.term("bar"), 1, 3))),
|
||||||
|
containsInAnyOrder(
|
||||||
|
fmt("0: (%s: 'foo %s >bar<')", field, STOPWORD1)
|
||||||
|
));
|
||||||
|
|
||||||
|
assertThat(
|
||||||
|
highlights(reader, new IntervalQuery(
|
||||||
|
field,
|
||||||
|
Intervals.extend(
|
||||||
|
Intervals.term("bar"), 5, 100))),
|
||||||
|
containsInAnyOrder(
|
||||||
|
fmt("0: (%s: '>foo %s bar<')", field, STOPWORD1)
|
||||||
|
));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMultivaluedFieldsWithOffsets() throws IOException {
|
public void testMultivaluedFieldsWithOffsets() throws IOException {
|
||||||
checkMultivaluedFields(FLD_TEXT_POS_OFFS);
|
checkMultivaluedFields(FLD_TEXT_POS_OFFS);
|
||||||
|
|
Loading…
Reference in New Issue