From a6481439556c2f9dcb038d233d7eee4179f4ffff Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 11 Dec 2020 20:07:26 +0100 Subject: [PATCH] LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). (#2127) --- lucene/CHANGES.txt | 3 + .../matchhighlight/MatchRegionRetriever.java | 2 +- .../matchhighlight/OffsetsFromPositions.java | 88 ++++++++++--------- .../TestMatchRegionRetriever.java | 52 ++++++++++- 4 files changed, 102 insertions(+), 43 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8a482b78d5d..543611fea7d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -84,6 +84,9 @@ API Changes Improvements +* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). + (Dawid Weiss) + * LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai) * LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O. diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java index 2861ac66191..3f07aca6ff5 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/MatchRegionRetriever.java @@ -269,7 +269,7 @@ public class MatchRegionRetriever { // only those terms that the query hinted at when passed // a QueryVisitor. // - // Alternative straties are also possible and may make sense + // Alternative strategies are also possible and may make sense // depending on the use case (OffsetsFromValues, for example). return new OffsetsFromTokens(field, analyzer); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetsFromPositions.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetsFromPositions.java index 8eb932f5aef..f9049f3a182 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetsFromPositions.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetsFromPositions.java @@ -24,16 +24,14 @@ import org.apache.lucene.search.MatchesIterator; import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; /** - * This strategy applies to fields with stored positions but no offsets. We re-analyze - * the field's value to find out offsets of match positions. - *

- * Note that this may fail if index data (positions stored in the index) is out of sync - * with the field values or the analyzer. This strategy assumes it'll never happen. + * This strategy applies to fields with stored positions but no offsets. We re-analyze the field's + * value to find out offsets of match positions. + * + *

Note that this may fail if index data (positions stored in the index) is out of sync with the + * field values or the analyzer. This strategy assumes it'll never happen. */ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy { private final String field; @@ -45,22 +43,21 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy { } @Override - public List get(MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) + public List get( + MatchesIterator matchesIterator, MatchRegionRetriever.FieldValueProvider doc) throws IOException { - ArrayList ranges = new ArrayList<>(); + ArrayList positionRanges = new ArrayList<>(); while (matchesIterator.next()) { int from = matchesIterator.startPosition(); int to = matchesIterator.endPosition(); if (from < 0 || to < 0) { throw new IOException("Matches API returned negative positions for field: " + field); } - ranges.add(new OffsetRange(from, to)); + positionRanges.add(new OffsetRange(from, to)); } // Convert from positions to offsets. - ranges = convertPositionsToOffsets(ranges, analyzer, field, doc.getValues(field)); - - return ranges; + return convertPositionsToOffsets(positionRanges, analyzer, field, doc.getValues(field)); } @Override @@ -68,37 +65,42 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy { return true; } - private static ArrayList convertPositionsToOffsets( - ArrayList ranges, + private static List convertPositionsToOffsets( + ArrayList positionRanges, Analyzer analyzer, String fieldName, List values) throws IOException { - if (ranges.isEmpty()) { - return ranges; + if (positionRanges.isEmpty()) { + return positionRanges; } - class LeftRight { - int left = Integer.MAX_VALUE; - int right = Integer.MIN_VALUE; + class PositionSpan extends OffsetRange { + int leftOffset = Integer.MAX_VALUE; + int rightOffset = Integer.MIN_VALUE; + + PositionSpan(int from, int to) { + super(from, to); + } @Override public String toString() { - return "[" + "L: " + left + ", R: " + right + ']'; + return "[from=" + from + ", to=" + to + ", L: " + leftOffset + ", R: " + rightOffset + ']'; } } - Map requiredPositionSpans = new HashMap<>(); + ArrayList spans = new ArrayList<>(); int minPosition = Integer.MAX_VALUE; int maxPosition = Integer.MIN_VALUE; - for (OffsetRange range : ranges) { - requiredPositionSpans.computeIfAbsent(range.from, (key) -> new LeftRight()); - requiredPositionSpans.computeIfAbsent(range.to, (key) -> new LeftRight()); + for (OffsetRange range : positionRanges) { + spans.add(new PositionSpan(range.from, range.to)); minPosition = Math.min(minPosition, range.from); maxPosition = Math.max(maxPosition, range.to); } + PositionSpan[] spansTable = spans.toArray(PositionSpan[]::new); + int spanCount = spansTable.length; int position = -1; int valueOffset = 0; for (int valueIndex = 0, max = values.size(); valueIndex < max; valueIndex++) { @@ -113,14 +115,26 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy { position += posAttr.getPositionIncrement(); if (position >= minPosition) { - LeftRight leftRight = requiredPositionSpans.get(position); - if (leftRight != null) { + // Correct left and right offsets for each span this position applies to. int startOffset = valueOffset + offsetAttr.startOffset(); int endOffset = valueOffset + offsetAttr.endOffset(); - leftRight.left = Math.min(leftRight.left, startOffset); - leftRight.right = Math.max(leftRight.right, endOffset); + int j = 0; + for (int i = 0; i < spanCount; i++) { + PositionSpan span = spansTable[j] = spansTable[i]; + if (position >= span.from) { + if (position <= span.to) { + span.leftOffset = Math.min(span.leftOffset, startOffset); + span.rightOffset = Math.max(span.rightOffset, endOffset); + } else { + // this span can't intersect with any following position + // so omit it by skipping j++. + continue; } + } + j++; + } + spanCount = j; // Only short-circuit if we're on the last value (which should be the common // case since most fields would only have a single value anyway). We need @@ -136,19 +150,13 @@ public final class OffsetsFromPositions implements OffsetsRetrievalStrategy { ts.close(); } - ArrayList converted = new ArrayList<>(); - for (OffsetRange range : ranges) { - LeftRight left = requiredPositionSpans.get(range.from); - LeftRight right = requiredPositionSpans.get(range.to); - if (left == null - || right == null - || left.left == Integer.MAX_VALUE - || right.right == Integer.MIN_VALUE) { - throw new RuntimeException("Position not properly initialized for range: " + range); + ArrayList converted = new ArrayList<>(spans.size()); + for (PositionSpan span : spans) { + if (span.leftOffset == Integer.MAX_VALUE || span.rightOffset == Integer.MIN_VALUE) { + throw new RuntimeException("One of the offsets missing for position range: " + span); } - converted.add(new OffsetRange(left.left, right.right)); + converted.add(new OffsetRange(span.leftOffset, span.rightOffset)); } - return converted; } } diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java index 77d2b0c5c6a..08a4b8391d7 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java @@ -18,13 +18,15 @@ package org.apache.lucene.search.matchhighlight; import com.carrotsearch.randomizedtesting.RandomizedTest; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; +import org.apache.lucene.analysis.util.CharTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -59,6 +61,7 @@ import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import java.util.function.BiFunction; import java.util.stream.Collectors; @@ -87,6 +90,8 @@ public class TestMatchRegionRetriever extends LuceneTestCase { private Analyzer analyzer; + private static final String STOPWORD1 = "stopword"; + @Before public void setup() throws IOException { TYPE_STORED_WITH_OFFSETS = new FieldType(TextField.TYPE_STORED); @@ -101,7 +106,15 @@ public class TestMatchRegionRetriever extends LuceneTestCase { final int positionGap = RandomizedTest.randomFrom(new int[]{0, 1, 100}); Analyzer whitespaceAnalyzer = new AnalyzerWithGaps(offsetGap, positionGap, - new WhitespaceAnalyzer(WhitespaceTokenizer.DEFAULT_MAX_WORD_LEN)); + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new WhitespaceTokenizer(CharTokenizer.DEFAULT_MAX_WORD_LEN); + TokenStream tokenStream; + tokenStream = new StopFilter(tokenizer, new CharArraySet(Set.of(STOPWORD1), true)); + return new TokenStreamComponents(tokenizer, tokenStream); + } + }); SynonymMap synonymMap = TestMatchHighlighter.buildSynonymMap(new String[][] { {"foo\u0000bar", "syn1"}, @@ -361,6 +374,41 @@ public class TestMatchRegionRetriever extends LuceneTestCase { ); } + @Test + public void testDegenerateIntervalsWithPositions() throws IOException { + testDegenerateIntervals(FLD_TEXT_POS); + } + + @Test @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-9634: " + + "Highlighting of degenerate spans on fields with offsets doesn't work properly") + public void testDegenerateIntervalsWithOffsets() throws IOException { + testDegenerateIntervals(FLD_TEXT_POS_OFFS); + } + + public void testDegenerateIntervals(String field) throws IOException { + new IndexBuilder(this::toField) + .doc(field, fmt("foo %s bar", STOPWORD1)) + .build(analyzer, reader -> { + assertThat( + highlights(reader, new IntervalQuery( + field, + Intervals.extend( + Intervals.term("bar"), 1, 3))), + containsInAnyOrder( + fmt("0: (%s: 'foo %s >bar<')", field, STOPWORD1) + )); + + assertThat( + highlights(reader, new IntervalQuery( + field, + Intervals.extend( + Intervals.term("bar"), 5, 100))), + containsInAnyOrder( + fmt("0: (%s: '>foo %s bar<')", field, STOPWORD1) + )); + }); + } + @Test public void testMultivaluedFieldsWithOffsets() throws IOException { checkMultivaluedFields(FLD_TEXT_POS_OFFS);