From 59d83428bcbe5d722c2245cf5b42839e6c02e637 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Thu, 10 Sep 2020 16:15:51 +0200 Subject: [PATCH] LUCENE-9519: Correct behavior for highlights that cross multi-value boundaries (#1853) --- .../search/matchhighlight/OffsetRange.java | 11 +++ .../matchhighlight/PassageSelector.java | 69 ++++++++++++++----- .../TestMatchRegionRetriever.java | 17 +++++ .../matchhighlight/TestPassageSelector.java | 59 +++++++++++++--- 4 files changed, 127 insertions(+), 29 deletions(-) diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetRange.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetRange.java index 7ed397c1ef2..731650c2a38 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetRange.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/OffsetRange.java @@ -62,4 +62,15 @@ public class OffsetRange { public int hashCode() { return Objects.hash(from, to); } + + /** + * Returns a sub-range of this range (a copy). Subclasses should + * override and return an appropriate type covariant so that payloads + * are not lost. + */ + public OffsetRange slice(int from, int to) { + assert from >= this.from; + assert to <= this.to; + return new OffsetRange(from, to); + } } diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/PassageSelector.java b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/PassageSelector.java index 3ac18fe3c6f..0468802b50e 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/PassageSelector.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/matchhighlight/PassageSelector.java @@ -80,22 +80,16 @@ public class PassageSelector { int maxPassageWindow, int maxPassages, List permittedPassageRanges) { - assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess; + assert markers instanceof RandomAccess; + assert permittedPassageRanges instanceof RandomAccess; + assert sortedAndNonOverlapping(permittedPassageRanges); // Handle odd special cases early. if (value.length() == 0 || maxPassageWindow == 0) { return Collections.emptyList(); } - // Sort markers by their start offset, shortest first. - markers.sort( - (a, b) -> { - int v = Integer.compare(a.from, b.from); - return v != 0 ? v : Integer.compare(a.to, b.to); - }); - - // Determine a maximum offset window around each highlight marker and - // pick the best scoring passage candidates. + // Best passages so far. PriorityQueue pq = new PriorityQueue<>(maxPassages) { @Override @@ -104,7 +98,10 @@ public class PassageSelector { } }; - assert sortedAndNonOverlapping(permittedPassageRanges); + markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges); + + // Sort markers by their start offset, shortest first. + markers.sort(Comparator. comparingInt(r -> r.from).thenComparingInt(r -> r.to)); final int max = markers.size(); int markerIndex = 0; @@ -180,7 +177,7 @@ public class PassageSelector { } } else { // Handle the default, no highlighting markers case. - passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges); + passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges); } // Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're @@ -224,11 +221,43 @@ public class PassageSelector { } // Sort in the offset order again. - Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from)); + Arrays.sort(passages, Comparator.comparingInt(a -> a.from)); return Arrays.asList(passages); } + /** + * Truncate or split highlight markers that cross permitted value boundaries. + */ + private List splitOrTruncateToWindows(List markers, int maxPassageWindow, List permittedPassageRanges) { + // Process markers overlapping with each permitted window. + ArrayList processedMarkers = new ArrayList<>(markers.size()); + for (OffsetRange marker : markers) { + for (OffsetRange permitted : permittedPassageRanges) { + boolean needsNew = false; + int from = marker.from; + if (from < permitted.from) { + from = permitted.from; + needsNew = true; + } + + int to = marker.to; + if (to > permitted.to) { + to = permitted.to; + needsNew = true; + } + + if (from >= to || (to - from) > maxPassageWindow) { + continue; + } + + processedMarkers.add(needsNew ? marker.slice(from, to) : marker); + } + } + markers = processedMarkers; + return markers; + } + static boolean sortedAndNonOverlapping(List permittedPassageRanges) { if (permittedPassageRanges.size() > 1) { Iterator i = permittedPassageRanges.iterator(); @@ -248,19 +277,21 @@ public class PassageSelector { * Invoked when no passages could be selected (due to constraints or lack of highlight markers). */ protected Passage[] pickDefaultPassage( - CharSequence value, int maxCharacterWindow, List permittedPassageRanges) { + CharSequence value, int maxCharacterWindow, int maxPassages, List permittedPassageRanges) { // Search for the first range that is not empty. + ArrayList defaultPassages = new ArrayList<>(); for (OffsetRange o : permittedPassageRanges) { + if (defaultPassages.size() >= maxPassages) { + break; + } + int to = Math.min(value.length(), o.to); if (o.from < to) { - return new Passage[] { - new Passage( - o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList()) - }; + defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())); } } - return new Passage[] {}; + return defaultPassages.toArray(Passage[]::new); } private static boolean adjecentOrOverlapping(Passage a, Passage b) { diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java index 691877c1199..77d2b0c5c6a 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestMatchRegionRetriever.java @@ -279,6 +279,23 @@ public class TestMatchRegionRetriever extends LuceneTestCase { ); } + @Test + public void testIntervalQueryHighlightCrossingMultivalueBoundary() throws IOException { + String field = FLD_TEXT_POS; + new IndexBuilder(this::toField) + .doc(field, "foo", "bar") + .build(analyzer, reader -> { + assertThat( + highlights(reader, new IntervalQuery(field, + Intervals.unordered( + Intervals.term("foo"), + Intervals.term("bar")))), + containsInAnyOrder( + fmt("0: (field_text: '>foo< | >bar<')", field) + )); + }); + } + @Test public void testIntervalQueries() throws IOException { String field = FLD_TEXT_POS_OFFS; diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestPassageSelector.java b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestPassageSelector.java index 3d03d6beb37..c3ec7c834fb 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestPassageSelector.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/matchhighlight/TestPassageSelector.java @@ -16,19 +16,16 @@ */ package org.apache.lucene.search.matchhighlight; -import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween; -import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; -import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; -import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween; +import org.apache.lucene.util.LuceneTestCase; +import org.hamcrest.Matchers; +import org.junit.Test; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; -import org.apache.lucene.util.LuceneTestCase; -import org.hamcrest.Matchers; -import org.junit.Test; +import static com.carrotsearch.randomizedtesting.RandomizedTest.*; public class TestPassageSelector extends LuceneTestCase { @Test @@ -72,12 +69,12 @@ public class TestPassageSelector extends LuceneTestCase { checkPassages("0123456789a", "0123456789a", 300, 1); checkPassages("01234...", "0123456789a", 5, 1); checkPassages( - "0123", + "0123|45678", "0123456789a", 15, 2, new OffsetRange[0], - new OffsetRange[] {new OffsetRange(0, 4), new OffsetRange(4, 9)}); + new OffsetRange[]{new OffsetRange(0, 4), new OffsetRange(4, 9)}); } @Test @@ -137,6 +134,36 @@ public class TestPassageSelector extends LuceneTestCase { ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11))); } + @Test + public void defaultPassages() { + // No highlights, multiple value ranges. + checkPassages( + "01|23|4567", + "0123456789", + 100, + 100, + ranges(), + ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8))); + + // No highlights, multiple value ranges, maxPassages = 1 + checkPassages( + "01", + "0123456789", + 100, + 1, + ranges(), + ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8))); + + // No highlights, multiple value ranges, maxWindows size too short. + checkPassages( + "0123...|5678...", + "0123456789", + 4, + 2, + ranges(), + ranges(new OffsetRange(0, 5), new OffsetRange(5, 10))); + } + @Test public void passageScoring() { // More highlights per passage -> better passage @@ -197,12 +224,24 @@ public class TestPassageSelector extends LuceneTestCase { new OffsetRange(18, Integer.MAX_VALUE))); } + @Test + public void testHighlightAcrossAllowedValueRange() { + checkPassages( + "012>34<|>56<789", + "0123456789", + 100, + 10, + ranges(new OffsetRange(3, 7)), + ranges(new OffsetRange(0, 5), new OffsetRange(5, 10))); + } + @Test public void randomizedSanityCheck() { PassageSelector selector = new PassageSelector(); PassageFormatter formatter = new PassageFormatter("...", ">", "<"); ArrayList highlights = new ArrayList<>(); ArrayList ranges = new ArrayList<>(); + for (int i = 0; i < 5000; i++) { String value = randomBoolean() @@ -272,8 +311,8 @@ public class TestPassageSelector extends LuceneTestCase { OffsetRange[] ranges) { PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<"); PassageSelector selector = new PassageSelector(); - List hlist = Arrays.asList(highlights); List rangeList = Arrays.asList(ranges); + List hlist = Arrays.asList(highlights); List passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList); return String.join("|", passageFormatter.format(value, passages, rangeList)); }