LUCENE-9519: Correct behavior for highlights that cross multi-value boundaries (#1853)

This commit is contained in:
Dawid Weiss 2020-09-10 16:15:51 +02:00 committed by GitHub
parent e2f3f626ee
commit 59d83428bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 127 additions and 29 deletions

View File

@ -62,4 +62,15 @@ public class OffsetRange {
public int hashCode() { public int hashCode() {
return Objects.hash(from, to); return Objects.hash(from, to);
} }
/**
* Returns a sub-range of this range (a copy). Subclasses should
* override and return an appropriate type covariant so that payloads
* are not lost.
*/
public OffsetRange slice(int from, int to) {
assert from >= this.from;
assert to <= this.to;
return new OffsetRange(from, to);
}
} }

View File

@ -80,22 +80,16 @@ public class PassageSelector {
int maxPassageWindow, int maxPassageWindow,
int maxPassages, int maxPassages,
List<OffsetRange> permittedPassageRanges) { List<OffsetRange> permittedPassageRanges) {
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess; assert markers instanceof RandomAccess;
assert permittedPassageRanges instanceof RandomAccess;
assert sortedAndNonOverlapping(permittedPassageRanges);
// Handle odd special cases early. // Handle odd special cases early.
if (value.length() == 0 || maxPassageWindow == 0) { if (value.length() == 0 || maxPassageWindow == 0) {
return Collections.emptyList(); return Collections.emptyList();
} }
// Sort markers by their start offset, shortest first. // Best passages so far.
markers.sort(
(a, b) -> {
int v = Integer.compare(a.from, b.from);
return v != 0 ? v : Integer.compare(a.to, b.to);
});
// Determine a maximum offset window around each highlight marker and
// pick the best scoring passage candidates.
PriorityQueue<Passage> pq = PriorityQueue<Passage> pq =
new PriorityQueue<>(maxPassages) { new PriorityQueue<>(maxPassages) {
@Override @Override
@ -104,7 +98,10 @@ public class PassageSelector {
} }
}; };
assert sortedAndNonOverlapping(permittedPassageRanges); markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges);
// Sort markers by their start offset, shortest first.
markers.sort(Comparator.<OffsetRange> comparingInt(r -> r.from).thenComparingInt(r -> r.to));
final int max = markers.size(); final int max = markers.size();
int markerIndex = 0; int markerIndex = 0;
@ -180,7 +177,7 @@ public class PassageSelector {
} }
} else { } else {
// Handle the default, no highlighting markers case. // Handle the default, no highlighting markers case.
passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges); passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges);
} }
// Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're // Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're
@ -224,11 +221,43 @@ public class PassageSelector {
} }
// Sort in the offset order again. // Sort in the offset order again.
Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from)); Arrays.sort(passages, Comparator.comparingInt(a -> a.from));
return Arrays.asList(passages); return Arrays.asList(passages);
} }
/**
* Truncate or split highlight markers that cross permitted value boundaries.
*/
private List<? extends OffsetRange> splitOrTruncateToWindows(List<? extends OffsetRange> markers, int maxPassageWindow, List<OffsetRange> permittedPassageRanges) {
// Process markers overlapping with each permitted window.
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
for (OffsetRange marker : markers) {
for (OffsetRange permitted : permittedPassageRanges) {
boolean needsNew = false;
int from = marker.from;
if (from < permitted.from) {
from = permitted.from;
needsNew = true;
}
int to = marker.to;
if (to > permitted.to) {
to = permitted.to;
needsNew = true;
}
if (from >= to || (to - from) > maxPassageWindow) {
continue;
}
processedMarkers.add(needsNew ? marker.slice(from, to) : marker);
}
}
markers = processedMarkers;
return markers;
}
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) { static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
if (permittedPassageRanges.size() > 1) { if (permittedPassageRanges.size() > 1) {
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator(); Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
@ -248,19 +277,21 @@ public class PassageSelector {
* Invoked when no passages could be selected (due to constraints or lack of highlight markers). * Invoked when no passages could be selected (due to constraints or lack of highlight markers).
*/ */
protected Passage[] pickDefaultPassage( protected Passage[] pickDefaultPassage(
CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) { CharSequence value, int maxCharacterWindow, int maxPassages, List<OffsetRange> permittedPassageRanges) {
// Search for the first range that is not empty. // Search for the first range that is not empty.
ArrayList<Passage> defaultPassages = new ArrayList<>();
for (OffsetRange o : permittedPassageRanges) { for (OffsetRange o : permittedPassageRanges) {
if (defaultPassages.size() >= maxPassages) {
break;
}
int to = Math.min(value.length(), o.to); int to = Math.min(value.length(), o.to);
if (o.from < to) { if (o.from < to) {
return new Passage[] { defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList()));
new Passage(
o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
};
} }
} }
return new Passage[] {}; return defaultPassages.toArray(Passage[]::new);
} }
private static boolean adjecentOrOverlapping(Passage a, Passage b) { private static boolean adjecentOrOverlapping(Passage a, Passage b) {

View File

@ -279,6 +279,23 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
); );
} }
@Test
public void testIntervalQueryHighlightCrossingMultivalueBoundary() throws IOException {
String field = FLD_TEXT_POS;
new IndexBuilder(this::toField)
.doc(field, "foo", "bar")
.build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar")))),
containsInAnyOrder(
fmt("0: (field_text: '>foo< | >bar<')", field)
));
});
}
@Test @Test
public void testIntervalQueries() throws IOException { public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS; String field = FLD_TEXT_POS_OFFS;

View File

@ -16,19 +16,16 @@
*/ */
package org.apache.lucene.search.matchhighlight; package org.apache.lucene.search.matchhighlight;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween; import org.apache.lucene.util.LuceneTestCase;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; import org.hamcrest.Matchers;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import org.apache.lucene.util.LuceneTestCase; import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
import org.hamcrest.Matchers;
import org.junit.Test;
public class TestPassageSelector extends LuceneTestCase { public class TestPassageSelector extends LuceneTestCase {
@Test @Test
@ -72,7 +69,7 @@ public class TestPassageSelector extends LuceneTestCase {
checkPassages("0123456789a", "0123456789a", 300, 1); checkPassages("0123456789a", "0123456789a", 300, 1);
checkPassages("01234...", "0123456789a", 5, 1); checkPassages("01234...", "0123456789a", 5, 1);
checkPassages( checkPassages(
"0123", "0123|45678",
"0123456789a", "0123456789a",
15, 15,
2, 2,
@ -137,6 +134,36 @@ public class TestPassageSelector extends LuceneTestCase {
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11))); ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
} }
@Test
public void defaultPassages() {
// No highlights, multiple value ranges.
checkPassages(
"01|23|4567",
"0123456789",
100,
100,
ranges(),
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
// No highlights, multiple value ranges, maxPassages = 1
checkPassages(
"01",
"0123456789",
100,
1,
ranges(),
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
// No highlights, multiple value ranges, maxWindows size too short.
checkPassages(
"0123...|5678...",
"0123456789",
4,
2,
ranges(),
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
}
@Test @Test
public void passageScoring() { public void passageScoring() {
// More highlights per passage -> better passage // More highlights per passage -> better passage
@ -197,12 +224,24 @@ public class TestPassageSelector extends LuceneTestCase {
new OffsetRange(18, Integer.MAX_VALUE))); new OffsetRange(18, Integer.MAX_VALUE)));
} }
@Test
public void testHighlightAcrossAllowedValueRange() {
checkPassages(
"012>34<|>56<789",
"0123456789",
100,
10,
ranges(new OffsetRange(3, 7)),
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
}
@Test @Test
public void randomizedSanityCheck() { public void randomizedSanityCheck() {
PassageSelector selector = new PassageSelector(); PassageSelector selector = new PassageSelector();
PassageFormatter formatter = new PassageFormatter("...", ">", "<"); PassageFormatter formatter = new PassageFormatter("...", ">", "<");
ArrayList<OffsetRange> highlights = new ArrayList<>(); ArrayList<OffsetRange> highlights = new ArrayList<>();
ArrayList<OffsetRange> ranges = new ArrayList<>(); ArrayList<OffsetRange> ranges = new ArrayList<>();
for (int i = 0; i < 5000; i++) { for (int i = 0; i < 5000; i++) {
String value = String value =
randomBoolean() randomBoolean()
@ -272,8 +311,8 @@ public class TestPassageSelector extends LuceneTestCase {
OffsetRange[] ranges) { OffsetRange[] ranges) {
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<"); PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
PassageSelector selector = new PassageSelector(); PassageSelector selector = new PassageSelector();
List<OffsetRange> hlist = Arrays.asList(highlights);
List<OffsetRange> rangeList = Arrays.asList(ranges); List<OffsetRange> rangeList = Arrays.asList(ranges);
List<OffsetRange> hlist = Arrays.asList(highlights);
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList); List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
return String.join("|", passageFormatter.format(value, passages, rangeList)); return String.join("|", passageFormatter.format(value, passages, rangeList));
} }