LUCENE-9519: Correct behavior for highlights that cross multi-value boundaries (#1853)

This commit is contained in:
Dawid Weiss 2020-09-10 16:15:51 +02:00 committed by GitHub
parent e2f3f626ee
commit 59d83428bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 127 additions and 29 deletions

View File

@ -62,4 +62,15 @@ public class OffsetRange {
public int hashCode() {
return Objects.hash(from, to);
}
/**
* Returns a sub-range of this range (a copy). Subclasses should
* override and return an appropriate type covariant so that payloads
* are not lost.
*/
public OffsetRange slice(int from, int to) {
assert from >= this.from;
assert to <= this.to;
return new OffsetRange(from, to);
}
}

View File

@ -80,22 +80,16 @@ public class PassageSelector {
int maxPassageWindow,
int maxPassages,
List<OffsetRange> permittedPassageRanges) {
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess;
assert markers instanceof RandomAccess;
assert permittedPassageRanges instanceof RandomAccess;
assert sortedAndNonOverlapping(permittedPassageRanges);
// Handle odd special cases early.
if (value.length() == 0 || maxPassageWindow == 0) {
return Collections.emptyList();
}
// Sort markers by their start offset, shortest first.
markers.sort(
(a, b) -> {
int v = Integer.compare(a.from, b.from);
return v != 0 ? v : Integer.compare(a.to, b.to);
});
// Determine a maximum offset window around each highlight marker and
// pick the best scoring passage candidates.
// Best passages so far.
PriorityQueue<Passage> pq =
new PriorityQueue<>(maxPassages) {
@Override
@ -104,7 +98,10 @@ public class PassageSelector {
}
};
assert sortedAndNonOverlapping(permittedPassageRanges);
markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges);
// Sort markers by their start offset, shortest first.
markers.sort(Comparator.<OffsetRange> comparingInt(r -> r.from).thenComparingInt(r -> r.to));
final int max = markers.size();
int markerIndex = 0;
@ -180,7 +177,7 @@ public class PassageSelector {
}
} else {
// Handle the default, no highlighting markers case.
passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges);
passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges);
}
// Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're
@ -224,11 +221,43 @@ public class PassageSelector {
}
// Sort in the offset order again.
Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from));
Arrays.sort(passages, Comparator.comparingInt(a -> a.from));
return Arrays.asList(passages);
}
/**
* Truncate or split highlight markers that cross permitted value boundaries.
*/
private List<? extends OffsetRange> splitOrTruncateToWindows(List<? extends OffsetRange> markers, int maxPassageWindow, List<OffsetRange> permittedPassageRanges) {
// Process markers overlapping with each permitted window.
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
for (OffsetRange marker : markers) {
for (OffsetRange permitted : permittedPassageRanges) {
boolean needsNew = false;
int from = marker.from;
if (from < permitted.from) {
from = permitted.from;
needsNew = true;
}
int to = marker.to;
if (to > permitted.to) {
to = permitted.to;
needsNew = true;
}
if (from >= to || (to - from) > maxPassageWindow) {
continue;
}
processedMarkers.add(needsNew ? marker.slice(from, to) : marker);
}
}
markers = processedMarkers;
return markers;
}
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
if (permittedPassageRanges.size() > 1) {
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
@ -248,19 +277,21 @@ public class PassageSelector {
* Invoked when no passages could be selected (due to constraints or lack of highlight markers).
*/
protected Passage[] pickDefaultPassage(
CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) {
CharSequence value, int maxCharacterWindow, int maxPassages, List<OffsetRange> permittedPassageRanges) {
// Search for the first range that is not empty.
ArrayList<Passage> defaultPassages = new ArrayList<>();
for (OffsetRange o : permittedPassageRanges) {
if (defaultPassages.size() >= maxPassages) {
break;
}
int to = Math.min(value.length(), o.to);
if (o.from < to) {
return new Passage[] {
new Passage(
o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
};
defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList()));
}
}
return new Passage[] {};
return defaultPassages.toArray(Passage[]::new);
}
private static boolean adjecentOrOverlapping(Passage a, Passage b) {

View File

@ -279,6 +279,23 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
);
}
@Test
public void testIntervalQueryHighlightCrossingMultivalueBoundary() throws IOException {
String field = FLD_TEXT_POS;
new IndexBuilder(this::toField)
.doc(field, "foo", "bar")
.build(analyzer, reader -> {
assertThat(
highlights(reader, new IntervalQuery(field,
Intervals.unordered(
Intervals.term("foo"),
Intervals.term("bar")))),
containsInAnyOrder(
fmt("0: (field_text: '>foo< | >bar<')", field)
));
});
}
@Test
public void testIntervalQueries() throws IOException {
String field = FLD_TEXT_POS_OFFS;

View File

@ -16,19 +16,16 @@
*/
package org.apache.lucene.search.matchhighlight;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.Matchers;
import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
public class TestPassageSelector extends LuceneTestCase {
@Test
@ -72,12 +69,12 @@ public class TestPassageSelector extends LuceneTestCase {
checkPassages("0123456789a", "0123456789a", 300, 1);
checkPassages("01234...", "0123456789a", 5, 1);
checkPassages(
"0123",
"0123|45678",
"0123456789a",
15,
2,
new OffsetRange[0],
new OffsetRange[] {new OffsetRange(0, 4), new OffsetRange(4, 9)});
new OffsetRange[]{new OffsetRange(0, 4), new OffsetRange(4, 9)});
}
@Test
@ -137,6 +134,36 @@ public class TestPassageSelector extends LuceneTestCase {
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
}
@Test
public void defaultPassages() {
// No highlights, multiple value ranges.
checkPassages(
"01|23|4567",
"0123456789",
100,
100,
ranges(),
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
// No highlights, multiple value ranges, maxPassages = 1
checkPassages(
"01",
"0123456789",
100,
1,
ranges(),
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
// No highlights, multiple value ranges, maxWindows size too short.
checkPassages(
"0123...|5678...",
"0123456789",
4,
2,
ranges(),
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
}
@Test
public void passageScoring() {
// More highlights per passage -> better passage
@ -197,12 +224,24 @@ public class TestPassageSelector extends LuceneTestCase {
new OffsetRange(18, Integer.MAX_VALUE)));
}
@Test
public void testHighlightAcrossAllowedValueRange() {
checkPassages(
"012>34<|>56<789",
"0123456789",
100,
10,
ranges(new OffsetRange(3, 7)),
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
}
@Test
public void randomizedSanityCheck() {
PassageSelector selector = new PassageSelector();
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
ArrayList<OffsetRange> highlights = new ArrayList<>();
ArrayList<OffsetRange> ranges = new ArrayList<>();
for (int i = 0; i < 5000; i++) {
String value =
randomBoolean()
@ -272,8 +311,8 @@ public class TestPassageSelector extends LuceneTestCase {
OffsetRange[] ranges) {
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
PassageSelector selector = new PassageSelector();
List<OffsetRange> hlist = Arrays.asList(highlights);
List<OffsetRange> rangeList = Arrays.asList(ranges);
List<OffsetRange> hlist = Arrays.asList(highlights);
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
return String.join("|", passageFormatter.format(value, passages, rangeList));
}