mirror of https://github.com/apache/lucene.git
LUCENE-9519: Correct behavior for highlights that cross multi-value boundaries (#1853)
This commit is contained in:
parent
e2f3f626ee
commit
59d83428bc
|
@ -62,4 +62,15 @@ public class OffsetRange {
|
|||
public int hashCode() {
|
||||
return Objects.hash(from, to);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a sub-range of this range (a copy). Subclasses should
|
||||
* override and return an appropriate type covariant so that payloads
|
||||
* are not lost.
|
||||
*/
|
||||
public OffsetRange slice(int from, int to) {
|
||||
assert from >= this.from;
|
||||
assert to <= this.to;
|
||||
return new OffsetRange(from, to);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -80,22 +80,16 @@ public class PassageSelector {
|
|||
int maxPassageWindow,
|
||||
int maxPassages,
|
||||
List<OffsetRange> permittedPassageRanges) {
|
||||
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess;
|
||||
assert markers instanceof RandomAccess;
|
||||
assert permittedPassageRanges instanceof RandomAccess;
|
||||
assert sortedAndNonOverlapping(permittedPassageRanges);
|
||||
|
||||
// Handle odd special cases early.
|
||||
if (value.length() == 0 || maxPassageWindow == 0) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
// Sort markers by their start offset, shortest first.
|
||||
markers.sort(
|
||||
(a, b) -> {
|
||||
int v = Integer.compare(a.from, b.from);
|
||||
return v != 0 ? v : Integer.compare(a.to, b.to);
|
||||
});
|
||||
|
||||
// Determine a maximum offset window around each highlight marker and
|
||||
// pick the best scoring passage candidates.
|
||||
// Best passages so far.
|
||||
PriorityQueue<Passage> pq =
|
||||
new PriorityQueue<>(maxPassages) {
|
||||
@Override
|
||||
|
@ -104,7 +98,10 @@ public class PassageSelector {
|
|||
}
|
||||
};
|
||||
|
||||
assert sortedAndNonOverlapping(permittedPassageRanges);
|
||||
markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges);
|
||||
|
||||
// Sort markers by their start offset, shortest first.
|
||||
markers.sort(Comparator.<OffsetRange> comparingInt(r -> r.from).thenComparingInt(r -> r.to));
|
||||
|
||||
final int max = markers.size();
|
||||
int markerIndex = 0;
|
||||
|
@ -180,7 +177,7 @@ public class PassageSelector {
|
|||
}
|
||||
} else {
|
||||
// Handle the default, no highlighting markers case.
|
||||
passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges);
|
||||
passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges);
|
||||
}
|
||||
|
||||
// Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're
|
||||
|
@ -224,11 +221,43 @@ public class PassageSelector {
|
|||
}
|
||||
|
||||
// Sort in the offset order again.
|
||||
Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from));
|
||||
Arrays.sort(passages, Comparator.comparingInt(a -> a.from));
|
||||
|
||||
return Arrays.asList(passages);
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate or split highlight markers that cross permitted value boundaries.
|
||||
*/
|
||||
private List<? extends OffsetRange> splitOrTruncateToWindows(List<? extends OffsetRange> markers, int maxPassageWindow, List<OffsetRange> permittedPassageRanges) {
|
||||
// Process markers overlapping with each permitted window.
|
||||
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
|
||||
for (OffsetRange marker : markers) {
|
||||
for (OffsetRange permitted : permittedPassageRanges) {
|
||||
boolean needsNew = false;
|
||||
int from = marker.from;
|
||||
if (from < permitted.from) {
|
||||
from = permitted.from;
|
||||
needsNew = true;
|
||||
}
|
||||
|
||||
int to = marker.to;
|
||||
if (to > permitted.to) {
|
||||
to = permitted.to;
|
||||
needsNew = true;
|
||||
}
|
||||
|
||||
if (from >= to || (to - from) > maxPassageWindow) {
|
||||
continue;
|
||||
}
|
||||
|
||||
processedMarkers.add(needsNew ? marker.slice(from, to) : marker);
|
||||
}
|
||||
}
|
||||
markers = processedMarkers;
|
||||
return markers;
|
||||
}
|
||||
|
||||
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
|
||||
if (permittedPassageRanges.size() > 1) {
|
||||
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
|
||||
|
@ -248,19 +277,21 @@ public class PassageSelector {
|
|||
* Invoked when no passages could be selected (due to constraints or lack of highlight markers).
|
||||
*/
|
||||
protected Passage[] pickDefaultPassage(
|
||||
CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) {
|
||||
CharSequence value, int maxCharacterWindow, int maxPassages, List<OffsetRange> permittedPassageRanges) {
|
||||
// Search for the first range that is not empty.
|
||||
ArrayList<Passage> defaultPassages = new ArrayList<>();
|
||||
for (OffsetRange o : permittedPassageRanges) {
|
||||
if (defaultPassages.size() >= maxPassages) {
|
||||
break;
|
||||
}
|
||||
|
||||
int to = Math.min(value.length(), o.to);
|
||||
if (o.from < to) {
|
||||
return new Passage[] {
|
||||
new Passage(
|
||||
o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
|
||||
};
|
||||
defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList()));
|
||||
}
|
||||
}
|
||||
|
||||
return new Passage[] {};
|
||||
return defaultPassages.toArray(Passage[]::new);
|
||||
}
|
||||
|
||||
private static boolean adjecentOrOverlapping(Passage a, Passage b) {
|
||||
|
|
|
@ -279,6 +279,23 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
|||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIntervalQueryHighlightCrossingMultivalueBoundary() throws IOException {
|
||||
String field = FLD_TEXT_POS;
|
||||
new IndexBuilder(this::toField)
|
||||
.doc(field, "foo", "bar")
|
||||
.build(analyzer, reader -> {
|
||||
assertThat(
|
||||
highlights(reader, new IntervalQuery(field,
|
||||
Intervals.unordered(
|
||||
Intervals.term("foo"),
|
||||
Intervals.term("bar")))),
|
||||
containsInAnyOrder(
|
||||
fmt("0: (field_text: '>foo< | >bar<')", field)
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIntervalQueries() throws IOException {
|
||||
String field = FLD_TEXT_POS_OFFS;
|
||||
|
|
|
@ -16,19 +16,16 @@
|
|||
*/
|
||||
package org.apache.lucene.search.matchhighlight;
|
||||
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween;
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.junit.Test;
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
|
||||
|
||||
public class TestPassageSelector extends LuceneTestCase {
|
||||
@Test
|
||||
|
@ -72,12 +69,12 @@ public class TestPassageSelector extends LuceneTestCase {
|
|||
checkPassages("0123456789a", "0123456789a", 300, 1);
|
||||
checkPassages("01234...", "0123456789a", 5, 1);
|
||||
checkPassages(
|
||||
"0123",
|
||||
"0123|45678",
|
||||
"0123456789a",
|
||||
15,
|
||||
2,
|
||||
new OffsetRange[0],
|
||||
new OffsetRange[] {new OffsetRange(0, 4), new OffsetRange(4, 9)});
|
||||
new OffsetRange[]{new OffsetRange(0, 4), new OffsetRange(4, 9)});
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -137,6 +134,36 @@ public class TestPassageSelector extends LuceneTestCase {
|
|||
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void defaultPassages() {
|
||||
// No highlights, multiple value ranges.
|
||||
checkPassages(
|
||||
"01|23|4567",
|
||||
"0123456789",
|
||||
100,
|
||||
100,
|
||||
ranges(),
|
||||
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
|
||||
|
||||
// No highlights, multiple value ranges, maxPassages = 1
|
||||
checkPassages(
|
||||
"01",
|
||||
"0123456789",
|
||||
100,
|
||||
1,
|
||||
ranges(),
|
||||
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
|
||||
|
||||
// No highlights, multiple value ranges, maxWindows size too short.
|
||||
checkPassages(
|
||||
"0123...|5678...",
|
||||
"0123456789",
|
||||
4,
|
||||
2,
|
||||
ranges(),
|
||||
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void passageScoring() {
|
||||
// More highlights per passage -> better passage
|
||||
|
@ -197,12 +224,24 @@ public class TestPassageSelector extends LuceneTestCase {
|
|||
new OffsetRange(18, Integer.MAX_VALUE)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHighlightAcrossAllowedValueRange() {
|
||||
checkPassages(
|
||||
"012>34<|>56<789",
|
||||
"0123456789",
|
||||
100,
|
||||
10,
|
||||
ranges(new OffsetRange(3, 7)),
|
||||
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void randomizedSanityCheck() {
|
||||
PassageSelector selector = new PassageSelector();
|
||||
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
|
||||
ArrayList<OffsetRange> highlights = new ArrayList<>();
|
||||
ArrayList<OffsetRange> ranges = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < 5000; i++) {
|
||||
String value =
|
||||
randomBoolean()
|
||||
|
@ -272,8 +311,8 @@ public class TestPassageSelector extends LuceneTestCase {
|
|||
OffsetRange[] ranges) {
|
||||
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
|
||||
PassageSelector selector = new PassageSelector();
|
||||
List<OffsetRange> hlist = Arrays.asList(highlights);
|
||||
List<OffsetRange> rangeList = Arrays.asList(ranges);
|
||||
List<OffsetRange> hlist = Arrays.asList(highlights);
|
||||
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
|
||||
return String.join("|", passageFormatter.format(value, passages, rangeList));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue