mirror of https://github.com/apache/lucene.git
LUCENE-9519: Correct behavior for highlights that cross multi-value boundaries (#1853)
This commit is contained in:
parent
e2f3f626ee
commit
59d83428bc
|
@ -62,4 +62,15 @@ public class OffsetRange {
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return Objects.hash(from, to);
|
return Objects.hash(from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a sub-range of this range (a copy). Subclasses should
|
||||||
|
* override and return an appropriate type covariant so that payloads
|
||||||
|
* are not lost.
|
||||||
|
*/
|
||||||
|
public OffsetRange slice(int from, int to) {
|
||||||
|
assert from >= this.from;
|
||||||
|
assert to <= this.to;
|
||||||
|
return new OffsetRange(from, to);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,22 +80,16 @@ public class PassageSelector {
|
||||||
int maxPassageWindow,
|
int maxPassageWindow,
|
||||||
int maxPassages,
|
int maxPassages,
|
||||||
List<OffsetRange> permittedPassageRanges) {
|
List<OffsetRange> permittedPassageRanges) {
|
||||||
assert markers instanceof RandomAccess && permittedPassageRanges instanceof RandomAccess;
|
assert markers instanceof RandomAccess;
|
||||||
|
assert permittedPassageRanges instanceof RandomAccess;
|
||||||
|
assert sortedAndNonOverlapping(permittedPassageRanges);
|
||||||
|
|
||||||
// Handle odd special cases early.
|
// Handle odd special cases early.
|
||||||
if (value.length() == 0 || maxPassageWindow == 0) {
|
if (value.length() == 0 || maxPassageWindow == 0) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort markers by their start offset, shortest first.
|
// Best passages so far.
|
||||||
markers.sort(
|
|
||||||
(a, b) -> {
|
|
||||||
int v = Integer.compare(a.from, b.from);
|
|
||||||
return v != 0 ? v : Integer.compare(a.to, b.to);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Determine a maximum offset window around each highlight marker and
|
|
||||||
// pick the best scoring passage candidates.
|
|
||||||
PriorityQueue<Passage> pq =
|
PriorityQueue<Passage> pq =
|
||||||
new PriorityQueue<>(maxPassages) {
|
new PriorityQueue<>(maxPassages) {
|
||||||
@Override
|
@Override
|
||||||
|
@ -104,7 +98,10 @@ public class PassageSelector {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
assert sortedAndNonOverlapping(permittedPassageRanges);
|
markers = splitOrTruncateToWindows(markers, maxPassageWindow, permittedPassageRanges);
|
||||||
|
|
||||||
|
// Sort markers by their start offset, shortest first.
|
||||||
|
markers.sort(Comparator.<OffsetRange> comparingInt(r -> r.from).thenComparingInt(r -> r.to));
|
||||||
|
|
||||||
final int max = markers.size();
|
final int max = markers.size();
|
||||||
int markerIndex = 0;
|
int markerIndex = 0;
|
||||||
|
@ -180,7 +177,7 @@ public class PassageSelector {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Handle the default, no highlighting markers case.
|
// Handle the default, no highlighting markers case.
|
||||||
passages = pickDefaultPassage(value, maxPassageWindow, permittedPassageRanges);
|
passages = pickDefaultPassage(value, maxPassageWindow, maxPassages, permittedPassageRanges);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're
|
// Correct passage boundaries from maxExclusive window. Typically shrink boundaries until we're
|
||||||
|
@ -224,11 +221,43 @@ public class PassageSelector {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort in the offset order again.
|
// Sort in the offset order again.
|
||||||
Arrays.sort(passages, (a, b) -> Integer.compare(a.from, b.from));
|
Arrays.sort(passages, Comparator.comparingInt(a -> a.from));
|
||||||
|
|
||||||
return Arrays.asList(passages);
|
return Arrays.asList(passages);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Truncate or split highlight markers that cross permitted value boundaries.
|
||||||
|
*/
|
||||||
|
private List<? extends OffsetRange> splitOrTruncateToWindows(List<? extends OffsetRange> markers, int maxPassageWindow, List<OffsetRange> permittedPassageRanges) {
|
||||||
|
// Process markers overlapping with each permitted window.
|
||||||
|
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
|
||||||
|
for (OffsetRange marker : markers) {
|
||||||
|
for (OffsetRange permitted : permittedPassageRanges) {
|
||||||
|
boolean needsNew = false;
|
||||||
|
int from = marker.from;
|
||||||
|
if (from < permitted.from) {
|
||||||
|
from = permitted.from;
|
||||||
|
needsNew = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int to = marker.to;
|
||||||
|
if (to > permitted.to) {
|
||||||
|
to = permitted.to;
|
||||||
|
needsNew = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (from >= to || (to - from) > maxPassageWindow) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
processedMarkers.add(needsNew ? marker.slice(from, to) : marker);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
markers = processedMarkers;
|
||||||
|
return markers;
|
||||||
|
}
|
||||||
|
|
||||||
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
|
static boolean sortedAndNonOverlapping(List<? extends OffsetRange> permittedPassageRanges) {
|
||||||
if (permittedPassageRanges.size() > 1) {
|
if (permittedPassageRanges.size() > 1) {
|
||||||
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
|
Iterator<? extends OffsetRange> i = permittedPassageRanges.iterator();
|
||||||
|
@ -248,19 +277,21 @@ public class PassageSelector {
|
||||||
* Invoked when no passages could be selected (due to constraints or lack of highlight markers).
|
* Invoked when no passages could be selected (due to constraints or lack of highlight markers).
|
||||||
*/
|
*/
|
||||||
protected Passage[] pickDefaultPassage(
|
protected Passage[] pickDefaultPassage(
|
||||||
CharSequence value, int maxCharacterWindow, List<OffsetRange> permittedPassageRanges) {
|
CharSequence value, int maxCharacterWindow, int maxPassages, List<OffsetRange> permittedPassageRanges) {
|
||||||
// Search for the first range that is not empty.
|
// Search for the first range that is not empty.
|
||||||
|
ArrayList<Passage> defaultPassages = new ArrayList<>();
|
||||||
for (OffsetRange o : permittedPassageRanges) {
|
for (OffsetRange o : permittedPassageRanges) {
|
||||||
|
if (defaultPassages.size() >= maxPassages) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
int to = Math.min(value.length(), o.to);
|
int to = Math.min(value.length(), o.to);
|
||||||
if (o.from < to) {
|
if (o.from < to) {
|
||||||
return new Passage[] {
|
defaultPassages.add(new Passage(o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList()));
|
||||||
new Passage(
|
|
||||||
o.from, o.from + Math.min(maxCharacterWindow, o.length()), Collections.emptyList())
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Passage[] {};
|
return defaultPassages.toArray(Passage[]::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean adjecentOrOverlapping(Passage a, Passage b) {
|
private static boolean adjecentOrOverlapping(Passage a, Passage b) {
|
||||||
|
|
|
@ -279,6 +279,23 @@ public class TestMatchRegionRetriever extends LuceneTestCase {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testIntervalQueryHighlightCrossingMultivalueBoundary() throws IOException {
|
||||||
|
String field = FLD_TEXT_POS;
|
||||||
|
new IndexBuilder(this::toField)
|
||||||
|
.doc(field, "foo", "bar")
|
||||||
|
.build(analyzer, reader -> {
|
||||||
|
assertThat(
|
||||||
|
highlights(reader, new IntervalQuery(field,
|
||||||
|
Intervals.unordered(
|
||||||
|
Intervals.term("foo"),
|
||||||
|
Intervals.term("bar")))),
|
||||||
|
containsInAnyOrder(
|
||||||
|
fmt("0: (field_text: '>foo< | >bar<')", field)
|
||||||
|
));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIntervalQueries() throws IOException {
|
public void testIntervalQueries() throws IOException {
|
||||||
String field = FLD_TEXT_POS_OFFS;
|
String field = FLD_TEXT_POS_OFFS;
|
||||||
|
|
|
@ -16,19 +16,16 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.search.matchhighlight;
|
package org.apache.lucene.search.matchhighlight;
|
||||||
|
|
||||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomAsciiLettersOfLengthBetween;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean;
|
import org.hamcrest.Matchers;
|
||||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
|
import org.junit.Test;
|
||||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomRealisticUnicodeOfCodepointLengthBetween;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import static com.carrotsearch.randomizedtesting.RandomizedTest.*;
|
||||||
import org.hamcrest.Matchers;
|
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class TestPassageSelector extends LuceneTestCase {
|
public class TestPassageSelector extends LuceneTestCase {
|
||||||
@Test
|
@Test
|
||||||
|
@ -72,7 +69,7 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
checkPassages("0123456789a", "0123456789a", 300, 1);
|
checkPassages("0123456789a", "0123456789a", 300, 1);
|
||||||
checkPassages("01234...", "0123456789a", 5, 1);
|
checkPassages("01234...", "0123456789a", 5, 1);
|
||||||
checkPassages(
|
checkPassages(
|
||||||
"0123",
|
"0123|45678",
|
||||||
"0123456789a",
|
"0123456789a",
|
||||||
15,
|
15,
|
||||||
2,
|
2,
|
||||||
|
@ -137,6 +134,36 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
|
ranges(new OffsetRange(0, 0), new OffsetRange(2, 2), new OffsetRange(6, 11)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void defaultPassages() {
|
||||||
|
// No highlights, multiple value ranges.
|
||||||
|
checkPassages(
|
||||||
|
"01|23|4567",
|
||||||
|
"0123456789",
|
||||||
|
100,
|
||||||
|
100,
|
||||||
|
ranges(),
|
||||||
|
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
|
||||||
|
|
||||||
|
// No highlights, multiple value ranges, maxPassages = 1
|
||||||
|
checkPassages(
|
||||||
|
"01",
|
||||||
|
"0123456789",
|
||||||
|
100,
|
||||||
|
1,
|
||||||
|
ranges(),
|
||||||
|
ranges(new OffsetRange(0, 2), new OffsetRange(2, 4), new OffsetRange(4, 8)));
|
||||||
|
|
||||||
|
// No highlights, multiple value ranges, maxWindows size too short.
|
||||||
|
checkPassages(
|
||||||
|
"0123...|5678...",
|
||||||
|
"0123456789",
|
||||||
|
4,
|
||||||
|
2,
|
||||||
|
ranges(),
|
||||||
|
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void passageScoring() {
|
public void passageScoring() {
|
||||||
// More highlights per passage -> better passage
|
// More highlights per passage -> better passage
|
||||||
|
@ -197,12 +224,24 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
new OffsetRange(18, Integer.MAX_VALUE)));
|
new OffsetRange(18, Integer.MAX_VALUE)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHighlightAcrossAllowedValueRange() {
|
||||||
|
checkPassages(
|
||||||
|
"012>34<|>56<789",
|
||||||
|
"0123456789",
|
||||||
|
100,
|
||||||
|
10,
|
||||||
|
ranges(new OffsetRange(3, 7)),
|
||||||
|
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void randomizedSanityCheck() {
|
public void randomizedSanityCheck() {
|
||||||
PassageSelector selector = new PassageSelector();
|
PassageSelector selector = new PassageSelector();
|
||||||
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
|
PassageFormatter formatter = new PassageFormatter("...", ">", "<");
|
||||||
ArrayList<OffsetRange> highlights = new ArrayList<>();
|
ArrayList<OffsetRange> highlights = new ArrayList<>();
|
||||||
ArrayList<OffsetRange> ranges = new ArrayList<>();
|
ArrayList<OffsetRange> ranges = new ArrayList<>();
|
||||||
|
|
||||||
for (int i = 0; i < 5000; i++) {
|
for (int i = 0; i < 5000; i++) {
|
||||||
String value =
|
String value =
|
||||||
randomBoolean()
|
randomBoolean()
|
||||||
|
@ -272,8 +311,8 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
OffsetRange[] ranges) {
|
OffsetRange[] ranges) {
|
||||||
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
|
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
|
||||||
PassageSelector selector = new PassageSelector();
|
PassageSelector selector = new PassageSelector();
|
||||||
List<OffsetRange> hlist = Arrays.asList(highlights);
|
|
||||||
List<OffsetRange> rangeList = Arrays.asList(ranges);
|
List<OffsetRange> rangeList = Arrays.asList(ranges);
|
||||||
|
List<OffsetRange> hlist = Arrays.asList(highlights);
|
||||||
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
|
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
|
||||||
return String.join("|", passageFormatter.format(value, passages, rangeList));
|
return String.join("|", passageFormatter.format(value, passages, rangeList));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue