mirror of https://github.com/apache/lucene.git
LUCENE-10135: Correct passage selector behavior for long matching snippets (#334)
This commit is contained in:
parent
797cfbf477
commit
1bb4554832
|
@ -157,6 +157,8 @@ API Changes
|
||||||
|
|
||||||
Improvements
|
Improvements
|
||||||
|
|
||||||
|
* LUCENE-10135: Correct passage selector behavior for long matching snippets (Dawid Weiss).
|
||||||
|
|
||||||
* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)
|
* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
|
||||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.search.matchhighlight;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
/** A non-empty range of offset positions. */
|
/** A non-empty range of offset positions. */
|
||||||
public class OffsetRange {
|
public class OffsetRange implements Cloneable {
|
||||||
/** Start index, inclusive. */
|
/** Start index, inclusive. */
|
||||||
public final int from;
|
public final int from;
|
||||||
|
|
||||||
|
@ -70,4 +70,14 @@ public class OffsetRange {
|
||||||
assert to <= this.to;
|
assert to <= this.to;
|
||||||
return new OffsetRange(from, to);
|
return new OffsetRange(from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @return {@code true} if this range contains or is equal to {@code other}. */
|
||||||
|
public boolean contains(OffsetRange other) {
|
||||||
|
return from <= other.from && to >= other.to;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public OffsetRange clone() {
|
||||||
|
return new OffsetRange(from, to);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -127,8 +127,8 @@ public class PassageSelector {
|
||||||
if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) {
|
if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) {
|
||||||
|
|
||||||
// Adjust the window range to center the highlight marker.
|
// Adjust the window range to center the highlight marker.
|
||||||
int from = (m.from + m.to - maxPassageWindow) / 2;
|
int from = Math.toIntExact(((long) m.from + m.to - maxPassageWindow) / 2);
|
||||||
int to = (m.from + m.to + maxPassageWindow) / 2;
|
int to = Math.toIntExact(((long) m.from + m.to + maxPassageWindow) / 2);
|
||||||
if (from < range.from) {
|
if (from < range.from) {
|
||||||
to += range.from - from;
|
to += range.from - from;
|
||||||
from = range.from;
|
from = range.from;
|
||||||
|
@ -234,24 +234,30 @@ public class PassageSelector {
|
||||||
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
|
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
|
||||||
for (OffsetRange marker : markers) {
|
for (OffsetRange marker : markers) {
|
||||||
for (OffsetRange permitted : permittedPassageRanges) {
|
for (OffsetRange permitted : permittedPassageRanges) {
|
||||||
boolean needsNew = false;
|
boolean newSlice = false;
|
||||||
|
|
||||||
int from = marker.from;
|
int from = marker.from;
|
||||||
if (from < permitted.from) {
|
if (from < permitted.from) {
|
||||||
from = permitted.from;
|
from = permitted.from;
|
||||||
needsNew = true;
|
newSlice = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
int to = marker.to;
|
int to = marker.to;
|
||||||
if (to > permitted.to) {
|
if (to > permitted.to) {
|
||||||
to = permitted.to;
|
to = permitted.to;
|
||||||
needsNew = true;
|
newSlice = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (from >= to || (to - from) > maxPassageWindow) {
|
if (from >= to) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
processedMarkers.add(needsNew ? marker.slice(from, to) : marker);
|
if (to - from > maxPassageWindow) {
|
||||||
|
to = from + maxPassageWindow;
|
||||||
|
newSlice = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
processedMarkers.add(newSlice ? marker.slice(from, to) : marker);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
markers = processedMarkers;
|
markers = processedMarkers;
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.hamcrest.MatcherAssert;
|
||||||
import org.hamcrest.Matchers;
|
import org.hamcrest.Matchers;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -56,6 +57,17 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
new OffsetRange(8, 11));
|
new OffsetRange(8, 11));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void checkOddOverlaps() {
|
||||||
|
checkPassages(
|
||||||
|
"foo >bar >baz<<> abc< xyz",
|
||||||
|
"foo bar baz abc xyz",
|
||||||
|
300,
|
||||||
|
100,
|
||||||
|
new OffsetRange(4, 11),
|
||||||
|
new OffsetRange(8, 15));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void oneMarker() {
|
public void oneMarker() {
|
||||||
checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1));
|
checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1));
|
||||||
|
@ -87,7 +99,27 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
@Test
|
@Test
|
||||||
public void highlightLargerThanWindow() {
|
public void highlightLargerThanWindow() {
|
||||||
String value = "0123456789a";
|
String value = "0123456789a";
|
||||||
checkPassages("0123...", value, 4, 1, new OffsetRange(0, value.length()));
|
checkPassages(">0123<...", value, 4, 1, new OffsetRange(0, value.length()));
|
||||||
|
checkPassages("...>123456<...", value, 6, 1, new OffsetRange(1, value.length()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void highlightLargerThanWindowWithSubranges() {
|
||||||
|
String value = "0123456789a";
|
||||||
|
checkPassages(
|
||||||
|
"0>12<|>456789<...",
|
||||||
|
value,
|
||||||
|
6,
|
||||||
|
2,
|
||||||
|
new OffsetRange[] {new OffsetRange(1, value.length())},
|
||||||
|
new OffsetRange[] {new OffsetRange(0, 3), new OffsetRange(4, value.length())});
|
||||||
|
checkPassages(
|
||||||
|
">01<...|>45<...",
|
||||||
|
value,
|
||||||
|
2,
|
||||||
|
2,
|
||||||
|
new OffsetRange[] {new OffsetRange(0, value.length())},
|
||||||
|
new OffsetRange[] {new OffsetRange(0, 3), new OffsetRange(4, value.length())});
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -109,6 +141,11 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200));
|
checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void largeWindow() {
|
||||||
|
checkPassages("01234>567<89a", "0123456789a", Integer.MAX_VALUE, 1, new OffsetRange(5, 8));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void twoPassages() {
|
public void twoPassages() {
|
||||||
checkPassages(
|
checkPassages(
|
||||||
|
@ -163,6 +200,14 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
|
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void whitespaceBoundaries() {
|
||||||
|
PassageSelector selector =
|
||||||
|
new PassageSelector(PassageSelector.DEFAULT_SCORER, new BreakIteratorShrinkingAdjuster());
|
||||||
|
checkPassages(
|
||||||
|
"...> value <...", "x value y", 9, 1, selector, new OffsetRange(9, 18));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void passageScoring() {
|
public void passageScoring() {
|
||||||
// More highlights per passage -> better passage
|
// More highlights per passage -> better passage
|
||||||
|
@ -278,19 +323,42 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
// Just make sure there are no exceptions.
|
// Just make sure there are no exceptions.
|
||||||
List<Passage> passages =
|
List<Passage> passages =
|
||||||
selector.pickBest(value, highlights, charWindow, maxPassages, permittedRanges);
|
selector.pickBest(value, highlights, charWindow, maxPassages, permittedRanges);
|
||||||
formatter.format(value, passages, permittedRanges);
|
|
||||||
|
// Make sure no empty ranges are returned.
|
||||||
|
passages.forEach(
|
||||||
|
p -> {
|
||||||
|
assertNotEquals(p.from, p.to);
|
||||||
|
p.markers.forEach(
|
||||||
|
r -> {
|
||||||
|
assertNotEquals(p.from, p.to);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
List<String> format = formatter.format(value, passages, permittedRanges);
|
||||||
|
MatcherAssert.assertThat(format, Matchers.not(Matchers.hasItem("><")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkPassages(
|
private void checkPassages(
|
||||||
String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) {
|
String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) {
|
||||||
|
checkPassages(expected, value, charWindow, maxPassages, new PassageSelector(), highlights);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkPassages(
|
||||||
|
String expected,
|
||||||
|
String value,
|
||||||
|
int charWindow,
|
||||||
|
int maxPassages,
|
||||||
|
PassageSelector selector,
|
||||||
|
OffsetRange... highlights) {
|
||||||
checkPassages(
|
checkPassages(
|
||||||
expected,
|
expected,
|
||||||
value,
|
value,
|
||||||
charWindow,
|
charWindow,
|
||||||
maxPassages,
|
maxPassages,
|
||||||
highlights,
|
highlights,
|
||||||
ranges(new OffsetRange(0, value.length())));
|
ranges(new OffsetRange(0, value.length())),
|
||||||
|
selector);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkPassages(
|
private void checkPassages(
|
||||||
|
@ -300,13 +368,25 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
int maxPassages,
|
int maxPassages,
|
||||||
OffsetRange[] highlights,
|
OffsetRange[] highlights,
|
||||||
OffsetRange[] ranges) {
|
OffsetRange[] ranges) {
|
||||||
String result = getPassages(value, charWindow, maxPassages, highlights, ranges);
|
checkPassages(
|
||||||
|
expected, value, charWindow, maxPassages, highlights, ranges, new PassageSelector());
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkPassages(
|
||||||
|
String expected,
|
||||||
|
String value,
|
||||||
|
int charWindow,
|
||||||
|
int maxPassages,
|
||||||
|
OffsetRange[] highlights,
|
||||||
|
OffsetRange[] ranges,
|
||||||
|
PassageSelector selector) {
|
||||||
|
String result = getPassages(value, charWindow, maxPassages, highlights, ranges, selector);
|
||||||
if (!Objects.equals(result, expected)) {
|
if (!Objects.equals(result, expected)) {
|
||||||
System.out.println("Value: " + value);
|
System.out.println("Value: " + value);
|
||||||
System.out.println("Result: " + result);
|
System.out.println("Result: " + result);
|
||||||
System.out.println("Expect: " + expected);
|
System.out.println("Expect: " + expected);
|
||||||
}
|
}
|
||||||
assertThat(result, Matchers.equalTo(expected));
|
MatcherAssert.assertThat(result, Matchers.equalTo(expected));
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getPassages(
|
protected String getPassages(
|
||||||
|
@ -314,9 +394,9 @@ public class TestPassageSelector extends LuceneTestCase {
|
||||||
int charWindow,
|
int charWindow,
|
||||||
int maxPassages,
|
int maxPassages,
|
||||||
OffsetRange[] highlights,
|
OffsetRange[] highlights,
|
||||||
OffsetRange[] ranges) {
|
OffsetRange[] ranges,
|
||||||
|
PassageSelector selector) {
|
||||||
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
|
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
|
||||||
PassageSelector selector = new PassageSelector();
|
|
||||||
List<OffsetRange> rangeList = Arrays.asList(ranges);
|
List<OffsetRange> rangeList = Arrays.asList(ranges);
|
||||||
List<OffsetRange> hlist = Arrays.asList(highlights);
|
List<OffsetRange> hlist = Arrays.asList(highlights);
|
||||||
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
|
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);
|
||||||
|
|
Loading…
Reference in New Issue