LUCENE-10135: Correct passage selector behavior for long matching snippets (#334)

This commit is contained in:
Dawid Weiss 2021-09-30 15:05:41 +02:00 committed by GitHub
parent 797cfbf477
commit 1bb4554832
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 113 additions and 15 deletions

View File

@ -157,6 +157,8 @@ API Changes
Improvements Improvements
* LUCENE-10135: Correct passage selector behavior for long matching snippets (Dawid Weiss).
* LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss) * LUCENE-9960: Avoid unnecessary top element replacement for equal elements in PriorityQueue. (Dawid Weiss)
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions). * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).

View File

@ -19,7 +19,7 @@ package org.apache.lucene.search.matchhighlight;
import java.util.Objects; import java.util.Objects;
/** A non-empty range of offset positions. */ /** A non-empty range of offset positions. */
public class OffsetRange { public class OffsetRange implements Cloneable {
/** Start index, inclusive. */ /** Start index, inclusive. */
public final int from; public final int from;
@ -70,4 +70,14 @@ public class OffsetRange {
assert to <= this.to; assert to <= this.to;
return new OffsetRange(from, to); return new OffsetRange(from, to);
} }
/** @return {@code true} if this range contains or is equal to {@code other}. */
public boolean contains(OffsetRange other) {
return from <= other.from && to >= other.to;
}
@Override
public OffsetRange clone() {
return new OffsetRange(from, to);
}
} }

View File

@ -127,8 +127,8 @@ public class PassageSelector {
if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) { if (m.from >= range.from && m.to <= rangeTo && m.length() <= maxPassageWindow) {
// Adjust the window range to center the highlight marker. // Adjust the window range to center the highlight marker.
int from = (m.from + m.to - maxPassageWindow) / 2; int from = Math.toIntExact(((long) m.from + m.to - maxPassageWindow) / 2);
int to = (m.from + m.to + maxPassageWindow) / 2; int to = Math.toIntExact(((long) m.from + m.to + maxPassageWindow) / 2);
if (from < range.from) { if (from < range.from) {
to += range.from - from; to += range.from - from;
from = range.from; from = range.from;
@ -234,24 +234,30 @@ public class PassageSelector {
ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size()); ArrayList<OffsetRange> processedMarkers = new ArrayList<>(markers.size());
for (OffsetRange marker : markers) { for (OffsetRange marker : markers) {
for (OffsetRange permitted : permittedPassageRanges) { for (OffsetRange permitted : permittedPassageRanges) {
boolean needsNew = false; boolean newSlice = false;
int from = marker.from; int from = marker.from;
if (from < permitted.from) { if (from < permitted.from) {
from = permitted.from; from = permitted.from;
needsNew = true; newSlice = true;
} }
int to = marker.to; int to = marker.to;
if (to > permitted.to) { if (to > permitted.to) {
to = permitted.to; to = permitted.to;
needsNew = true; newSlice = true;
} }
if (from >= to || (to - from) > maxPassageWindow) { if (from >= to) {
continue; continue;
} }
processedMarkers.add(needsNew ? marker.slice(from, to) : marker); if (to - from > maxPassageWindow) {
to = from + maxPassageWindow;
newSlice = true;
}
processedMarkers.add(newSlice ? marker.slice(from, to) : marker);
} }
} }
markers = processedMarkers; markers = processedMarkers;

View File

@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.hamcrest.MatcherAssert;
import org.hamcrest.Matchers; import org.hamcrest.Matchers;
import org.junit.Test; import org.junit.Test;
@ -56,6 +57,17 @@ public class TestPassageSelector extends LuceneTestCase {
new OffsetRange(8, 11)); new OffsetRange(8, 11));
} }
@Test
public void checkOddOverlaps() {
checkPassages(
"foo >bar >baz<<> abc< xyz",
"foo bar baz abc xyz",
300,
100,
new OffsetRange(4, 11),
new OffsetRange(8, 15));
}
@Test @Test
public void oneMarker() { public void oneMarker() {
checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1)); checkPassages(">0<123456789a", "0123456789a", 300, 1, new OffsetRange(0, 1));
@ -87,7 +99,27 @@ public class TestPassageSelector extends LuceneTestCase {
@Test @Test
public void highlightLargerThanWindow() { public void highlightLargerThanWindow() {
String value = "0123456789a"; String value = "0123456789a";
checkPassages("0123...", value, 4, 1, new OffsetRange(0, value.length())); checkPassages(">0123<...", value, 4, 1, new OffsetRange(0, value.length()));
checkPassages("...>123456<...", value, 6, 1, new OffsetRange(1, value.length()));
}
@Test
public void highlightLargerThanWindowWithSubranges() {
String value = "0123456789a";
checkPassages(
"0>12<|>456789<...",
value,
6,
2,
new OffsetRange[] {new OffsetRange(1, value.length())},
new OffsetRange[] {new OffsetRange(0, 3), new OffsetRange(4, value.length())});
checkPassages(
">01<...|>45<...",
value,
2,
2,
new OffsetRange[] {new OffsetRange(0, value.length())},
new OffsetRange[] {new OffsetRange(0, 3), new OffsetRange(4, value.length())});
} }
@Test @Test
@ -109,6 +141,11 @@ public class TestPassageSelector extends LuceneTestCase {
checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200)); checkPassages("0123456789a", "0123456789a", 300, 1, new OffsetRange(100, 200));
} }
@Test
public void largeWindow() {
checkPassages("01234>567<89a", "0123456789a", Integer.MAX_VALUE, 1, new OffsetRange(5, 8));
}
@Test @Test
public void twoPassages() { public void twoPassages() {
checkPassages( checkPassages(
@ -163,6 +200,14 @@ public class TestPassageSelector extends LuceneTestCase {
ranges(new OffsetRange(0, 5), new OffsetRange(5, 10))); ranges(new OffsetRange(0, 5), new OffsetRange(5, 10)));
} }
@Test
public void whitespaceBoundaries() {
PassageSelector selector =
new PassageSelector(PassageSelector.DEFAULT_SCORER, new BreakIteratorShrinkingAdjuster());
checkPassages(
"...> value <...", "x value y", 9, 1, selector, new OffsetRange(9, 18));
}
@Test @Test
public void passageScoring() { public void passageScoring() {
// More highlights per passage -> better passage // More highlights per passage -> better passage
@ -278,19 +323,42 @@ public class TestPassageSelector extends LuceneTestCase {
// Just make sure there are no exceptions. // Just make sure there are no exceptions.
List<Passage> passages = List<Passage> passages =
selector.pickBest(value, highlights, charWindow, maxPassages, permittedRanges); selector.pickBest(value, highlights, charWindow, maxPassages, permittedRanges);
formatter.format(value, passages, permittedRanges);
// Make sure no empty ranges are returned.
passages.forEach(
p -> {
assertNotEquals(p.from, p.to);
p.markers.forEach(
r -> {
assertNotEquals(p.from, p.to);
});
});
List<String> format = formatter.format(value, passages, permittedRanges);
MatcherAssert.assertThat(format, Matchers.not(Matchers.hasItem("><")));
} }
} }
private void checkPassages( private void checkPassages(
String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) { String expected, String value, int charWindow, int maxPassages, OffsetRange... highlights) {
checkPassages(expected, value, charWindow, maxPassages, new PassageSelector(), highlights);
}
private void checkPassages(
String expected,
String value,
int charWindow,
int maxPassages,
PassageSelector selector,
OffsetRange... highlights) {
checkPassages( checkPassages(
expected, expected,
value, value,
charWindow, charWindow,
maxPassages, maxPassages,
highlights, highlights,
ranges(new OffsetRange(0, value.length()))); ranges(new OffsetRange(0, value.length())),
selector);
} }
private void checkPassages( private void checkPassages(
@ -300,13 +368,25 @@ public class TestPassageSelector extends LuceneTestCase {
int maxPassages, int maxPassages,
OffsetRange[] highlights, OffsetRange[] highlights,
OffsetRange[] ranges) { OffsetRange[] ranges) {
String result = getPassages(value, charWindow, maxPassages, highlights, ranges); checkPassages(
expected, value, charWindow, maxPassages, highlights, ranges, new PassageSelector());
}
private void checkPassages(
String expected,
String value,
int charWindow,
int maxPassages,
OffsetRange[] highlights,
OffsetRange[] ranges,
PassageSelector selector) {
String result = getPassages(value, charWindow, maxPassages, highlights, ranges, selector);
if (!Objects.equals(result, expected)) { if (!Objects.equals(result, expected)) {
System.out.println("Value: " + value); System.out.println("Value: " + value);
System.out.println("Result: " + result); System.out.println("Result: " + result);
System.out.println("Expect: " + expected); System.out.println("Expect: " + expected);
} }
assertThat(result, Matchers.equalTo(expected)); MatcherAssert.assertThat(result, Matchers.equalTo(expected));
} }
protected String getPassages( protected String getPassages(
@ -314,9 +394,9 @@ public class TestPassageSelector extends LuceneTestCase {
int charWindow, int charWindow,
int maxPassages, int maxPassages,
OffsetRange[] highlights, OffsetRange[] highlights,
OffsetRange[] ranges) { OffsetRange[] ranges,
PassageSelector selector) {
PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<"); PassageFormatter passageFormatter = new PassageFormatter("...", ">", "<");
PassageSelector selector = new PassageSelector();
List<OffsetRange> rangeList = Arrays.asList(ranges); List<OffsetRange> rangeList = Arrays.asList(ranges);
List<OffsetRange> hlist = Arrays.asList(highlights); List<OffsetRange> hlist = Arrays.asList(highlights);
List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList); List<Passage> passages = selector.pickBest(value, hlist, charWindow, maxPassages, rangeList);