Prevent DefaultPassageFormatter from taking shorter overlapping passages (#13384)

This commit is contained in:
Zack Kendall 2024-06-21 02:57:59 -07:00 committed by GitHub
parent c7f4b8dee2
commit 06c4a4b9e0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 42 additions and 8 deletions

View File

@ -280,6 +280,8 @@ Optimizations
Bug Fixes
---------------------
* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
some corner cases. (Greg Miller)

View File

@ -76,10 +76,11 @@ public class DefaultPassageFormatter extends PassageFormatter {
int end = passage.getMatchEnds()[i];
assert end > start;
// its possible to have overlapping terms.
// Look ahead to expand 'end' past all overlapping:
// It's possible to have overlapping terms.
// Look ahead to expand 'end' past all overlapping.
// Only take new end if it is larger than current end.
while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
end = passage.getMatchEnds()[++i];
end = Math.max(end, passage.getMatchEnds()[++i]);
}
end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage

View File

@ -18,6 +18,7 @@
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
public class TestDefaultPassageFormatter extends LuceneTestCase {
public void testBasic() throws Exception {
@ -52,4 +53,26 @@ public class TestDefaultPassageFormatter extends LuceneTestCase {
+ "&lt;&#x2F;div&gt; of this <u>very</u> formatter.\u2026 It&#x27;s not <u>very</u> N&#x2F;A!",
formatter.format(passages, text));
}
/**
 * Checks how {@link DefaultPassageFormatter} renders a passage whose matches overlap.
 *
 * <p>The single passage spans offsets 0-41 of the content and carries a short match (0-3) that
 * shares its start with a longer match (0-22), plus two further matches (4-8 and 9-22) that lie
 * inside the longer one. The expected output highlights the longest covering span once rather
 * than cutting the highlight short at the end of a later, shorter overlapping match.
 */
public void testOverlappingPassages() throws Exception {
  String content = "Yin yang loooooooooong, yin gap yang yong";
  Passage[] passages = new Passage[1];
  passages[0] = new Passage();
  passages[0].setStartOffset(0);
  passages[0].setEndOffset(41);
  passages[0].setScore(5.93812f);
  // Overlapping group: every match below starts before offset 22, the end of the longest match.
  passages[0].addMatch(0, 3, new BytesRef("yin"), 1);
  passages[0].addMatch(0, 22, new BytesRef("yin yang loooooooooong"), 1);
  passages[0].addMatch(4, 8, new BytesRef("yang"), 1);
  passages[0].addMatch(9, 22, new BytesRef("loooooooooong"), 1);
  // Stand-alone matches that do not overlap anything else.
  passages[0].addMatch(24, 27, new BytesRef("yin"), 1);
  passages[0].addMatch(32, 36, new BytesRef("yang"), 1);

  // test default
  DefaultPassageFormatter formatter = new DefaultPassageFormatter();
  assertEquals(
      "<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong",
      formatter.format(passages, content));
}
}

View File

@ -1422,7 +1422,7 @@ public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
Set.of("field_tripples", "field_characters"),
"danc",
"<b>danc</b>e with star",
"<b>da</b><b>n</b><b>c</b>e with star");
"<b>dan</b><b>c</b>e with star");
}
}
}

View File

@ -183,13 +183,15 @@ public class TestUnifiedHighlighterStrictPhrases extends UnifiedHighlighterTestB
}
public void testWithSameTermQuery() throws IOException {
indexWriter.addDocument(newDoc("Yin yang, yin gap yang"));
indexWriter.addDocument(newDoc("Yin yang loooooooooong, yin gap yang yong"));
initReaderSearcherHighlighter();
BooleanQuery query =
new BooleanQuery.Builder()
.add(new TermQuery(new Term("body", "yin")), BooleanClause.Occur.MUST)
.add(newPhraseQuery("body", "yin yang"), BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("body", "yang")), BooleanClause.Occur.MUST)
.add(new TermQuery(new Term("body", "loooooooooong")), BooleanClause.Occur.MUST)
.add(newPhraseQuery("body", "yin\\ yang\\ loooooooooong"), BooleanClause.Occur.MUST)
// add queries for other fields; we shouldn't highlight these because of that.
.add(new TermQuery(new Term("title", "yang")), BooleanClause.Occur.SHOULD)
.build();
@ -199,9 +201,15 @@ public class TestUnifiedHighlighterStrictPhrases extends UnifiedHighlighterTestB
false); // We don't want duplicates from "Yin" being in TermQuery & PhraseQuery.
String[] snippets = highlighter.highlight("body", query, topDocs);
if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) {
assertArrayEquals(new String[] {"<b>Yin yang</b>, <b>yin</b> gap yang"}, snippets);
assertArrayEquals(
new String[] {"<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong"},
snippets);
} else {
assertArrayEquals(new String[] {"<b>Yin</b> <b>yang</b>, <b>yin</b> gap yang"}, snippets);
assertArrayEquals(
new String[] {
"<b>Yin</b> <b>yang</b> <b>loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong"
},
snippets);
}
}