mirror of https://github.com/apache/lucene.git
Prevent DefaultPassageFormatter from taking shorter overlapping passages (#13384)
This commit is contained in:
parent
c7f4b8dee2
commit
06c4a4b9e0
|
@ -280,6 +280,8 @@ Optimizations
|
|||
Bug Fixes
|
||||
---------------------
|
||||
|
||||
* GITHUB#13384: Fix highlighter to use longer passages instead of shorter individual terms. (Zack Kendall)
|
||||
|
||||
* GITHUB#13463: Address bug in MultiLeafKnnCollector causing #minCompetitiveSimilarity to stay artificially low in
|
||||
some corner cases. (Greg Miller)
|
||||
|
||||
|
|
|
@ -76,10 +76,11 @@ public class DefaultPassageFormatter extends PassageFormatter {
|
|||
|
||||
int end = passage.getMatchEnds()[i];
|
||||
assert end > start;
|
||||
// its possible to have overlapping terms.
|
||||
// Look ahead to expand 'end' past all overlapping:
|
||||
// It's possible to have overlapping terms.
|
||||
// Look ahead to expand 'end' past all overlapping.
|
||||
// Only take new end if it is larger than current end.
|
||||
while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
|
||||
end = passage.getMatchEnds()[++i];
|
||||
end = Math.max(end, passage.getMatchEnds()[++i]);
|
||||
}
|
||||
end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
public class TestDefaultPassageFormatter extends LuceneTestCase {
|
||||
public void testBasic() throws Exception {
|
||||
|
@ -52,4 +53,26 @@ public class TestDefaultPassageFormatter extends LuceneTestCase {
|
|||
+ "</div> of this <u>very</u> formatter.\u2026 It's not <u>very</u> N/A!",
|
||||
formatter.format(passages, text));
|
||||
}
|
||||
|
||||
public void testOverlappingPassages() throws Exception {
|
||||
String content = "Yin yang loooooooooong, yin gap yang yong";
|
||||
Passage[] passages = new Passage[1];
|
||||
passages[0] = new Passage();
|
||||
passages[0].setStartOffset(0);
|
||||
passages[0].setEndOffset(41);
|
||||
passages[0].setScore(5.93812f);
|
||||
passages[0].setScore(5.93812f);
|
||||
passages[0].addMatch(0, 3, new BytesRef("yin"), 1);
|
||||
passages[0].addMatch(0, 22, new BytesRef("yin yang loooooooooooong"), 1);
|
||||
passages[0].addMatch(4, 8, new BytesRef("yang"), 1);
|
||||
passages[0].addMatch(9, 22, new BytesRef("loooooooooong"), 1);
|
||||
passages[0].addMatch(24, 27, new BytesRef("yin"), 1);
|
||||
passages[0].addMatch(32, 36, new BytesRef("yang"), 1);
|
||||
|
||||
// test default
|
||||
DefaultPassageFormatter formatter = new DefaultPassageFormatter();
|
||||
assertEquals(
|
||||
"<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong",
|
||||
formatter.format(passages, content));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1422,7 +1422,7 @@ public class TestUnifiedHighlighter extends UnifiedHighlighterTestBase {
|
|||
Set.of("field_tripples", "field_characters"),
|
||||
"danc",
|
||||
"<b>danc</b>e with star",
|
||||
"<b>da</b><b>n</b><b>c</b>e with star");
|
||||
"<b>dan</b><b>c</b>e with star");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -183,13 +183,15 @@ public class TestUnifiedHighlighterStrictPhrases extends UnifiedHighlighterTestB
|
|||
}
|
||||
|
||||
public void testWithSameTermQuery() throws IOException {
|
||||
indexWriter.addDocument(newDoc("Yin yang, yin gap yang"));
|
||||
indexWriter.addDocument(newDoc("Yin yang loooooooooong, yin gap yang yong"));
|
||||
initReaderSearcherHighlighter();
|
||||
|
||||
BooleanQuery query =
|
||||
new BooleanQuery.Builder()
|
||||
.add(new TermQuery(new Term("body", "yin")), BooleanClause.Occur.MUST)
|
||||
.add(newPhraseQuery("body", "yin yang"), BooleanClause.Occur.MUST)
|
||||
.add(new TermQuery(new Term("body", "yang")), BooleanClause.Occur.MUST)
|
||||
.add(new TermQuery(new Term("body", "loooooooooong")), BooleanClause.Occur.MUST)
|
||||
.add(newPhraseQuery("body", "yin\\ yang\\ loooooooooong"), BooleanClause.Occur.MUST)
|
||||
// add queries for other fields; we shouldn't highlight these because of that.
|
||||
.add(new TermQuery(new Term("title", "yang")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
|
@ -199,9 +201,15 @@ public class TestUnifiedHighlighterStrictPhrases extends UnifiedHighlighterTestB
|
|||
false); // We don't want duplicates from "Yin" being in TermQuery & PhraseQuery.
|
||||
String[] snippets = highlighter.highlight("body", query, topDocs);
|
||||
if (highlighter.getFlags("body").contains(HighlightFlag.WEIGHT_MATCHES)) {
|
||||
assertArrayEquals(new String[] {"<b>Yin yang</b>, <b>yin</b> gap yang"}, snippets);
|
||||
assertArrayEquals(
|
||||
new String[] {"<b>Yin yang loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong"},
|
||||
snippets);
|
||||
} else {
|
||||
assertArrayEquals(new String[] {"<b>Yin</b> <b>yang</b>, <b>yin</b> gap yang"}, snippets);
|
||||
assertArrayEquals(
|
||||
new String[] {
|
||||
"<b>Yin</b> <b>yang</b> <b>loooooooooong</b>, <b>yin</b> gap <b>yang</b> yong"
|
||||
},
|
||||
snippets);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue