mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-24 17:09:48 +00:00
Fix highlighting of overlapping terms in the unified highlighter (#47227)
The passage formatter that the unified highlighter use doesn't handle terms with overlapping offsets. For tokenizer that provides multiple segmentation of the same terms (edge ngram for instance) the formatter should select the largest span in order to highlight the term only once. This change implements this logic.
This commit is contained in:
parent
4f722f0f53
commit
c340814b34
@ -165,7 +165,7 @@ setup:
|
||||
- match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" }
|
||||
- match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" }
|
||||
- match: { hits.hits.0.highlight.a_field: ["quick <em>brown</em> <em>fox</em> <em>jump</em> lazy dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox</em><em> jump</em> lazy dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox jump</em> lazy dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump</em> lazy dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._4gram: null }
|
||||
|
||||
@ -197,6 +197,6 @@ setup:
|
||||
- match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" }
|
||||
- match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" }
|
||||
- match: { hits.hits.0.highlight.a_field: ["quick <em>brown</em> <em>fox</em> <em>jump</em> <em>lazy</em> dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox</em><em> jump</em><em> lazy</em> dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump</em><em> lazy</em> dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._2gram: ["quick <em>brown fox jump lazy</em> dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._3gram: ["quick <em>brown fox jump lazy</em> dog"] }
|
||||
- match: { hits.hits.0.highlight.a_field\._4gram: ["quick <em>brown fox jump lazy</em> dog"] }
|
||||
|
@ -49,17 +49,23 @@ public class CustomPassageFormatter extends PassageFormatter {
|
||||
pos = passage.getStartOffset();
|
||||
for (int i = 0; i < passage.getNumMatches(); i++) {
|
||||
int start = passage.getMatchStarts()[i];
|
||||
assert start >= pos && start < passage.getEndOffset();
|
||||
// append content before this start
|
||||
append(sb, content, pos, start);
|
||||
|
||||
int end = passage.getMatchEnds()[i];
|
||||
// its possible to have overlapping terms
|
||||
if (start > pos) {
|
||||
append(sb, content, pos, start);
|
||||
}
|
||||
if (end > pos) {
|
||||
sb.append(preTag);
|
||||
append(sb, content, Math.max(pos, start), end);
|
||||
sb.append(postTag);
|
||||
pos = end;
|
||||
assert end > start;
|
||||
// Look ahead to expand 'end' past all overlapping:
|
||||
while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) {
|
||||
end = passage.getMatchEnds()[++i];
|
||||
}
|
||||
end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage
|
||||
|
||||
sb.append(preTag);
|
||||
append(sb, content, start, end);
|
||||
sb.append(postTag);
|
||||
|
||||
pos = end;
|
||||
}
|
||||
// its possible a "term" from the analyzer could span a sentence boundary.
|
||||
append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
|
||||
|
@ -20,6 +20,8 @@
|
||||
package org.apache.lucene.search.uhighlight;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.custom.CustomAnalyzer;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
@ -33,6 +35,7 @@ import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.CommonTermsQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
@ -240,4 +243,33 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
|
||||
BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs);
|
||||
}
|
||||
|
||||
public void testOverlappingTerms() throws Exception {
|
||||
final String[] inputs = {
|
||||
"bro",
|
||||
"brown",
|
||||
"brownie",
|
||||
"browser"
|
||||
};
|
||||
final String[] outputs = {
|
||||
"<b>bro</b>",
|
||||
"<b>brown</b>",
|
||||
"<b>browni</b>e",
|
||||
"<b>browser</b>"
|
||||
};
|
||||
BooleanQuery query = new BooleanQuery.Builder()
|
||||
.add(new FuzzyQuery(new Term("text", "brow")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text", "b")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text", "br")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text", "bro")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text", "browni")), BooleanClause.Occur.SHOULD)
|
||||
.add(new TermQuery(new Term("text", "browser")), BooleanClause.Occur.SHOULD)
|
||||
.build();
|
||||
Analyzer analyzer = CustomAnalyzer.builder()
|
||||
.withTokenizer(EdgeNGramTokenizerFactory.class, "minGramSize", "1", "maxGramSize", "7")
|
||||
.build();
|
||||
assertHighlightOneDoc("text", inputs,
|
||||
analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs);
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user