Fix invalid break iterator highlighting on keyword field (#49566)

By default the unified highlighter splits the input into passages using
a sentence break iterator. However we don't check if the field is tokenized
or not so `keyword` field also applies the break iterator even though they can
only match on the entire content. This means that by default we'll split the
content of a `keyword` field on sentence break if the requested number of fragments
is set to a value different from 0 (defaults to 5). This commit changes this behavior
to ignore the break iterator on non-tokenized fields (keyword) in order to always
highlight the entire values. The number of requested fragments controls the number of
matched values that are returned, but the boundary_scanner_type is now ignored.
Note that this is the behavior in 6.x, but some refactoring of Lucene's highlighter
exposed this bug in Elasticsearch 7.x.
This commit is contained in:
Jim Ferenczi 2019-12-04 11:13:32 +01:00 committed by jimczi
parent 1bc3e69fa3
commit 691421f287
2 changed files with 34 additions and 3 deletions

View File

@ -67,7 +67,7 @@ public class UnifiedHighlighter implements Highlighter {
final int maxAnalyzedOffset = context.getIndexSettings().getHighlightMaxAnalyzedOffset(); final int maxAnalyzedOffset = context.getIndexSettings().getHighlightMaxAnalyzedOffset();
List<Snippet> snippets = new ArrayList<>(); List<Snippet> snippets = new ArrayList<>();
int numberOfFragments; int numberOfFragments = field.fieldOptions().numberOfFragments();
try { try {
final Analyzer analyzer = getAnalyzer(context.getMapperService().documentMapper(hitContext.hit().getType()), final Analyzer analyzer = getAnalyzer(context.getMapperService().documentMapper(hitContext.hit().getType()),
@ -90,14 +90,16 @@ public class UnifiedHighlighter implements Highlighter {
"This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
"] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!"); "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
} }
if (field.fieldOptions().numberOfFragments() == 0) { if (numberOfFragments == 0
// non-tokenized fields should not use any break iterator (ignore boundaryScannerType)
|| fieldType.tokenized() == false) {
// we use a control char to separate values, which is the only char that the custom break iterator // we use a control char to separate values, which is the only char that the custom break iterator
// breaks the text on, so we don't lose the distinction between the different values of a field and we // breaks the text on, so we don't lose the distinction between the different values of a field and we
// get back a snippet per value // get back a snippet per value
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize()); field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value numberOfFragments = numberOfFragments == 0 ? fieldValues.size() : numberOfFragments;
} else { } else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
BreakIterator bi = getBreakIterator(field); BreakIterator bi = getBreakIterator(field);

View File

@ -121,6 +121,35 @@ public class HighlighterSearchIT extends ESIntegTestCase {
return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class, MockAnalysisPlugin.class); return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class, MockAnalysisPlugin.class);
} }
public void testHighlightingWithKeywordIgnoreBoundaryScanner() throws IOException {
    // Map "tags" as a keyword field: highlighting a non-tokenized field must
    // return whole values, never sentence-split fragments.
    XContentBuilder mapping = jsonBuilder()
        .startObject()
            .startObject("type")
                .startObject("properties")
                    .startObject("tags")
                        .field("type", "keyword")
                    .endObject()
                .endObject()
            .endObject()
        .endObject();
    assertAcked(prepareCreate("test").addMapping("type", mapping));

    // Two docs with multi-valued tags; doc 1 matches "foo bar" three times, doc 2 once.
    client().prepareIndex("test").setId("1")
        .setSource(jsonBuilder().startObject().array("tags", "foo bar", "foo bar", "foo bar", "foo baz").endObject())
        .get();
    client().prepareIndex("test").setId("2")
        .setSource(jsonBuilder().startObject().array("tags", "foo baz", "foo baz", "foo baz", "foo bar").endObject())
        .get();
    refresh();

    // Regardless of the configured boundary scanner type, the highlighter must
    // emit one snippet per matched value, covering the entire value.
    for (BoundaryScannerType scanner : BoundaryScannerType.values()) {
        HighlightBuilder highlight = new HighlightBuilder()
            .field(new Field("tags"))
            .numOfFragments(2)
            .boundaryScannerType(scanner);
        SearchResponse search = client().prepareSearch()
            .setQuery(matchQuery("tags", "foo bar"))
            .highlighter(highlight)
            .get();
        assertHighlight(search, 0, "tags", 0, 2, equalTo("<em>foo bar</em>"));
        assertHighlight(search, 0, "tags", 1, 2, equalTo("<em>foo bar</em>"));
        assertHighlight(search, 1, "tags", 0, 1, equalTo("<em>foo bar</em>"));
    }
}
public void testHighlightingWithStoredKeyword() throws IOException { public void testHighlightingWithStoredKeyword() throws IOException {
XContentBuilder mappings = jsonBuilder(); XContentBuilder mappings = jsonBuilder();
mappings.startObject(); mappings.startObject();