From b8e9a7125f3967e94ba1b18849d8e2efb01c1a27 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Tue, 8 Sep 2020 15:49:50 -0400 Subject: [PATCH] Speed up empty highlighting many fields (backport of #61860) (#62122) Kibana often highlights *everything* like this: ``` POST /_search { "query": ..., "size": 500, "highlight": { "fields": { "*": { ... } } } } ``` This can get slow when there are hundreds of mapped fields. I tested this locally and unscientifically and it took a request from 20ms to 150ms when there are 100 fields. I've seen clusters with 2000 fields where simple searches go from 500ms to 1500ms just by turning on this sort of highlighting. Even when the query is just a `range` and the fields are all numbers and stuff, so it won't highlight anything. This speeds up the `unified` highlighter in this case in a few ways: 1. Build the highlighting infrastructure once per field rather than once per document per field. This cuts out a *ton* of work analyzing the query over and over and over again. 2. Bail out of the highlighter before loading values if we can't produce any results. Combined these take that local 150ms case down to 65ms. This is unlikely to be really useful when there are only a few fetched docs and only a few fields, but we often end up having many fields with many fetched docs. 
--- .../AnnotatedTextFieldMapper.java | 12 +- .../highlight/AnnotatedPassageFormatter.java | 5 +- .../highlight/AnnotatedTextHighlighter.java | 47 +++-- .../AnnotatedTextHighlighterTests.java | 41 ++-- .../uhighlight/CustomFieldHighlighter.java | 26 ++- .../uhighlight/CustomUnifiedHighlighter.java | 97 +++++++--- .../highlight/UnifiedHighlighter.java | 180 ++++++++++-------- .../CustomUnifiedHighlighterTests.java | 23 ++- 8 files changed, 261 insertions(+), 170 deletions(-) rename plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/{ => fetch/subphase}/highlight/AnnotatedTextHighlighterTests.java (91%) diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java index 461d4e507c8..dfc9ac93f87 100644 --- a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/index/mapper/annotatedtext/AnnotatedTextFieldMapper.java @@ -46,7 +46,6 @@ import org.elasticsearch.index.mapper.TextFieldMapper; import org.elasticsearch.index.mapper.ValueFetcher; import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken; import org.elasticsearch.index.similarity.SimilarityProvider; -import org.elasticsearch.search.fetch.FetchSubPhase.HitContext; import java.io.IOException; import java.io.Reader; @@ -299,11 +298,11 @@ public class AnnotatedTextFieldMapper extends FieldMapper { // original markup form in order to inject annotations. 
public static final class AnnotatedHighlighterAnalyzer extends AnalyzerWrapper { private final Analyzer delegate; - private final HitContext hitContext; - public AnnotatedHighlighterAnalyzer(Analyzer delegate, HitContext hitContext){ + private AnnotatedText[] annotations; + + public AnnotatedHighlighterAnalyzer(Analyzer delegate){ super(delegate.getReuseStrategy()); this.delegate = delegate; - this.hitContext = hitContext; } @Override @@ -311,10 +310,13 @@ public class AnnotatedTextFieldMapper extends FieldMapper { return delegate; } + public void setAnnotations(AnnotatedText[] annotations) { + this.annotations = annotations; + } + @Override protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) { AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream()); - AnnotatedText[] annotations = (AnnotatedText[]) hitContext.cache().get(AnnotatedText.class.getName()); AtomicInteger readerNum = new AtomicInteger(0); return new TokenStreamComponents(r -> { String plainText = readToString(r); diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java index 7d360dd0b9b..101743799cb 100644 --- a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedPassageFormatter.java @@ -44,8 +44,11 @@ public class AnnotatedPassageFormatter extends PassageFormatter { private final Encoder encoder; AnnotatedText[] annotations; - public AnnotatedPassageFormatter(AnnotatedText[] annotations, Encoder encoder) { + public AnnotatedPassageFormatter(Encoder encoder) { this.encoder = encoder; + } + + void setAnnotations(AnnotatedText[] annotations) { 
this.annotations = annotations; } diff --git a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java index 83880ec80fe..051803ee99c 100644 --- a/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java +++ b/plugins/mapper-annotated-text/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighter.java @@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.highlight.Encoder; +import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter; import org.apache.lucene.search.uhighlight.PassageFormatter; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.MappedFieldType; @@ -37,39 +38,37 @@ public class AnnotatedTextHighlighter extends UnifiedHighlighter { public static final String NAME = "annotated"; - @Override - protected Analyzer getAnalyzer(DocumentMapper docMapper, HitContext hitContext) { - return new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper, hitContext), hitContext); - } - // Convert the marked-up values held on-disk to plain-text versions for highlighting @Override - protected List loadFieldValues(MappedFieldType fieldType, - Field field, - HitContext hitContext, - boolean forceSource) throws IOException { - List fieldValues = super.loadFieldValues(fieldType, field, hitContext, forceSource); - String[] fieldValuesAsString = fieldValues.toArray(new String[fieldValues.size()]); + protected List loadFieldValues( + CustomUnifiedHighlighter highlighter, + MappedFieldType fieldType, + Field field, + HitContext hitContext, + boolean forceSource + ) throws IOException { + List fieldValues = super.loadFieldValues(highlighter, 
fieldType, field, hitContext, forceSource); - AnnotatedText[] annotations = new AnnotatedText[fieldValuesAsString.length]; - for (int i = 0; i < fieldValuesAsString.length; i++) { - annotations[i] = AnnotatedText.parse(fieldValuesAsString[i]); + List strings = new ArrayList<>(fieldValues.size()); + AnnotatedText[] annotations = new AnnotatedText[fieldValues.size()]; + for (int i = 0; i < fieldValues.size(); i++) { + annotations[i] = AnnotatedText.parse(fieldValues.get(i).toString()); + strings.add(annotations[i].textMinusMarkup); } - // Store the annotations in the hitContext - hitContext.cache().put(AnnotatedText.class.getName(), annotations); + // Store the annotations in the formatter and analyzer + ((AnnotatedPassageFormatter) highlighter.getFormatter()).setAnnotations(annotations); + ((AnnotatedHighlighterAnalyzer) highlighter.getIndexAnalyzer()).setAnnotations(annotations); + return strings; + } - ArrayList result = new ArrayList<>(annotations.length); - for (int i = 0; i < annotations.length; i++) { - result.add(annotations[i].textMinusMarkup); - } - return result; + @Override + protected Analyzer getAnalyzer(DocumentMapper docMapper) { + return new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper)); } @Override protected PassageFormatter getPassageFormatter(HitContext hitContext, SearchHighlightContext.Field field, Encoder encoder) { - // Retrieve the annotations from the hitContext - AnnotatedText[] annotations = (AnnotatedText[]) hitContext.cache().get(AnnotatedText.class.getName()); - return new AnnotatedPassageFormatter(annotations, encoder); + return new AnnotatedPassageFormatter(encoder); } } diff --git a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java similarity index 91% rename from 
plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java rename to plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java index ba1d313fa37..eeeb79aca37 100644 --- a/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/highlight/AnnotatedTextHighlighterTests.java +++ b/plugins/mapper-annotated-text/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/AnnotatedTextHighlighterTests.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.search.highlight; +package org.elasticsearch.search.fetch.subphase.highlight; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -28,7 +28,6 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; @@ -48,14 +47,11 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer; import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText; import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper; -import org.elasticsearch.search.fetch.FetchSubPhase.HitContext; -import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedPassageFormatter; import org.elasticsearch.test.ESTestCase; import java.net.URLEncoder; import java.text.BreakIterator; import java.util.ArrayList; -import java.util.HashMap; import java.util.Locale; import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; @@ -70,7 +66,6 @@ 
public class AnnotatedTextHighlighterTests extends ESTestCase { // Annotated fields wrap the usual analyzer with one that injects extra tokens Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer()); - Directory dir = newDirectory(); IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer); iwc.setMergePolicy(newTieredMergePolicy(random())); @@ -93,17 +88,14 @@ public class AnnotatedTextHighlighterTests extends ESTestCase { IndexSearcher searcher = newSearcher(reader); iw.close(); - LeafReaderContext context = searcher.getIndexReader().leaves().get(0); - HitContext mockHitContext = new HitContext(null, context, 0, null, new HashMap<>()); - AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer, mockHitContext); - AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length]; for (int i = 0; i < markedUpInputs.length; i++) { annotations[i] = AnnotatedText.parse(markedUpInputs[i]); } - mockHitContext.cache().put(AnnotatedText.class.getName(), annotations); - - AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(annotations,new DefaultEncoder()); + AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer); + hiliteAnalyzer.setAnnotations(annotations); + AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder()); + passageFormatter.setAnnotations(annotations); ArrayList plainTextForHighlighter = new ArrayList<>(annotations.length); for (int i = 0; i < annotations.length; i++) { @@ -113,13 +105,24 @@ public class AnnotatedTextHighlighterTests extends ESTestCase { TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER); assertThat(topDocs.totalHits.value, equalTo(1L)); String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR)); - - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, 
hiliteAnalyzer, null, - passageFormatter, locale, - breakIterator, rawValue, noMatchSize); + CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter( + searcher, + hiliteAnalyzer, + null, + passageFormatter, + locale, + breakIterator, + "index", + "text", + query, + noMatchSize, + expectedPassages.length, + name -> "text".equals(name), + Integer.MAX_VALUE, + Integer.MAX_VALUE + ); highlighter.setFieldMatcher((name) -> "text".equals(name)); - final Snippet[] snippets = - highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length); + final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue); assertEquals(expectedPassages.length, snippets.length); for (int i = 0; i < snippets.length; i++) { assertEquals(expectedPassages[i], snippets[i].getText()); diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java index dc47110570e..fa21b2b0736 100644 --- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java +++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomFieldHighlighter.java @@ -19,13 +19,15 @@ package org.apache.lucene.search.uhighlight; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; import java.text.BreakIterator; -import java.util.Locale; -import java.util.PriorityQueue; import java.util.Arrays; import java.util.Comparator; -import java.io.IOException; -import org.apache.lucene.util.BytesRef; +import java.util.Locale; +import java.util.PriorityQueue; import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; @@ -38,17 +40,27 @@ class CustomFieldHighlighter extends FieldHighlighter { private final Locale breakIteratorLocale; private final int noMatchSize; - private final String 
fieldValue; + private String fieldValue; CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy, Locale breakIteratorLocale, BreakIterator breakIterator, PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages, - PassageFormatter passageFormatter, int noMatchSize, String fieldValue) { + PassageFormatter passageFormatter, int noMatchSize) { super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages, maxNoHighlightPassages, passageFormatter); this.breakIteratorLocale = breakIteratorLocale; this.noMatchSize = noMatchSize; - this.fieldValue = fieldValue; + } + + @Override + public Object highlightFieldForDoc(LeafReader reader, int docId, String content) throws IOException { + this.fieldValue = content; + try { + return super.highlightFieldForDoc(reader, docId, content); + } finally { + // Clear the reference to the field value in case it is large + fieldValue = null; + } } @Override diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java index db79122fa3d..3a2693a2d42 100644 --- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java +++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java @@ -20,8 +20,8 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Term; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; @@ -31,16 +31,16 @@ import org.apache.lucene.search.spans.SpanOrQuery; import org.apache.lucene.search.spans.SpanQuery; import org.apache.lucene.search.spans.SpanTermQuery; import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.CheckedSupplier; 
import org.elasticsearch.common.Nullable; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; +import org.elasticsearch.index.IndexSettings; import java.io.IOException; import java.text.BreakIterator; import java.util.Collection; import java.util.Collections; -import java.util.List; import java.util.Locale; -import java.util.Map; import java.util.Set; import java.util.function.Predicate; @@ -57,25 +57,36 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { private static final Snippet[] EMPTY_SNIPPET = new Snippet[0]; private final OffsetSource offsetSource; - private final String fieldValue; private final PassageFormatter passageFormatter; private final BreakIterator breakIterator; + private final String index; + private final String field; private final Locale breakIteratorLocale; private final int noMatchSize; + private final FieldHighlighter fieldHighlighter; + private final int keywordIgnoreAbove; + private final int maxAnalyzedOffset; /** * Creates a new instance of {@link CustomUnifiedHighlighter} * * @param analyzer the analyzer used for the field at index time, used for multi term queries internally. + * @param offsetSource the {@link OffsetSource} to used for offsets retrieval. * @param passageFormatter our own {@link CustomPassageFormatter} * which generates snippets in forms of {@link Snippet} objects. - * @param offsetSource the {@link OffsetSource} to used for offsets retrieval. * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages. * If null {@link Locale#ROOT} is used. * @param breakIterator the {@link BreakIterator} to use for dividing text into passages. * If null {@link BreakIterator#getSentenceInstance(Locale)} is used. - * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR. 
+ * @param index the index we're highlighting, mostly used for error messages + * @param field the name of the field we're highlighting + * @param query the query we're highlighting * @param noMatchSize The size of the text that should be returned when no highlighting can be performed. + * @param maxPassages the maximum number of passes to highlight + * @param fieldMatcher decides which terms should be highlighted + * @param keywordIgnoreAbove if the field's value is longer than this we'll skip it + * @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED + * offset source for it because it'd be super slow */ public CustomUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer, @@ -83,40 +94,62 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { PassageFormatter passageFormatter, @Nullable Locale breakIteratorLocale, @Nullable BreakIterator breakIterator, - String fieldValue, - int noMatchSize) { + String index, String field, Query query, + int noMatchSize, + int maxPassages, + Predicate fieldMatcher, + int keywordIgnoreAbove, + int maxAnalyzedOffset) throws IOException { super(searcher, analyzer); this.offsetSource = offsetSource; this.breakIterator = breakIterator; this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale; this.passageFormatter = passageFormatter; - this.fieldValue = fieldValue; + this.index = index; + this.field = field; this.noMatchSize = noMatchSize; + this.setFieldMatcher(fieldMatcher); + this.keywordIgnoreAbove = keywordIgnoreAbove; + this.maxAnalyzedOffset = maxAnalyzedOffset; + fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages); } /** - * Highlights terms extracted from the provided query within the content of the provided field name + * Highlights the field value. 
*/ - public Snippet[] highlightField(String field, Query query, int docId, int maxPassages) throws IOException { - Map fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query, - new int[]{docId}, new int[]{maxPassages}); - Object[] snippetObjects = fieldsAsObjects.get(field); - if (snippetObjects != null) { - //one single document at a time - assert snippetObjects.length == 1; - Object snippetObject = snippetObjects[0]; - if (snippetObject != null && snippetObject instanceof Snippet[]) { - return (Snippet[]) snippetObject; - } + public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier loadFieldValue) throws IOException { + if (fieldHighlighter.fieldOffsetStrategy == NoOpOffsetStrategy.INSTANCE && noMatchSize == 0) { + // If the query is such that there can't possibly be any matches then skip doing *everything* + return EMPTY_SNIPPET; } - return EMPTY_SNIPPET; - } - - @Override - protected List loadFieldValues(String[] fields, DocIdSetIterator docIter, - int cacheCharsThreshold) throws IOException { - // we only highlight one field, one document at a time - return Collections.singletonList(new String[]{fieldValue}); + String fieldValue = loadFieldValue.get(); + if (fieldValue == null) { + return null; + } + int fieldValueLength = fieldValue.length(); + if (fieldValueLength > keywordIgnoreAbove) { + return null; // skip highlighting keyword terms that were ignored during indexing + } + if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) { + throw new IllegalArgumentException( + "The length of [" + + field + + "] field of [" + + docId + + "] doc of [" + + index + + "] index " + + "has exceeded [" + + maxAnalyzedOffset + + "] - maximum allowed to be analyzed for highlighting. " + + "This maximum can be set by changing the [" + + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + + "] index level setting. " + + "For large texts, indexing with offsets or term vectors is recommended!" 
+ ); + } + Snippet[] result = (Snippet[]) fieldHighlighter.highlightFieldForDoc(reader, docId, fieldValue); + return result == null ? EMPTY_SNIPPET : result; } @Override @@ -124,6 +157,10 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { return breakIterator; } + public PassageFormatter getFormatter() { + return passageFormatter; + } + @Override protected PassageFormatter getFormatter(String field) { return passageFormatter; @@ -142,7 +179,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { UnifiedHighlighter.MULTIVAL_SEP_CHAR); FieldOffsetStrategy strategy = getOffsetStrategy(offsetSource, components); return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator, - getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue); + getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize); } @Override diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java index 8457c9d73f9..aa11898e3e8 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java @@ -30,15 +30,14 @@ import org.apache.lucene.search.uhighlight.Snippet; import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CollectionUtil; +import org.elasticsearch.common.CheckedSupplier; import org.elasticsearch.common.Strings; import org.elasticsearch.common.text.Text; -import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.KeywordFieldMapper; import 
org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.TextSearchInfo; -import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.search.fetch.FetchPhaseExecutionException; import org.elasticsearch.search.fetch.FetchSubPhase; import org.elasticsearch.search.fetch.FetchSubPhase.HitContext; @@ -46,8 +45,11 @@ import org.elasticsearch.search.fetch.FetchSubPhase.HitContext; import java.io.IOException; import java.text.BreakIterator; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Locale; +import java.util.Map; +import java.util.function.Predicate; import java.util.stream.Collectors; import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; @@ -60,85 +62,97 @@ public class UnifiedHighlighter implements Highlighter { @Override public HighlightField highlight(FieldHighlightContext fieldContext) { + @SuppressWarnings("unchecked") + Map cache = (Map) fieldContext.hitContext.cache() + .computeIfAbsent(UnifiedHighlighter.class.getName(), k -> new HashMap<>()); + CustomUnifiedHighlighter highlighter = (CustomUnifiedHighlighter) cache.computeIfAbsent(fieldContext.fieldName, f -> { + Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html") + ? 
HighlightUtils.Encoders.HTML + : HighlightUtils.Encoders.DEFAULT; + int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset(); + int keywordIgnoreAbove = Integer.MAX_VALUE; + if (fieldContext.fieldType instanceof KeywordFieldMapper.KeywordFieldType) { + KeywordFieldMapper mapper = (KeywordFieldMapper) fieldContext.context.getMapperService().documentMapper() + .mappers().getMapper(fieldContext.fieldName); + keywordIgnoreAbove = mapper.ignoreAbove(); + } + int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments(); + Analyzer analyzer = getAnalyzer(fieldContext.context.getMapperService().documentMapper()); + PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder); + IndexSearcher searcher = fieldContext.context.searcher(); + OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType); + BreakIterator breakIterator; + int higlighterNumberOfFragments; + if (numberOfFragments == 0 + // non-tokenized fields should not use any break iterator (ignore boundaryScannerType) + || fieldContext.fieldType.getTextSearchInfo().isTokenized() == false) { + /* + * We use a control char to separate values, which is the + * only char that the custom break iterator breaks the text + * on, so we don't lose the distinction between the different + * values of a field and we get back a snippet per value + */ + breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); + higlighterNumberOfFragments = numberOfFragments == 0 ? 
Integer.MAX_VALUE - 1 : numberOfFragments; + } else { + //using paragraph separator we make sure that each field value holds a discrete passage for highlighting + breakIterator = getBreakIterator(fieldContext.field); + higlighterNumberOfFragments = numberOfFragments; + } + try { + return new CustomUnifiedHighlighter( + searcher, + analyzer, + offsetSource, + passageFormatter, + fieldContext.field.fieldOptions().boundaryScannerLocale(), + breakIterator, + fieldContext.context.getFullyQualifiedIndex().getName(), + fieldContext.fieldName, + fieldContext.query, + fieldContext.field.fieldOptions().noMatchSize(), + higlighterNumberOfFragments, + fieldMatcher(fieldContext), + keywordIgnoreAbove, + maxAnalyzedOffset + ); + } catch (IOException e) { + throw new FetchPhaseExecutionException(fieldContext.shardTarget, + "Failed to highlight field [" + fieldContext.fieldName + "]", e); + } + }); MappedFieldType fieldType = fieldContext.fieldType; SearchHighlightContext.Field field = fieldContext.field; - QueryShardContext context = fieldContext.context; FetchSubPhase.HitContext hitContext = fieldContext.hitContext; - Encoder encoder = field.fieldOptions().encoder().equals("html") ? 
HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT; - final int maxAnalyzedOffset = context.getIndexSettings().getHighlightMaxAnalyzedOffset(); - Integer keywordIgnoreAbove = null; - if (fieldType instanceof KeywordFieldMapper.KeywordFieldType) { - KeywordFieldMapper mapper = (KeywordFieldMapper) context.getMapperService().documentMapper() - .mappers().getMapper(fieldContext.fieldName); - keywordIgnoreAbove = mapper.ignoreAbove(); - } - List snippets = new ArrayList<>(); - int numberOfFragments = field.fieldOptions().numberOfFragments(); - try { - final Analyzer analyzer = getAnalyzer(context.getMapperService().documentMapper(hitContext.hit().getType()), - hitContext); - List fieldValues = loadFieldValues(fieldType, field, hitContext, fieldContext.forceSource); + CheckedSupplier loadFieldValues = () -> { + List fieldValues = loadFieldValues(highlighter, fieldType, field, hitContext, fieldContext.forceSource); if (fieldValues.size() == 0) { return null; } - final PassageFormatter passageFormatter = getPassageFormatter(hitContext, field, encoder); - final IndexSearcher searcher = new IndexSearcher(hitContext.reader()); - final CustomUnifiedHighlighter highlighter; - final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); - final OffsetSource offsetSource = getOffsetSource(fieldType); - int fieldValueLength = fieldValue.length(); - if (keywordIgnoreAbove != null && fieldValueLength > keywordIgnoreAbove) { - return null; // skip highlighting keyword terms that were ignored during indexing - } - if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) { - throw new IllegalArgumentException( - "The length of [" + fieldContext.fieldName + "] field of [" + hitContext.hit().getId() + - "] doc of [" + context.index().getName() + "] index " + "has exceeded [" + - maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. 
" + - "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + - "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!"); - } - if (numberOfFragments == 0 - // non-tokenized fields should not use any break iterator (ignore boundaryScannerType) - || fieldType.getTextSearchInfo().isTokenized() == false) { - // we use a control char to separate values, which is the only char that the custom break iterator - // breaks the text on, so we don't lose the distinction between the different values of a field and we - // get back a snippet per value - CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); - highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, - field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize()); - numberOfFragments = numberOfFragments == 0 ? fieldValues.size() : numberOfFragments; - } else { - //using paragraph separator we make sure that each field value holds a discrete passage for highlighting - BreakIterator bi = getBreakIterator(field); - highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, - field.fieldOptions().boundaryScannerLocale(), bi, - fieldValue, field.fieldOptions().noMatchSize()); - numberOfFragments = field.fieldOptions().numberOfFragments(); - } - - if (field.fieldOptions().requireFieldMatch()) { - final String fieldName = fieldContext.fieldName; - highlighter.setFieldMatcher((name) -> fieldName.equals(name)); - } else { - // ignore terms that targets the _id field since they use a different encoding - // that is not compatible with utf8 - highlighter.setFieldMatcher(name -> IdFieldMapper.NAME.equals(name) == false); - } - - Snippet[] fieldSnippets = highlighter.highlightField(fieldContext.fieldName, - fieldContext.query, hitContext.docId(), numberOfFragments); - for 
(Snippet fieldSnippet : fieldSnippets) { - if (Strings.hasText(fieldSnippet.getText())) { - snippets.add(fieldSnippet); - } - } + return mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); + }; + Snippet[] fieldSnippets; + try { + fieldSnippets = highlighter.highlightField(hitContext.reader(), hitContext.docId(), loadFieldValues); } catch (IOException e) { throw new FetchPhaseExecutionException(fieldContext.shardTarget, "Failed to highlight field [" + fieldContext.fieldName + "]", e); } + if (fieldSnippets == null || fieldSnippets.length == 0) { + return null; + } + List snippets = new ArrayList<>(fieldSnippets.length); + for (Snippet fieldSnippet : fieldSnippets) { + if (Strings.hasText(fieldSnippet.getText())) { + snippets.add(fieldSnippet); + } + } + if (snippets.isEmpty()) { + return null; + } + if (field.fieldOptions().scoreOrdered()) { //let's sort the snippets by score if needed CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore())); @@ -149,10 +163,7 @@ public class UnifiedHighlighter implements Highlighter { fragments[i] = snippets.get(i).getText(); } - if (fragments.length > 0) { - return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments)); - } - return null; + return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments)); } protected PassageFormatter getPassageFormatter(HitContext hitContext, SearchHighlightContext.Field field, Encoder encoder) { @@ -162,14 +173,17 @@ public class UnifiedHighlighter implements Highlighter { } - protected Analyzer getAnalyzer(DocumentMapper docMapper, HitContext hitContext) { + protected Analyzer getAnalyzer(DocumentMapper docMapper) { return docMapper.mappers().indexAnalyzer(); } - protected List loadFieldValues(MappedFieldType fieldType, - SearchHighlightContext.Field field, - FetchSubPhase.HitContext hitContext, - boolean forceSource) throws IOException { + protected List loadFieldValues( + CustomUnifiedHighlighter 
highlighter, + MappedFieldType fieldType, + SearchHighlightContext.Field field, + FetchSubPhase.HitContext hitContext, + boolean forceSource + ) throws IOException { List fieldValues = HighlightUtils.loadFieldValues(fieldType, hitContext, forceSource); fieldValues = fieldValues.stream() .map((s) -> convertFieldValue(fieldType, s)) @@ -226,4 +240,14 @@ public class UnifiedHighlighter implements Highlighter { } return OffsetSource.ANALYSIS; } + + private Predicate fieldMatcher(FieldHighlightContext fieldContext) { + if (fieldContext.field.fieldOptions().requireFieldMatch()) { + String fieldName = fieldContext.fieldName; + return name -> fieldName.equals(name); + } + // ignore terms that targets the _id field since they use a different encoding + // that is not compatible with utf8 + return name -> IdFieldMapper.NAME.equals(name) == false; + } } diff --git a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 6abb67c3ea1..86d5de4db2e 100644 --- a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -79,12 +79,23 @@ public class CustomUnifiedHighlighterTests extends ESTestCase { TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER); assertThat(topDocs.totalHits.value, equalTo(1L)); String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); - CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null, - new CustomPassageFormatter("", "", new DefaultEncoder()), locale, - breakIterator, rawValue, noMatchSize); - highlighter.setFieldMatcher((name) -> "text".equals(name)); - final Snippet[] snippets = - highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length); + 
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter( + searcher, + analyzer, + null, + new CustomPassageFormatter("", "", new DefaultEncoder()), + locale, + breakIterator, + "index", + "text", + query, + noMatchSize, + expectedPassages.length, + name -> "text".equals(name), + Integer.MAX_VALUE, + Integer.MAX_VALUE + ); + final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue); assertEquals(snippets.length, expectedPassages.length); for (int i = 0; i < snippets.length; i++) { assertEquals(snippets[i].getText(), expectedPassages[i]);