Kibana often highlights *everything* like this:

```
POST /_search
{
  "query": ...,
  "size": 500,
  "highlight": {
    "fields": {
      "*": { ... }
    }
  }
}
```

This can get slow when there are hundreds of mapped fields. In a quick, unscientific local test, turning this on took a request from 20ms to 150ms with 100 mapped fields. I've seen clusters with 2000 fields where simple searches go from 500ms to 1500ms just by enabling this sort of highlighting. That happens even when the query is just a `range` and all of the fields are numeric, so highlighting can't produce anything.

This speeds up the `unified` highlighter in this case in a few ways:
1. Build the highlighting infrastructure once per field rather than once per document per field. This cuts out a *ton* of work analyzing the query over and over and over again.
2. Bail out of the highlighter before loading values if we can't produce any results.

Combined, these take that local 150ms case down to 65ms. This is unlikely to be very useful when there are only a few fetched docs and only a few fields, but we often end up with many fields and many fetched docs.
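For orientation before reading the diff, here is a minimal, hypothetical sketch of the two optimizations. `PerFieldHighlighter`, `canMatch`, and `queryTargetsText` are illustrative stand-ins, not the real Elasticsearch classes; the actual change caches a `CustomUnifiedHighlighter` per field in `hitContext.cache()` and bails out when the offset strategy is `NoOpOffsetStrategy` and `noMatchSize == 0`, as shown in the diff below.

```java
import java.util.HashMap;
import java.util.Map;
import java.util.function.IntFunction;

/**
 * Hypothetical, stripped-down model of the two optimizations.
 */
public class HighlightCacheSketch {
    /** Stand-in for CustomUnifiedHighlighter: expensive to build because it analyzes the query. */
    static final class PerFieldHighlighter {
        final boolean canMatch; // false when e.g. a range query targets a numeric field

        PerFieldHighlighter(boolean canMatch) {
            this.canMatch = canMatch;
        }
    }

    private static final String[] EMPTY = new String[0];

    // Built once per search request, shared by every fetched document.
    private final Map<String, PerFieldHighlighter> cache = new HashMap<>();

    public String[] highlight(String field, int docId, IntFunction<String> loadValue) {
        // 1. Build the per-field machinery once for the whole fetch phase,
        //    not once per document per field.
        PerFieldHighlighter highlighter =
            cache.computeIfAbsent(field, f -> new PerFieldHighlighter(queryTargetsText(f)));
        // 2. Bail out before loading any (possibly large) field values when
        //    the query can't produce matches for this field.
        if (highlighter.canMatch == false) {
            return EMPTY;
        }
        String value = loadValue.apply(docId); // only loaded when it can matter
        return new String[] { "<em>" + value + "</em>" }; // placeholder for real snippeting
    }

    private boolean queryTargetsText(String field) {
        return field.startsWith("text"); // toy stand-in for real query/field analysis
    }
}
```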
parent 28fd4a2ae8
commit b8e9a7125f
@@ -46,7 +46,6 @@ import org.elasticsearch.index.mapper.TextFieldMapper;
 import org.elasticsearch.index.mapper.ValueFetcher;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken;
 import org.elasticsearch.index.similarity.SimilarityProvider;
-import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;

 import java.io.IOException;
 import java.io.Reader;
@@ -299,11 +298,11 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
     // original markup form in order to inject annotations.
     public static final class AnnotatedHighlighterAnalyzer extends AnalyzerWrapper {
         private final Analyzer delegate;
-        private final HitContext hitContext;
-        public AnnotatedHighlighterAnalyzer(Analyzer delegate, HitContext hitContext){
+        private AnnotatedText[] annotations;
+
+        public AnnotatedHighlighterAnalyzer(Analyzer delegate){
             super(delegate.getReuseStrategy());
             this.delegate = delegate;
-            this.hitContext = hitContext;
         }

         @Override
@@ -311,10 +310,13 @@ public class AnnotatedTextFieldMapper extends FieldMapper {
             return delegate;
         }

+        public void setAnnotations(AnnotatedText[] annotations) {
+            this.annotations = annotations;
+        }
+
         @Override
         protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
             AnnotationsInjector injector = new AnnotationsInjector(components.getTokenStream());
-            AnnotatedText[] annotations = (AnnotatedText[]) hitContext.cache().get(AnnotatedText.class.getName());
             AtomicInteger readerNum = new AtomicInteger(0);
             return new TokenStreamComponents(r -> {
                 String plainText = readToString(r);
@@ -44,8 +44,11 @@ public class AnnotatedPassageFormatter extends PassageFormatter {
     private final Encoder encoder;
     AnnotatedText[] annotations;

-    public AnnotatedPassageFormatter(AnnotatedText[] annotations, Encoder encoder) {
+    public AnnotatedPassageFormatter(Encoder encoder) {
         this.encoder = encoder;
     }

+    void setAnnotations(AnnotatedText[] annotations) {
+        this.annotations = annotations;
+    }
+
@@ -21,6 +21,7 @@ package org.elasticsearch.search.fetch.subphase.highlight;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.search.highlight.Encoder;
+import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
 import org.apache.lucene.search.uhighlight.PassageFormatter;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
@@ -37,39 +38,37 @@ public class AnnotatedTextHighlighter extends UnifiedHighlighter {

     public static final String NAME = "annotated";

-    @Override
-    protected Analyzer getAnalyzer(DocumentMapper docMapper, HitContext hitContext) {
-        return new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper, hitContext), hitContext);
-    }
-
     // Convert the marked-up values held on-disk to plain-text versions for highlighting
     @Override
-    protected List<Object> loadFieldValues(MappedFieldType fieldType,
-                                           Field field,
-                                           HitContext hitContext,
-                                           boolean forceSource) throws IOException {
-        List<Object> fieldValues = super.loadFieldValues(fieldType, field, hitContext, forceSource);
-        String[] fieldValuesAsString = fieldValues.toArray(new String[fieldValues.size()]);
+    protected List<Object> loadFieldValues(
+        CustomUnifiedHighlighter highlighter,
+        MappedFieldType fieldType,
+        Field field,
+        HitContext hitContext,
+        boolean forceSource
+    ) throws IOException {
+        List<Object> fieldValues = super.loadFieldValues(highlighter, fieldType, field, hitContext, forceSource);

-        AnnotatedText[] annotations = new AnnotatedText[fieldValuesAsString.length];
-        for (int i = 0; i < fieldValuesAsString.length; i++) {
-            annotations[i] = AnnotatedText.parse(fieldValuesAsString[i]);
+        List<Object> strings = new ArrayList<>(fieldValues.size());
+        AnnotatedText[] annotations = new AnnotatedText[fieldValues.size()];
+        for (int i = 0; i < fieldValues.size(); i++) {
+            annotations[i] = AnnotatedText.parse(fieldValues.get(i).toString());
+            strings.add(annotations[i].textMinusMarkup);
         }
-        // Store the annotations in the hitContext
-        hitContext.cache().put(AnnotatedText.class.getName(), annotations);
+        // Store the annotations in the formatter and analyzer
+        ((AnnotatedPassageFormatter) highlighter.getFormatter()).setAnnotations(annotations);
+        ((AnnotatedHighlighterAnalyzer) highlighter.getIndexAnalyzer()).setAnnotations(annotations);
+        return strings;
+    }

-        ArrayList<Object> result = new ArrayList<>(annotations.length);
-        for (int i = 0; i < annotations.length; i++) {
-            result.add(annotations[i].textMinusMarkup);
-        }
-        return result;
+    @Override
+    protected Analyzer getAnalyzer(DocumentMapper docMapper) {
+        return new AnnotatedHighlighterAnalyzer(super.getAnalyzer(docMapper));
     }

     @Override
     protected PassageFormatter getPassageFormatter(HitContext hitContext, SearchHighlightContext.Field field, Encoder encoder) {
-        // Retrieve the annotations from the hitContext
-        AnnotatedText[] annotations = (AnnotatedText[]) hitContext.cache().get(AnnotatedText.class.getName());
-        return new AnnotatedPassageFormatter(annotations, encoder);
+        return new AnnotatedPassageFormatter(encoder);
     }

 }
@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.search.highlight;
+package org.elasticsearch.search.fetch.subphase.highlight;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -28,7 +28,6 @@ import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.IndexSearcher;
@@ -48,14 +47,11 @@ import org.elasticsearch.common.Strings;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedHighlighterAnalyzer;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText;
 import org.elasticsearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotationAnalyzerWrapper;
-import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
-import org.elasticsearch.search.fetch.subphase.highlight.AnnotatedPassageFormatter;
 import org.elasticsearch.test.ESTestCase;

 import java.net.URLEncoder;
 import java.text.BreakIterator;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.Locale;

 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@@ -70,7 +66,6 @@ public class AnnotatedTextHighlighterTests extends ESTestCase {

         // Annotated fields wrap the usual analyzer with one that injects extra tokens
         Analyzer wrapperAnalyzer = new AnnotationAnalyzerWrapper(new StandardAnalyzer());
-
         Directory dir = newDirectory();
         IndexWriterConfig iwc = newIndexWriterConfig(wrapperAnalyzer);
         iwc.setMergePolicy(newTieredMergePolicy(random()));
@@ -93,17 +88,14 @@ public class AnnotatedTextHighlighterTests extends ESTestCase {
         IndexSearcher searcher = newSearcher(reader);
         iw.close();

-        LeafReaderContext context = searcher.getIndexReader().leaves().get(0);
-        HitContext mockHitContext = new HitContext(null, context, 0, null, new HashMap<>());
-        AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer, mockHitContext);
-
         AnnotatedText[] annotations = new AnnotatedText[markedUpInputs.length];
         for (int i = 0; i < markedUpInputs.length; i++) {
             annotations[i] = AnnotatedText.parse(markedUpInputs[i]);
         }
-        mockHitContext.cache().put(AnnotatedText.class.getName(), annotations);
-
-        AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(annotations,new DefaultEncoder());
+        AnnotatedHighlighterAnalyzer hiliteAnalyzer = new AnnotatedHighlighterAnalyzer(wrapperAnalyzer);
+        hiliteAnalyzer.setAnnotations(annotations);
+        AnnotatedPassageFormatter passageFormatter = new AnnotatedPassageFormatter(new DefaultEncoder());
+        passageFormatter.setAnnotations(annotations);

         ArrayList<Object> plainTextForHighlighter = new ArrayList<>(annotations.length);
         for (int i = 0; i < annotations.length; i++) {
@@ -113,13 +105,24 @@ public class AnnotatedTextHighlighterTests extends ESTestCase {
         TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
         assertThat(topDocs.totalHits.value, equalTo(1L));
         String rawValue = Strings.collectionToDelimitedString(plainTextForHighlighter, String.valueOf(MULTIVAL_SEP_CHAR));

-        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, hiliteAnalyzer, null,
-            passageFormatter, locale,
-            breakIterator, rawValue, noMatchSize);
-        highlighter.setFieldMatcher((name) -> "text".equals(name));
-        final Snippet[] snippets =
-            highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
+        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
+            searcher,
+            hiliteAnalyzer,
+            null,
+            passageFormatter,
+            locale,
+            breakIterator,
+            "index",
+            "text",
+            query,
+            noMatchSize,
+            expectedPassages.length,
+            name -> "text".equals(name),
+            Integer.MAX_VALUE,
+            Integer.MAX_VALUE
+        );
+        final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
         assertEquals(expectedPassages.length, snippets.length);
         for (int i = 0; i < snippets.length; i++) {
             assertEquals(expectedPassages[i], snippets[i].getText());
@@ -19,13 +19,15 @@

 package org.apache.lucene.search.uhighlight;

+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.util.BytesRef;
+
+import java.io.IOException;
 import java.text.BreakIterator;
+import java.util.Locale;
+import java.util.PriorityQueue;
 import java.util.Arrays;
 import java.util.Comparator;
-import java.io.IOException;
-import org.apache.lucene.util.BytesRef;
-import java.util.Locale;
-import java.util.PriorityQueue;

 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@@ -38,17 +40,27 @@ class CustomFieldHighlighter extends FieldHighlighter {

     private final Locale breakIteratorLocale;
     private final int noMatchSize;
-    private final String fieldValue;
+    private String fieldValue;

     CustomFieldHighlighter(String field, FieldOffsetStrategy fieldOffsetStrategy,
             Locale breakIteratorLocale, BreakIterator breakIterator,
             PassageScorer passageScorer, int maxPassages, int maxNoHighlightPassages,
-            PassageFormatter passageFormatter, int noMatchSize, String fieldValue) {
+            PassageFormatter passageFormatter, int noMatchSize) {
         super(field, fieldOffsetStrategy, breakIterator, passageScorer, maxPassages,
             maxNoHighlightPassages, passageFormatter);
         this.breakIteratorLocale = breakIteratorLocale;
         this.noMatchSize = noMatchSize;
-        this.fieldValue = fieldValue;
     }

     @Override
+    public Object highlightFieldForDoc(LeafReader reader, int docId, String content) throws IOException {
+        this.fieldValue = content;
+        try {
+            return super.highlightFieldForDoc(reader, docId, content);
+        } finally {
+            // Clear the reference to the field value in case it is large
+            fieldValue = null;
+        }
+    }
+
+    @Override
@@ -20,8 +20,8 @@
 package org.apache.lucene.search.uhighlight;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
@@ -31,16 +31,16 @@ import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.util.BytesRef;
+import org.elasticsearch.common.CheckedSupplier;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
+import org.elasticsearch.index.IndexSettings;

 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.List;
 import java.util.Locale;
-import java.util.Map;
 import java.util.Set;
+import java.util.function.Predicate;
@@ -57,25 +57,36 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];

     private final OffsetSource offsetSource;
-    private final String fieldValue;
     private final PassageFormatter passageFormatter;
     private final BreakIterator breakIterator;
+    private final String index;
+    private final String field;
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
+    private final FieldHighlighter fieldHighlighter;
+    private final int keywordIgnoreAbove;
+    private final int maxAnalyzedOffset;

     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
      *
      * @param analyzer the analyzer used for the field at index time, used for multi term queries internally.
-     * @param offsetSource the {@link OffsetSource} to used for offsets retrieval.
      * @param passageFormatter our own {@link CustomPassageFormatter}
      *                    which generates snippets in forms of {@link Snippet} objects.
+     * @param offsetSource the {@link OffsetSource} to used for offsets retrieval.
      * @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
      *                    If null {@link Locale#ROOT} is used.
      * @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
      *                    If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
-     * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
+     * @param index the index we're highlighting, mostly used for error messages
+     * @param field the name of the field we're highlighting
+     * @param query the query we're highlighting
      * @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
+     * @param maxPassages the maximum number of passes to highlight
+     * @param fieldMatcher decides which terms should be highlighted
+     * @param keywordIgnoreAbove if the field's value is longer than this we'll skip it
+     * @param maxAnalyzedOffset if the field is more than this long we'll refuse to use the ANALYZED
+     *                          offset source for it because it'd be super slow
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,
@@ -83,40 +94,62 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
                                     PassageFormatter passageFormatter,
                                     @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
-                                    String fieldValue,
-                                    int noMatchSize) {
+                                    String index, String field, Query query,
+                                    int noMatchSize,
+                                    int maxPassages,
+                                    Predicate<String> fieldMatcher,
+                                    int keywordIgnoreAbove,
+                                    int maxAnalyzedOffset) throws IOException {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;
         this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
         this.passageFormatter = passageFormatter;
-        this.fieldValue = fieldValue;
+        this.index = index;
+        this.field = field;
         this.noMatchSize = noMatchSize;
+        this.setFieldMatcher(fieldMatcher);
+        this.keywordIgnoreAbove = keywordIgnoreAbove;
+        this.maxAnalyzedOffset = maxAnalyzedOffset;
+        fieldHighlighter = getFieldHighlighter(field, query, extractTerms(query), maxPassages);
     }

     /**
-     * Highlights terms extracted from the provided query within the content of the provided field name
+     * Highlights the field value.
      */
-    public Snippet[] highlightField(String field, Query query, int docId, int maxPassages) throws IOException {
-        Map<String, Object[]> fieldsAsObjects = super.highlightFieldsAsObjects(new String[]{field}, query,
-            new int[]{docId}, new int[]{maxPassages});
-        Object[] snippetObjects = fieldsAsObjects.get(field);
-        if (snippetObjects != null) {
-            //one single document at a time
-            assert snippetObjects.length == 1;
-            Object snippetObject = snippetObjects[0];
-            if (snippetObject != null && snippetObject instanceof Snippet[]) {
-                return (Snippet[]) snippetObject;
-            }
+    public Snippet[] highlightField(LeafReader reader, int docId, CheckedSupplier<String, IOException> loadFieldValue) throws IOException {
+        if (fieldHighlighter.fieldOffsetStrategy == NoOpOffsetStrategy.INSTANCE && noMatchSize == 0) {
+            // If the query is such that there can't possibly be any matches then skip doing *everything*
+            return EMPTY_SNIPPET;
         }
-        return EMPTY_SNIPPET;
-    }
-
-    @Override
-    protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
-                                                   int cacheCharsThreshold) throws IOException {
-        // we only highlight one field, one document at a time
-        return Collections.singletonList(new String[]{fieldValue});
+        String fieldValue = loadFieldValue.get();
+        if (fieldValue == null) {
+            return null;
+        }
+        int fieldValueLength = fieldValue.length();
+        if (fieldValueLength > keywordIgnoreAbove) {
+            return null; // skip highlighting keyword terms that were ignored during indexing
+        }
+        if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {
+            throw new IllegalArgumentException(
+                "The length of ["
+                    + field
+                    + "] field of ["
+                    + docId
+                    + "] doc of ["
+                    + index
+                    + "] index "
+                    + "has exceeded ["
+                    + maxAnalyzedOffset
+                    + "] - maximum allowed to be analyzed for highlighting. "
+                    + "This maximum can be set by changing the ["
+                    + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey()
+                    + "] index level setting. "
+                    + "For large texts, indexing with offsets or term vectors is recommended!"
+            );
+        }
+        Snippet[] result = (Snippet[]) fieldHighlighter.highlightFieldForDoc(reader, docId, fieldValue);
+        return result == null ? EMPTY_SNIPPET : result;
     }

     @Override
@@ -124,6 +157,10 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
         return breakIterator;
     }

+    public PassageFormatter getFormatter() {
+        return passageFormatter;
+    }
+
     @Override
     protected PassageFormatter getFormatter(String field) {
         return passageFormatter;
@@ -142,7 +179,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
             UnifiedHighlighter.MULTIVAL_SEP_CHAR);
         FieldOffsetStrategy strategy = getOffsetStrategy(offsetSource, components);
         return new CustomFieldHighlighter(field, strategy, breakIteratorLocale, breakIterator,
-            getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize, fieldValue);
+            getScorer(field), maxPassages, (noMatchSize > 0 ? 1 : 0), getFormatter(field), noMatchSize);
     }

     @Override
@@ -30,15 +30,14 @@ import org.apache.lucene.search.uhighlight.Snippet;
 import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CollectionUtil;
+import org.elasticsearch.common.CheckedSupplier;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.text.Text;
-import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.IdFieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.TextSearchInfo;
-import org.elasticsearch.index.query.QueryShardContext;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
 import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
@@ -46,8 +45,11 @@ import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
+import java.util.function.Predicate;
 import java.util.stream.Collectors;

 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@@ -60,85 +62,97 @@ public class UnifiedHighlighter implements Highlighter {

     @Override
     public HighlightField highlight(FieldHighlightContext fieldContext) {
+        @SuppressWarnings("unchecked")
+        Map<String, CustomUnifiedHighlighter> cache = (Map<String, CustomUnifiedHighlighter>) fieldContext.hitContext.cache()
+            .computeIfAbsent(UnifiedHighlighter.class.getName(), k -> new HashMap<>());
+        CustomUnifiedHighlighter highlighter = (CustomUnifiedHighlighter) cache.computeIfAbsent(fieldContext.fieldName, f -> {
+            Encoder encoder = fieldContext.field.fieldOptions().encoder().equals("html")
+                ? HighlightUtils.Encoders.HTML
+                : HighlightUtils.Encoders.DEFAULT;
+            int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();
+            int keywordIgnoreAbove = Integer.MAX_VALUE;
+            if (fieldContext.fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
+                KeywordFieldMapper mapper = (KeywordFieldMapper) fieldContext.context.getMapperService().documentMapper()
+                    .mappers().getMapper(fieldContext.fieldName);
+                keywordIgnoreAbove = mapper.ignoreAbove();
+            }
+            int numberOfFragments = fieldContext.field.fieldOptions().numberOfFragments();
+            Analyzer analyzer = getAnalyzer(fieldContext.context.getMapperService().documentMapper());
+            PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
+            IndexSearcher searcher = fieldContext.context.searcher();
+            OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);
+            BreakIterator breakIterator;
+            int higlighterNumberOfFragments;
+            if (numberOfFragments == 0
+                // non-tokenized fields should not use any break iterator (ignore boundaryScannerType)
+                || fieldContext.fieldType.getTextSearchInfo().isTokenized() == false) {
+                /*
+                 * We use a control char to separate values, which is the
+                 * only char that the custom break iterator breaks the text
+                 * on, so we don't lose the distinction between the different
+                 * values of a field and we get back a snippet per value
+                 */
+                breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
+                higlighterNumberOfFragments = numberOfFragments == 0 ? Integer.MAX_VALUE - 1 : numberOfFragments;
+            } else {
+                //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
+                breakIterator = getBreakIterator(fieldContext.field);
+                higlighterNumberOfFragments = numberOfFragments;
+            }
+            try {
+                return new CustomUnifiedHighlighter(
+                    searcher,
+                    analyzer,
+                    offsetSource,
+                    passageFormatter,
+                    fieldContext.field.fieldOptions().boundaryScannerLocale(),
+                    breakIterator,
+                    fieldContext.context.getFullyQualifiedIndex().getName(),
+                    fieldContext.fieldName,
+                    fieldContext.query,
+                    fieldContext.field.fieldOptions().noMatchSize(),
+                    higlighterNumberOfFragments,
+                    fieldMatcher(fieldContext),
+                    keywordIgnoreAbove,
+                    maxAnalyzedOffset
+                );
+            } catch (IOException e) {
+                throw new FetchPhaseExecutionException(fieldContext.shardTarget,
+                    "Failed to highlight field [" + fieldContext.fieldName + "]", e);
+            }
+        });
         MappedFieldType fieldType = fieldContext.fieldType;
         SearchHighlightContext.Field field = fieldContext.field;
-        QueryShardContext context = fieldContext.context;
         FetchSubPhase.HitContext hitContext = fieldContext.hitContext;
-        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
-        final int maxAnalyzedOffset = context.getIndexSettings().getHighlightMaxAnalyzedOffset();
-        Integer keywordIgnoreAbove = null;
-        if (fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
-            KeywordFieldMapper mapper = (KeywordFieldMapper) context.getMapperService().documentMapper()
-                .mappers().getMapper(fieldContext.fieldName);
-            keywordIgnoreAbove = mapper.ignoreAbove();
-        }
-
-        List<Snippet> snippets = new ArrayList<>();
-        int numberOfFragments = field.fieldOptions().numberOfFragments();
-        try {
-            final Analyzer analyzer = getAnalyzer(context.getMapperService().documentMapper(hitContext.hit().getType()),
-                hitContext);
-            List<Object> fieldValues = loadFieldValues(fieldType, field, hitContext, fieldContext.forceSource);
+        CheckedSupplier<String, IOException> loadFieldValues = () -> {
+            List<Object> fieldValues = loadFieldValues(highlighter, fieldType, field, hitContext, fieldContext.forceSource);
             if (fieldValues.size() == 0) {
                 return null;
             }
-            final PassageFormatter passageFormatter = getPassageFormatter(hitContext, field, encoder);
-            final IndexSearcher searcher = new IndexSearcher(hitContext.reader());
-            final CustomUnifiedHighlighter highlighter;
-            final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
-            final OffsetSource offsetSource = getOffsetSource(fieldType);
-            int fieldValueLength = fieldValue.length();
-            if (keywordIgnoreAbove != null && fieldValueLength > keywordIgnoreAbove) {
-                return null; // skip highlighting keyword terms that were ignored during indexing
-            }
-            if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValueLength > maxAnalyzedOffset)) {
-                throw new IllegalArgumentException(
-                    "The length of [" + fieldContext.fieldName + "] field of [" + hitContext.hit().getId() +
-                        "] doc of [" + context.index().getName() + "] index " + "has exceeded [" +
-                        maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
-                        "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
-                        "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
-            }
-            if (numberOfFragments == 0
-                // non-tokenized fields should not use any break iterator (ignore boundaryScannerType)
-                || fieldType.getTextSearchInfo().isTokenized() == false) {
-                // we use a control char to separate values, which is the only char that the custom break iterator
-                // breaks the text on, so we don't lose the distinction between the different values of a field and we
-                // get back a snippet per value
-                CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
-                highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
-                numberOfFragments = numberOfFragments == 0 ? fieldValues.size() : numberOfFragments;
-            } else {
-                //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
-                BreakIterator bi = getBreakIterator(field);
-                highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                    field.fieldOptions().boundaryScannerLocale(), bi,
-                    fieldValue, field.fieldOptions().noMatchSize());
-                numberOfFragments = field.fieldOptions().numberOfFragments();
-            }
-
-            if (field.fieldOptions().requireFieldMatch()) {
-                final String fieldName = fieldContext.fieldName;
-                highlighter.setFieldMatcher((name) -> fieldName.equals(name));
-            } else {
-                // ignore terms that targets the _id field since they use a different encoding
-                // that is not compatible with utf8
-                highlighter.setFieldMatcher(name -> IdFieldMapper.NAME.equals(name) == false);
-            }
-
-            Snippet[] fieldSnippets = highlighter.highlightField(fieldContext.fieldName,
-                fieldContext.query, hitContext.docId(), numberOfFragments);
-            for (Snippet fieldSnippet : fieldSnippets) {
-                if (Strings.hasText(fieldSnippet.getText())) {
-                    snippets.add(fieldSnippet);
-                }
-            }
+            return mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
+        };
+        Snippet[] fieldSnippets;
+        try {
+            fieldSnippets = highlighter.highlightField(hitContext.reader(), hitContext.docId(), loadFieldValues);
         } catch (IOException e) {
             throw new FetchPhaseExecutionException(fieldContext.shardTarget,
                 "Failed to highlight field [" + fieldContext.fieldName + "]", e);
         }
+
+        if (fieldSnippets == null || fieldSnippets.length == 0) {
+            return null;
+        }
+        List<Snippet> snippets = new ArrayList<>(fieldSnippets.length);
+        for (Snippet fieldSnippet : fieldSnippets) {
+            if (Strings.hasText(fieldSnippet.getText())) {
+                snippets.add(fieldSnippet);
+            }
+        }
+        if (snippets.isEmpty()) {
+            return null;
+        }

         if (field.fieldOptions().scoreOrdered()) {
             //let's sort the snippets by score if needed
             CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore()));
@@ -149,10 +163,7 @@ public class UnifiedHighlighter implements Highlighter {
             fragments[i] = snippets.get(i).getText();
         }

-        if (fragments.length > 0) {
-            return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
-        }
-        return null;
+        return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
     }

     protected PassageFormatter getPassageFormatter(HitContext hitContext, SearchHighlightContext.Field field, Encoder encoder) {
@@ -162,14 +173,17 @@ public class UnifiedHighlighter implements Highlighter {
     }


-    protected Analyzer getAnalyzer(DocumentMapper docMapper, HitContext hitContext) {
+    protected Analyzer getAnalyzer(DocumentMapper docMapper) {
         return docMapper.mappers().indexAnalyzer();
     }

-    protected List<Object> loadFieldValues(MappedFieldType fieldType,
-                                           SearchHighlightContext.Field field,
-                                           FetchSubPhase.HitContext hitContext,
-                                           boolean forceSource) throws IOException {
+    protected List<Object> loadFieldValues(
+        CustomUnifiedHighlighter highlighter,
+        MappedFieldType fieldType,
+        SearchHighlightContext.Field field,
+        FetchSubPhase.HitContext hitContext,
+        boolean forceSource
+    ) throws IOException {
         List<Object> fieldValues = HighlightUtils.loadFieldValues(fieldType, hitContext, forceSource);
         fieldValues = fieldValues.stream()
             .map((s) -> convertFieldValue(fieldType, s))
@@ -226,4 +240,14 @@ public class UnifiedHighlighter implements Highlighter {
         }
         return OffsetSource.ANALYSIS;
     }
+
+    private Predicate<String> fieldMatcher(FieldHighlightContext fieldContext) {
+        if (fieldContext.field.fieldOptions().requireFieldMatch()) {
+            String fieldName = fieldContext.fieldName;
+            return name -> fieldName.equals(name);
+        }
+        // ignore terms that targets the _id field since they use a different encoding
+        // that is not compatible with utf8
+        return name -> IdFieldMapper.NAME.equals(name) == false;
+    }
 }
@@ -79,12 +79,23 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
         TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
         assertThat(topDocs.totalHits.value, equalTo(1L));
         String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
-        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
-            new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
-            breakIterator, rawValue, noMatchSize);
-        highlighter.setFieldMatcher((name) -> "text".equals(name));
-        final Snippet[] snippets =
-            highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
+        CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(
+            searcher,
+            analyzer,
+            null,
+            new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()),
+            locale,
+            breakIterator,
+            "index",
+            "text",
+            query,
+            noMatchSize,
+            expectedPassages.length,
+            name -> "text".equals(name),
+            Integer.MAX_VALUE,
+            Integer.MAX_VALUE
+        );
+        final Snippet[] snippets = highlighter.highlightField(getOnlyLeafReader(reader), topDocs.scoreDocs[0].doc, () -> rawValue);
         assertEquals(snippets.length, expectedPassages.length);
         for (int i = 0; i < snippets.length; i++) {
             assertEquals(snippets[i].getText(), expectedPassages[i]);