Picks offset source for the unified highlighter directly from the es mapping (#25747)

This commit changes how the offset source is picked for each field: it is now derived from the es mapping rather than from the underlying Lucene field infos.
This is necessary for large mappings, where retrieving the field infos can be costly (the Lucene implementation merges the global field infos for each highlighted field in every hit).

Fixes #25699
This commit is contained in:
Jim Ferenczi 2017-07-17 19:10:46 +02:00 committed by GitHub
parent 610ba7e427
commit 41ea8fdcec
3 changed files with 49 additions and 17 deletions

View File

@ -20,6 +20,8 @@
package org.apache.lucene.search.uhighlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.CommonTermsQuery;
import org.apache.lucene.search.DocIdSetIterator;
@ -62,6 +64,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
public static final char MULTIVAL_SEP_CHAR = (char) 0;
private static final Snippet[] EMPTY_SNIPPET = new Snippet[0];
private final OffsetSource offsetSource;
private final String fieldValue;
private final PassageFormatter passageFormatter;
private final BreakIterator breakIterator;
@ -71,24 +74,27 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
*
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally
* @param analyzer the analyzer used for the field at index time, used for multi term queries internally.
* @param passageFormatter our own {@link CustomPassageFormatter}
* which generates snippets in forms of {@link Snippet} objects
* which generates snippets in forms of {@link Snippet} objects.
* @param offsetSource the {@link OffsetSource} to use for offsets retrieval.
* @param breakIteratorLocale the {@link Locale} to use for dividing text into passages.
* If null {@link Locale#ROOT} is used
* If null {@link Locale#ROOT} is used.
* @param breakIterator the {@link BreakIterator} to use for dividing text into passages.
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
OffsetSource offsetSource,
PassageFormatter passageFormatter,
@Nullable Locale breakIteratorLocale,
@Nullable BreakIterator breakIterator,
String fieldValue,
int noMatchSize) {
super(searcher, analyzer);
this.offsetSource = offsetSource;
this.breakIterator = breakIterator;
this.breakIteratorLocale = breakIteratorLocale == null ? Locale.ROOT : breakIteratorLocale;
this.passageFormatter = passageFormatter;
@ -213,4 +219,16 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
return null;
}
}
/**
 * Returns the offset source configured on this highlighter, when there is one.
 * If no offset source was supplied ({@code null}), the decision is delegated
 * to the parent implementation.
 *
 * @param field the field name, forwarded to the parent implementation when no source is forced
 * @return the forced {@link OffsetSource}, or the parent's choice when none was forced
 */
@Override
protected OffsetSource getOffsetSource(String field) {
    return offsetSource != null ? offsetSource : super.getOffsetSource(field);
}
}

View File

@ -19,6 +19,7 @@
package org.elasticsearch.search.fetch.subphase.highlight;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.uhighlight.Snippet;
@ -26,11 +27,13 @@ import org.apache.lucene.search.uhighlight.BoundedBreakIteratorScanner;
import org.apache.lucene.search.uhighlight.CustomPassageFormatter;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.internal.SearchContext;
@ -90,34 +93,35 @@ public class UnifiedHighlighter implements Highlighter {
return obj;
}
}).collect(Collectors.toList());
IndexSearcher searcher = new IndexSearcher(hitContext.reader());
CustomUnifiedHighlighter highlighter;
final IndexSearcher searcher = new IndexSearcher(hitContext.reader());
final CustomUnifiedHighlighter highlighter;
final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType());
if (field.fieldOptions().numberOfFragments() == 0) {
// we use a control char to separate values, which is the only char that the custom break iterator
// breaks the text on, so we don't lose the distinction between the different values of a field and we
// get back a snippet per value
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
highlighter =
new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter,
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue,
field.fieldOptions().noMatchSize());
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource,
mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(),
breakIterator, fieldValue, field.fieldOptions().noMatchSize());
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
BreakIterator bi = getBreakIterator(field);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource,
mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi,
fieldValue, field.fieldOptions().noMatchSize());
numberOfFragments = field.fieldOptions().numberOfFragments();
}
if (field.fieldOptions().requireFieldMatch()) {
final String fieldName = highlighterContext.fieldName;
highlighter.setFieldMatcher((name) -> fieldName.equals(name));
} else {
highlighter.setFieldMatcher((name) -> true);
}
Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName,
highlighterContext.query, hitContext.docId(), numberOfFragments);
for (Snippet fieldSnippet : fieldSnippets) {
@ -213,6 +217,16 @@ public class UnifiedHighlighter implements Highlighter {
return rawValue.substring(0, Math.min(rawValue.length(), Integer.MAX_VALUE - 1));
}
/**
 * Picks the {@link OffsetSource} for a field from its mapping instead of
 * from the Lucene field infos.
 *
 * <ul>
 *   <li>Offsets indexed in the postings: {@code POSTINGS}, or
 *       {@code POSTINGS_WITH_TERM_VECTORS} when term vectors are stored too.</li>
 *   <li>Otherwise, term vectors stored with offsets: {@code TERM_VECTORS}.</li>
 *   <li>Otherwise: {@code ANALYSIS}.</li>
 * </ul>
 *
 * @param fieldType the mapped field type of the field to highlight
 * @return the offset source matching how the field is indexed
 */
private OffsetSource getOffsetSource(MappedFieldType fieldType) {
    if (fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
        // Postings already provide the offsets, so the mere presence of term vectors
        // (with or without offsets) is enough to select the combined source.
        return fieldType.storeTermVectors() ? OffsetSource.POSTINGS_WITH_TERM_VECTORS : OffsetSource.POSTINGS;
    }
    // Term vectors are only usable as the offset source when they actually store offsets;
    // term vectors without offsets must fall through to re-analysis.
    if (fieldType.storeTermVectorOffsets()) {
        return OffsetSource.TERM_VECTORS;
    }
    return OffsetSource.ANALYSIS;
}
private static class HighlighterEntry {
Map<FieldMapper, MapperHighlighterEntry> mappers = new HashMap<>();

View File

@ -77,9 +77,9 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1L));
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue,
noMatchSize);
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
breakIterator, rawValue, noMatchSize);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets =
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);