Limit analyzed text for highlighting (improvements) (#28808)
Increase the default limit of `index.highlight.max_analyzed_offset` from the previous 10K to 1M. Enhance the error message thrown when this limit is exceeded so that it includes the field name, index name, and doc_id. Relates to https://github.com/elastic/kibana/issues/16764
Parent: f207aacac0
Commit: f53d159aa1
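Because `index.highlight.max_analyzed_offset` is declared with `Property.Dynamic` and `Property.IndexScope` (see the `IndexSettings` hunk below), it can be tuned per index at runtime. A minimal sketch using the 6.x-era transport client; the `client` instance, the `articles` index, and the 2M value are assumptions, not part of the commit:

import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;

// Sketch: raise the highlighting analysis limit for one index at runtime.
// The index name and new value are illustrative.
static void raiseHighlightLimit(Client client) {
    client.admin().indices().prepareUpdateSettings("articles")
        .setSettings(Settings.builder()
            .put("index.highlight.max_analyzed_offset", 2_000_000))
        .get();
}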
@@ -202,7 +202,7 @@ specific index module:
 The maximum number of characters that will be analyzed for a highlight request.
 This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors.
-Defaults to `10000`.
+Defaults to `1000000`.
 
 `index.max_terms_count`::
 
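The documented default can also be overridden at index-creation time rather than dynamically; a hedged sketch reusing the same assumed `client` and a hypothetical index name:

// Sketch: pin the limit explicitly when creating an index instead of
// relying on the (now 1M) default.
client.admin().indices().prepareCreate("articles")
    .setSettings(Settings.builder()
        .put("index.highlight.max_analyzed_offset", 500_000))
    .get();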
@@ -21,5 +21,5 @@ Highlighting a text that was indexed without offsets or term vectors,
 requires analysis of this text in memory real time during the search request.
 For large texts this analysis may take substantial amount of time and memory.
 To protect against this, the maximum number of characters that will be analyzed has been
-limited to 10000. This default limit can be changed
+limited to 1000000. This default limit can be changed
 for a particular index with the index setting `index.highlight.max_analyzed_offset`.
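As this migration note says, the limit exists only because text indexed without offsets or term vectors must be re-analyzed in memory at search time. Storing term vectors at index time avoids that work entirely; a sketch with the same assumed `client` and hypothetical index and field names:

// Sketch: store term vectors with positions and offsets so the unified and
// fvh highlighters never re-analyze the text; the limit, which applies only
// to re-analysis, then never triggers for this field.
client.admin().indices().prepareCreate("articles")
    .addMapping("_doc", "body", "type=text,term_vector=with_positions_offsets")
    .get();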
@@ -104,7 +104,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
 [WARNING]
 Plain highlighting for large texts may require substantial amount of time and memory.
 To protect against this, the maximum number of text characters that will be analyzed has been
-limited to 10000. This default limit can be changed
+limited to 1000000. This default limit can be changed
 for a particular index with the index setting `index.highlight.max_analyzed_offset`.
 
 [[highlighting-settings]]
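Given this warning, callers highlighting large texts can request the `unified` highlighter explicitly rather than falling back to `plain`. A sketch; the `client`, query, index, and field names are assumptions:

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;

// Sketch: ask for unified (not plain) highlighting on the "body" field.
SearchResponse resp = client.prepareSearch("articles")
    .setQuery(QueryBuilders.matchQuery("body", "highlighting"))
    .highlighter(new HighlightBuilder().field("body").highlighterType("unified"))
    .get();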
@@ -37,7 +37,6 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
-import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
 
 import java.io.IOException;
@@ -68,7 +67,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private final BreakIterator breakIterator;
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
-    private final int maxAnalyzedOffset;
 
     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -83,7 +81,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
      * If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
      * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
      * @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
-     * @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,
@@ -92,8 +89,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
                                     @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    int noMatchSize,
-                                    int maxAnalyzedOffset) {
+                                    int noMatchSize) {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;
@@ -101,7 +97,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
         this.noMatchSize = noMatchSize;
-        this.maxAnalyzedOffset = maxAnalyzedOffset;
     }
 
     /**
@@ -125,13 +120,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
                                                    int cacheCharsThreshold) throws IOException {
-        if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
-            throw new IllegalArgumentException(
-                "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
-                    maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
-                    IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
-                    "For large texts, indexing with offsets or term vectors is recommended!");
-        }
         // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }
@@ -123,11 +123,11 @@ public final class IndexSettings {
      * A setting describing the maximum number of characters that will be analyzed for a highlight request.
      * This setting is only applicable when highlighting is requested on a text that was indexed without
      * offsets or term vectors.
-     * The default maximum of 10000 characters is defensive as for highlighting larger texts,
+     * The default maximum of 1M characters is defensive as for highlighting larger texts,
      * indexing with offsets or term vectors is recommended.
      */
     public static final Setting<Integer> MAX_ANALYZED_OFFSET_SETTING =
-        Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope);
+        Setting.intSetting("index.highlight.max_analyzed_offset", 1000000, 1, Property.Dynamic, Property.IndexScope);
 
 
     /**
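In the `Setting.intSetting(key, defaultValue, minValue, properties...)` call, the minimum of 1 keeps the limit positive, `Property.Dynamic` permits live updates, and `Property.IndexScope` ties the value to a single index. Resolving the effective value goes through the setting object itself; a minimal sketch, where passing the index-level `Settings` is an assumption about the call site:

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;

// Sketch: returns the per-index override, or 1,000,000 when unset.
static int effectiveLimit(Settings indexLevelSettings) {
    return IndexSettings.MAX_ANALYZED_OFFSET_SETTING.get(indexLevelSettings);
}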
@@ -113,11 +113,12 @@ public class PlainHighlighter implements Highlighter {
         String text = convertFieldValue(mapper.fieldType(), textToHighlight);
         if (text.length() > maxAnalyzedOffset) {
             throw new IllegalArgumentException(
-                "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
-                    maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
-                    IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
-                    "For large texts, indexing with offsets or term vectors, and highlighting with unified or " +
-                    "fvh highlighter is recommended!");
+                "The length of [" + highlighterContext.fieldName + "] field of [" + hitContext.hit().getId() +
+                    "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " +
+                    "has exceeded [" + maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
+                    "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                    "] index level setting. " + "For large texts, indexing with offsets or term vectors, and highlighting " +
+                    "with unified or fvh highlighter is recommended!");
         }
 
         try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
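With hypothetical values plugged in (field `body`, doc id `1`, index `articles`, the new default limit), the rewritten message concatenates to:

The length of [body] field of [1] doc of [articles] index has exceeded [1000000] - maximum allowed to be analyzed for highlighting. This maximum can be set by changing the [index.highlight.max_analyzed_offset] index level setting. For large texts, indexing with offsets or term vectors, and highlighting with unified or fvh highlighter is recommended!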
@@ -32,11 +32,11 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.text.Text;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
-import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
 import org.elasticsearch.search.internal.SearchContext;
@@ -44,10 +44,8 @@ import org.elasticsearch.search.internal.SearchContext;
 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
-import java.util.Map;
 import java.util.stream.Collectors;
 
 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;
@@ -83,21 +81,28 @@ public class UnifiedHighlighter implements Highlighter {
                 final CustomUnifiedHighlighter highlighter;
                 final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
                 final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType());
+                if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
+                    throw new IllegalArgumentException(
+                        "The length of [" + highlighterContext.fieldName + "] field of [" + hitContext.hit().getId() +
+                            "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " + "has exceeded [" +
+                            maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " +
+                            "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                            "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!");
+                }
                 if (field.fieldOptions().numberOfFragments() == 0) {
                     // we use a control char to separate values, which is the only char that the custom break iterator
                     // breaks the text on, so we don't lose the distinction between the different values of a field and we
                     // get back a snippet per value
                     CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
                     highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                        field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
-                        maxAnalyzedOffset);
+                        field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
                     numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
                 } else {
                     //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
                     BreakIterator bi = getBreakIterator(field);
                     highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
                         field.fieldOptions().boundaryScannerLocale(), bi,
-                        fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
+                        fieldValue, field.fieldOptions().noMatchSize());
                     numberOfFragments = field.fieldOptions().numberOfFragments();
                 }
 
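Unlike `PlainHighlighter`, which always checks the text length, the unified path guards only when `offsetSource == OffsetSource.ANALYSIS`, i.e. when offsets must be rebuilt by re-analyzing the raw value; fields indexed with offsets or term vectors skip the check regardless of length. A condensed illustration of that decision, as a standalone sketch rather than the shipped code:

import org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource;

// Illustration: the limit bounds re-analysis only; index-time offsets or
// term vectors make even very long field values highlightable.
static boolean exceedsAnalyzedOffsetLimit(OffsetSource source, String value, int limit) {
    return source == OffsetSource.ANALYSIS && value.length() > limit;
}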
@@ -78,7 +78,7 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
         String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
         CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
             new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
-            breakIterator, rawValue, noMatchSize, 10000);
+            breakIterator, rawValue, noMatchSize);
         highlighter.setFieldMatcher((name) -> "text".equals(name));
         final Snippet[] snippets =
             highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);