Limit the analyzed text for highlighting (#27934)
* Limit the analyzed text for highlighting
  - Introduce an index-level setting to control the maximum number of characters that will be analyzed for highlighting
  - Throw an error if analysis is required on a larger text

Closes #27517
parent d63b1efb14
commit cbd271e497
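Before the per-file hunks, here is the shape of the new guard in one place. This is a hedged, illustrative condensation of the diff below, not literal source: the real checks live in CustomUnifiedHighlighter.loadFieldValues and PlainHighlighter, and the helper class here exists only for this summary.

import org.elasticsearch.index.IndexSettings;

// Illustrative helper, not in the commit: both highlighters now refuse to
// analyze a field value longer than index.highlight.max_analyzed_offset.
final class HighlightLimits {
    static void ensureWithinMaxAnalyzedOffset(String fieldValue, int maxAnalyzedOffset) {
        if (fieldValue.length() > maxAnalyzedOffset) {
            throw new IllegalArgumentException(
                "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
                    maxAnalyzedOffset + "]. This maximum can be set by changing the [" +
                    IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
                    "For large texts, indexing with offsets or term vectors is recommended!");
        }
    }
}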
@@ -37,6 +37,7 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.search.ESToParentBlockJoinQuery;

 import java.io.IOException;

@@ -67,6 +68,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private final BreakIterator breakIterator;
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
+    private final int maxAnalyzedOffset;

     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}

@@ -81,6 +83,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
      * If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
      * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
      * @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
+     * @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,

@@ -89,7 +92,8 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
                                     @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    int noMatchSize) {
+                                    int noMatchSize,
+                                    int maxAnalyzedOffset) {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;

@@ -97,6 +101,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
         this.noMatchSize = noMatchSize;
+        this.maxAnalyzedOffset = maxAnalyzedOffset;
     }

     /**

@@ -120,6 +125,13 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
                                                    int cacheCharsThreshold) throws IOException {
+        if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
+            throw new IllegalArgumentException(
+                "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
+                    maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
+                    IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
+                    "For large texts, indexing with offsets or term vectors is recommended!");
+        }
         // we only highlight one field, one document at a time
         return Collections.singletonList(new String[]{fieldValue});
     }
@@ -118,6 +118,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
         IndexSettings.MAX_SHINGLE_DIFF_SETTING,
         IndexSettings.MAX_RESCORE_WINDOW_SETTING,
         IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
+        IndexSettings.MAX_ANALYZED_OFFSET_SETTING,
         IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
         IndexSettings.DEFAULT_FIELD_SETTING,
         IndexSettings.QUERY_STRING_LENIENT_SETTING,
@@ -118,6 +118,17 @@ public final class IndexSettings {
     public static final Setting<Integer> MAX_TOKEN_COUNT_SETTING =
         Setting.intSetting("index.analyze.max_token_count", 10000, 1, Property.Dynamic, Property.IndexScope);

+    /**
+     * A setting describing the maximum number of characters that will be analyzed for a highlight request.
+     * This setting is only applicable when highlighting is requested on a text that was indexed without
+     * offsets or term vectors.
+     * The default maximum of 10000 characters is defensive, as for highlighting larger texts,
+     * indexing with offsets or term vectors is recommended.
+     */
+    public static final Setting<Integer> MAX_ANALYZED_OFFSET_SETTING =
+        Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope);
+
     /**
      * Index setting describing for NGramTokenizer and NGramTokenFilter
      * the maximum difference between

@@ -275,6 +286,7 @@ public final class IndexSettings {
     private volatile int maxShingleDiff;
     private volatile boolean TTLPurgeDisabled;
     private volatile TimeValue searchIdleAfter;
+    private volatile int maxAnalyzedOffset;

     /**
      * The maximum number of refresh listeners allows on this shard.

@@ -384,6 +396,7 @@ public final class IndexSettings {
         TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
         maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
         maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
+        maxAnalyzedOffset = scopedSettings.get(MAX_ANALYZED_OFFSET_SETTING);
         this.mergePolicyConfig = new MergePolicyConfig(logger, this);
         this.indexSortConfig = new IndexSortConfig(this);
         searchIdleAfter = scopedSettings.get(INDEX_SEARCH_IDLE_AFTER);

@@ -426,6 +439,7 @@ public final class IndexSettings {
         scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_RETENTION_SIZE_SETTING, this::setTranslogRetentionSize);
         scopedSettings.addSettingsUpdateConsumer(INDEX_REFRESH_INTERVAL_SETTING, this::setRefreshInterval);
         scopedSettings.addSettingsUpdateConsumer(MAX_REFRESH_LISTENERS_PER_SHARD, this::setMaxRefreshListeners);
+        scopedSettings.addSettingsUpdateConsumer(MAX_ANALYZED_OFFSET_SETTING, this::setHighlightMaxAnalyzedOffset);
         scopedSettings.addSettingsUpdateConsumer(MAX_SLICES_PER_SCROLL, this::setMaxSlicesPerScroll);
         scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
         scopedSettings.addSettingsUpdateConsumer(INDEX_SEARCH_IDLE_AFTER, this::setSearchIdleAfter);

@@ -713,6 +727,13 @@ public final class IndexSettings {

     private void setMaxShingleDiff(int maxShingleDiff) { this.maxShingleDiff = maxShingleDiff; }

+    /**
+     * Returns the maximum number of chars that will be analyzed in a highlight request
+     */
+    public int getHighlightMaxAnalyzedOffset() { return this.maxAnalyzedOffset; }
+
+    private void setHighlightMaxAnalyzedOffset(int maxAnalyzedOffset) { this.maxAnalyzedOffset = maxAnalyzedOffset; }
+
     /**
      * Returns the maximum number of allowed script_fields to retrieve in a search request
      */
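As a usage note, a minimal hedged sketch of how consumers resolve this value. The wrapper class here is illustrative only; the getter is the one added above, which the highlighter hunks below reach through context.indexShard().indexSettings().

import org.elasticsearch.index.IndexSettings;

// Illustrative helper, not in the commit: resolve the per-request limit.
final class MaxAnalyzedOffsetReader {
    static int resolve(IndexSettings indexSettings) {
        // The setting is dynamic, so callers re-read it per request rather than caching it.
        return indexSettings.getHighlightMaxAnalyzedOffset();
    }
}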
@@ -35,6 +35,7 @@ import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.common.text.Text;
+import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;

@@ -103,11 +104,21 @@ public class PlainHighlighter implements Highlighter {
         ArrayList<TextFragment> fragsList = new ArrayList<>();
         List<Object> textsToHighlight;
         Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), mapper.fieldType());
+        final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();

         try {
             textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);

             for (Object textToHighlight : textsToHighlight) {
                 String text = convertFieldValue(mapper.fieldType(), textToHighlight);
+                if (text.length() > maxAnalyzedOffset) {
+                    throw new IllegalArgumentException(
+                        "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
+                            maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
+                            IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
+                            "For large texts, indexing with offsets or term vectors, and highlighting with unified or " +
+                            "fvh highlighter is recommended!");
+                }

                 try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                     if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
@@ -67,6 +67,7 @@ public class UnifiedHighlighter implements Highlighter {
         Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
         CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
             field.fieldOptions().postTags()[0], encoder);
+        final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();

         List<Snippet> snippets = new ArrayList<>();
         int numberOfFragments;

@@ -88,14 +89,15 @@ public class UnifiedHighlighter implements Highlighter {
                 // get back a snippet per value
                 CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
-                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
+                    field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
+                    maxAnalyzedOffset);
                 numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
             } else {
                 //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
                 BreakIterator bi = getBreakIterator(field);
                 highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
                     field.fieldOptions().boundaryScannerLocale(), bi,
-                    fieldValue, field.fieldOptions().noMatchSize());
+                    fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
                 numberOfFragments = field.fieldOptions().numberOfFragments();
             }
@@ -78,7 +78,7 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
         String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
         CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
             new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
-            breakIterator, rawValue, noMatchSize);
+            breakIterator, rawValue, noMatchSize, 10000);
         highlighter.setFieldMatcher((name) -> "text".equals(name));
         final Snippet[] snippets =
             highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);
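As a usage note, a hedged sketch of the failure path the new constructor argument enables in a test: force the ANALYSIS offset source (the guard in loadFieldValues only fires for analysis-time offsets) and pass a deliberately tiny limit. The fixtures (searcher, analyzer, query, an indexed doc 0) are assumed to be set up as elsewhere in CustomUnifiedHighlighterTests; expectThrows and the Hamcrest matchers come from the inherited test framework, and OffsetSource is UnifiedHighlighter.OffsetSource.

// Hypothetical test fragment, not part of the commit:
String longValue = "The quick brown fox went to the forest and saw another fox.";
CustomUnifiedHighlighter restrictive = new CustomUnifiedHighlighter(searcher, analyzer,
    OffsetSource.ANALYSIS,
    new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), Locale.ROOT,
    BreakIterator.getSentenceInstance(Locale.ROOT), longValue, 0 /* noMatchSize */,
    10 /* maxAnalyzedOffset, far below longValue.length() */);
restrictive.setFieldMatcher((name) -> "text".equals(name));
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
    () -> restrictive.highlightField("text", query, 0 /* docId */, 1));
assertThat(e.getMessage(), containsString("exceeded the allowed maximum of [10]"));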
@@ -198,6 +198,12 @@ specific index module:
     The maximum number of tokens that can be produced using _analyze API.
     Defaults to `10000`.

+`index.highlight.max_analyzed_offset`::
+
+    The maximum number of characters that will be analyzed for a highlight request.
+    This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors.
+    Defaults to `10000`.
+
 [float]
 === Settings in other index modules
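Because the setting documented in the hunk above is registered with Property.Dynamic, it can be changed on a live index. A hedged sketch using the admin client of that era; the client variable and the index name test1 are assumptions for illustration.

import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;

// Illustrative only: raise the highlighting analysis cap on an existing index.
final class RaiseHighlightLimit {
    static void apply(Client client) {
        client.admin().indices().prepareUpdateSettings("test1")
            .setSettings(Settings.builder()
                .put("index.highlight.max_analyzed_offset", 20000)
                .build())
            .get();
    }
}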
@@ -12,4 +12,14 @@ deprecated and will be removed at some point, so it should be replaced by

 To safeguard against out of memory errors, the number of tokens that can be produced
 using the `_analyze` endpoint has been limited to 10000. This default limit can be changed
-for a particular index with the index setting `index.analyze.max_token_count`.
+for a particular index with the index setting `index.analyze.max_token_count`.
+
+==== Limiting the length of an analyzed text during highlighting
+
+Highlighting a text that was indexed without offsets or term vectors
+requires this text to be analyzed in memory in real time during the search request.
+For large texts this analysis may take a substantial amount of time and memory.
+To protect against this, the maximum number of characters that will be analyzed has been
+limited to 10000. This default limit can be changed
+for a particular index with the index setting `index.highlight.max_analyzed_offset`.
@@ -101,6 +101,12 @@ Lucene's query execution planner to get access to low-level match information on
 the current document. This is repeated for every field and every document that
 needs highlighting. The `plain` highlighter always uses plain highlighting.

+[WARNING]
+Plain highlighting for large texts may require a substantial amount of time and memory.
+To protect against this, the maximum number of text characters that will be analyzed has been
+limited to 10000. This default limit can be changed
+for a particular index with the index setting `index.highlight.max_analyzed_offset`.
+
 [[highlighting-settings]]
 ==== Highlighting Settings
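The warning's recommended alternative for large texts is to pay the cost at index time instead. A hedged sketch of such a mapping via the Java client; the index, type, and field names mirror the YAML test below and are otherwise arbitrary, and the addMapping variant taking an XContentBuilder is assumed available in this client version.

import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

import java.io.IOException;

// Illustrative only: store offsets in the postings so the unified highlighter
// never needs to re-analyze the text at search time.
final class CreateIndexWithOffsets {
    static void apply(Client client) throws IOException {
        XContentBuilder mapping = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("properties")
                    .startObject("field2")
                        .field("type", "text")
                        .field("index_options", "offsets")
                    .endObject()
                .endObject()
            .endObject();
        client.admin().indices().prepareCreate("test1")
            .addMapping("test_type", mapping)
            .get();
    }
}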
@@ -0,0 +1,79 @@
---
setup:
  - do:
      indices.create:
        index: test1
        body:
          settings:
            number_of_shards: 1
            index.highlight.max_analyzed_offset: 10
          mappings:
            test_type:
              properties:
                field1:
                  type: text
                field2:
                  type: text
                  index_options: offsets

  - do:
      index:
        index: test1
        type: test_type
        id: 1
        body:
          "field1" : "The quick brown fox went to the forest and saw another fox."
          "field2" : "The quick brown fox went to the forest and saw another fox."

  - do:
      indices.refresh: {}

---
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
  - skip:
      version: " - 6.99.99"
      reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
  - do:
      catch: bad_request
      search:
        index: test1
        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
  - match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
  - skip:
      version: " - 6.99.99"
      reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
  - do:
      catch: bad_request
      search:
        index: test1
        body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field1" : {}}}}
  - match: { error.root_cause.0.type: "illegal_argument_exception" }

---
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
  - skip:
      version: " - 6.99.99"
      reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
  - do:
      search:
        index: test1
        body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field2" : {}}}}
  - match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another <em>fox</em>."}

---
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
  - skip:
      version: " - 6.99.99"
      reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
  - do:
      catch: bad_request
      search:
        index: test1
        body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
  - match: { error.root_cause.0.type: "illegal_argument_exception" }