Limit the analyzed text for highlighting (#27934)

* Limit the analyzed text for highlighting

- Introduce an index-level setting to control the maximum number of characters
to be analyzed for highlighting
- Throw an error if analysis is required on a larger text

Closes #27517
Mayya Sharipova 2017-12-21 10:19:58 -05:00 committed by GitHub
parent d63b1efb14
commit cbd271e497
10 changed files with 153 additions and 5 deletions
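For reference, a minimal sketch (not part of this commit; the client variable, class, and index name are illustrative) of how an index could raise the new limit at creation time via the Settings builder:

```java
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;

public class HighlightLimitSetup {
    // Sketch only: create an index that allows larger texts to be analyzed for highlighting.
    // `client` is assumed to be an org.elasticsearch.client.Client, e.g. a TransportClient.
    static void createIndexWithLargerLimit(Client client) {
        Settings settings = Settings.builder()
                .put("index.number_of_shards", 1)
                // the default introduced by this commit is 10000 characters
                .put(IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey(), 20000)
                .build();
        client.admin().indices().prepareCreate("my_index")
                .setSettings(settings)
                .get();
    }
}
```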

View File

@@ -37,6 +37,7 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.search.ESToParentBlockJoinQuery;
import java.io.IOException;
@@ -67,6 +68,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
private final BreakIterator breakIterator;
private final Locale breakIteratorLocale;
private final int noMatchSize;
private final int maxAnalyzedOffset;
/**
* Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -81,6 +83,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
* If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
* @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
* @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
* @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
*/
public CustomUnifiedHighlighter(IndexSearcher searcher,
Analyzer analyzer,
@@ -89,7 +92,8 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
@Nullable Locale breakIteratorLocale,
@Nullable BreakIterator breakIterator,
String fieldValue,
int noMatchSize) {
int noMatchSize,
int maxAnalyzedOffset) {
super(searcher, analyzer);
this.offsetSource = offsetSource;
this.breakIterator = breakIterator;
@@ -97,6 +101,7 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
this.passageFormatter = passageFormatter;
this.fieldValue = fieldValue;
this.noMatchSize = noMatchSize;
this.maxAnalyzedOffset = maxAnalyzedOffset;
}
/**
@@ -120,6 +125,13 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
@Override
protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter,
int cacheCharsThreshold) throws IOException {
if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) {
throw new IllegalArgumentException(
"The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
"For large texts, indexing with offsets or term vectors is recommended!");
}
// we only highlight one field, one document at a time
return Collections.singletonList(new String[]{fieldValue});
}

View File

@@ -118,6 +118,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
IndexSettings.MAX_ANALYZED_OFFSET_SETTING,
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
IndexSettings.DEFAULT_FIELD_SETTING,
IndexSettings.QUERY_STRING_LENIENT_SETTING,

View File

@@ -118,6 +118,17 @@ public final class IndexSettings {
public static final Setting<Integer> MAX_TOKEN_COUNT_SETTING =
Setting.intSetting("index.analyze.max_token_count", 10000, 1, Property.Dynamic, Property.IndexScope);
/**
* A setting describing the maximum number of characters that will be analyzed for a highlight request.
* This setting is only applicable when highlighting is requested on a text that was indexed without
* offsets or term vectors.
* The default maximum of 10000 characters is defensive, as for highlighting larger texts,
* indexing with offsets or term vectors is recommended.
*/
public static final Setting<Integer> MAX_ANALYZED_OFFSET_SETTING =
Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope);
/**
* Index setting describing for NGramTokenizer and NGramTokenFilter
* the maximum difference between
@@ -275,6 +286,7 @@ public final class IndexSettings {
private volatile int maxShingleDiff;
private volatile boolean TTLPurgeDisabled;
private volatile TimeValue searchIdleAfter;
private volatile int maxAnalyzedOffset;
/**
* The maximum number of refresh listeners allows on this shard.
@@ -384,6 +396,7 @@ public final class IndexSettings {
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
maxAnalyzedOffset = scopedSettings.get(MAX_ANALYZED_OFFSET_SETTING);
this.mergePolicyConfig = new MergePolicyConfig(logger, this);
this.indexSortConfig = new IndexSortConfig(this);
searchIdleAfter = scopedSettings.get(INDEX_SEARCH_IDLE_AFTER);
@@ -426,6 +439,7 @@ public final class IndexSettings {
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_RETENTION_SIZE_SETTING, this::setTranslogRetentionSize);
scopedSettings.addSettingsUpdateConsumer(INDEX_REFRESH_INTERVAL_SETTING, this::setRefreshInterval);
scopedSettings.addSettingsUpdateConsumer(MAX_REFRESH_LISTENERS_PER_SHARD, this::setMaxRefreshListeners);
scopedSettings.addSettingsUpdateConsumer(MAX_ANALYZED_OFFSET_SETTING, this::setHighlightMaxAnalyzedOffset);
scopedSettings.addSettingsUpdateConsumer(MAX_SLICES_PER_SCROLL, this::setMaxSlicesPerScroll);
scopedSettings.addSettingsUpdateConsumer(DEFAULT_FIELD_SETTING, this::setDefaultFields);
scopedSettings.addSettingsUpdateConsumer(INDEX_SEARCH_IDLE_AFTER, this::setSearchIdleAfter);
@@ -713,6 +727,13 @@ public final class IndexSettings {
private void setMaxShingleDiff(int maxShingleDiff) { this.maxShingleDiff = maxShingleDiff; }
/**
* Returns the maximum number of chars that will be analyzed in a highlight request
*/
public int getHighlightMaxAnalyzedOffset() { return this.maxAnalyzedOffset; }
private void setHighlightMaxAnalyzedOffset(int maxAnalyzedOffset) { this.maxAnalyzedOffset = maxAnalyzedOffset; }
/**
* Returns the maximum number of allowed script_fields to retrieve in a search request
*/
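Because the setting is registered with `Property.Dynamic` and wired into `addSettingsUpdateConsumer`, it can also be changed on a live index. A hedged sketch of doing so through the update-settings API (the client, class, and index name are placeholders, not part of this commit):

```java
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexSettings;

public class HighlightLimitUpdate {
    // Sketch only: dynamically raise index.highlight.max_analyzed_offset on an existing index.
    static void raiseLimit(Client client) {
        client.admin().indices().prepareUpdateSettings("my_index")
                .setSettings(Settings.builder()
                        .put(IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey(), 50000)
                        .build())
                .get();
    }
}
```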

View File

@@ -35,6 +35,7 @@ import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
import org.elasticsearch.search.fetch.FetchSubPhase;
@@ -103,11 +104,21 @@ public class PlainHighlighter implements Highlighter {
ArrayList<TextFragment> fragsList = new ArrayList<>();
List<Object> textsToHighlight;
Analyzer analyzer = getAnalyzer(context.mapperService().documentMapper(hitContext.hit().getType()), mapper.fieldType());
final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
try {
textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
for (Object textToHighlight : textsToHighlight) {
String text = convertFieldValue(mapper.fieldType(), textToHighlight);
if (text.length() > maxAnalyzedOffset) {
throw new IllegalArgumentException(
"The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" +
maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" +
IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
"For large texts, indexing with offsets or term vectors, and highlighting with unified or " +
"fvh highlighter is recommended!");
}
try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {

View File

@@ -67,6 +67,7 @@ public class UnifiedHighlighter implements Highlighter {
Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0],
field.fieldOptions().postTags()[0], encoder);
final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
List<Snippet> snippets = new ArrayList<>();
int numberOfFragments;
@@ -88,14 +89,15 @@ public class UnifiedHighlighter implements Highlighter {
// get back a snippet per value
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(),
maxAnalyzedOffset);
numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
BreakIterator bi = getBreakIterator(field);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter,
field.fieldOptions().boundaryScannerLocale(), bi,
fieldValue, field.fieldOptions().noMatchSize());
fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset);
numberOfFragments = field.fieldOptions().numberOfFragments();
}

View File

@@ -78,7 +78,7 @@ public class CustomUnifiedHighlighterTests extends ESTestCase {
String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null,
new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale,
breakIterator, rawValue, noMatchSize);
breakIterator, rawValue, noMatchSize, 10000);
highlighter.setFieldMatcher((name) -> "text".equals(name));
final Snippet[] snippets =
highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);

View File

@@ -198,6 +198,12 @@ specific index module:
The maximum number of tokens that can be produced using _analyze API.
Defaults to `10000`.
`index.highlight.max_analyzed_offset`::
The maximum number of characters that will be analyzed for a highlight request.
This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors.
Defaults to `10000`.
[float]
=== Settings in other index modules

View File

@@ -13,3 +13,13 @@ deprecated and will be removed at some point, so it should be replaced by
To safeguard against out of memory errors, the number of tokens that can be produced
using the `_analyze` endpoint has been limited to 10000. This default limit can be changed
for a particular index with the index setting `index.analyze.max_token_count`.
==== Limiting the length of an analyzed text during highlighting
Highlighting a text that was indexed without offsets or term vectors
requires this text to be analyzed in memory, in real time, during the search request.
For large texts this analysis may take a substantial amount of time and memory.
To protect against this, the maximum number of characters that will be analyzed has been
limited to 10000. This default limit can be changed
for a particular index with the index setting `index.highlight.max_analyzed_offset`.
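The recommendation above, indexing with offsets or term vectors, corresponds to `index_options: offsets` (or `term_vector: with_positions_offsets`) on the text field, as the `field2` mapping in the REST test below does. A hedged Java sketch of such a mapping (index, type, field, and class names are illustrative, not part of this commit):

```java
import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;

public class OffsetsMappingExample {
    // Sketch only: a text field indexed with offsets, so the unified highlighter can
    // highlight it without re-analyzing the source text at search time.
    static void createIndexWithOffsets(Client client) throws Exception {
        XContentBuilder mapping = XContentFactory.jsonBuilder()
                .startObject()
                    .startObject("properties")
                        .startObject("body")
                            .field("type", "text")
                            .field("index_options", "offsets")
                        .endObject()
                    .endObject()
                .endObject();
        client.admin().indices().prepareCreate("big_text_index")
                .addMapping("doc", mapping)
                .get();
    }
}
```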

View File

@@ -101,6 +101,12 @@ Lucene's query execution planner to get access to low-level match information on
the current document. This is repeated for every field and every document that
needs highlighting. The `plain` highlighter always uses plain highlighting.
[WARNING]
Plain highlighting for large texts may require a substantial amount of time and memory.
To protect against this, the maximum number of text characters that will be analyzed has been
limited to 10000. This default limit can be changed
for a particular index with the index setting `index.highlight.max_analyzed_offset`.
[[highlighting-settings]]
==== Highlighting Settings
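The warning above sits in the `plain` highlighter description; per the `loadFieldValues` change earlier in this commit, the same limit also applies to the `unified` highlighter whenever the field has no offsets or term vectors. For large texts, a field indexed with offsets combined with the unified highlighter avoids re-analysis entirely. A hedged sketch of requesting it (index, field, and class names are illustrative, not part of this commit):

```java
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;

public class UnifiedHighlightExample {
    // Sketch only: highlight a large field that was indexed with offsets, using the
    // unified highlighter so no re-analysis (and no max_analyzed_offset check) is needed.
    static SearchResponse highlight(Client client) {
        return client.prepareSearch("big_text_index")
                .setQuery(QueryBuilders.matchQuery("body", "fox"))
                .highlighter(new HighlightBuilder()
                        .field("body")
                        .highlighterType("unified"))
                .get();
    }
}
```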

View File

@@ -0,0 +1,79 @@
---
setup:
- do:
indices.create:
index: test1
body:
settings:
number_of_shards: 1
index.highlight.max_analyzed_offset: 10
mappings:
test_type:
properties:
field1:
type: text
field2:
type: text
index_options: offsets
- do:
index:
index: test1
type: test_type
id: 1
body:
"field1" : "The quick brown fox went to the forest and saw another fox."
"field2" : "The quick brown fox went to the forest and saw another fox."
- do:
indices.refresh: {}
---
"Unified highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
- skip:
version: " - 6.99.99"
reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
- do:
catch: bad_request
search:
index: test1
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }
---
"Plain highlighter on a field WITHOUT OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
- skip:
version: " - 6.99.99"
reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
- do:
catch: bad_request
search:
index: test1
body: {"query" : {"match" : {"field1" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field1" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }
---
"Unified highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should SUCCEED":
- skip:
version: " - 6.99.99"
reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
- do:
search:
index: test1
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "unified", "fields" : {"field2" : {}}}}
- match: {hits.hits.0.highlight.field2.0: "The quick brown <em>fox</em> went to the forest and saw another <em>fox</em>."}
---
"Plain highlighter on a field WITH OFFSETS exceeding index.highlight.max_analyzed_offset should FAIL":
- skip:
version: " - 6.99.99"
reason: index.highlight.max_analyzed_offset setting has been added in 7.0.0
- do:
catch: bad_request
search:
index: test1
body: {"query" : {"match" : {"field2" : "fox"}}, "highlight" : {"type" : "plain", "fields" : {"field2" : {}}}}
- match: { error.root_cause.0.type: "illegal_argument_exception" }