From 148376c2c57b4a494aec92d169aadf57c91851f5 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Tue, 7 Nov 2017 08:14:55 -0500 Subject: [PATCH] Add limits for ngram and shingle settings (#27211) * Add limits for ngram and shingle settings (#27211) Create index-level settings: max_ngram_diff - maximum allowed difference between max_gram and min_gram in NGramTokenFilter/NGramTokenizer. Default is 1. max_shingle_diff - maximum allowed difference between max_shingle_size and min_shingle_size in ShingleTokenFilter. Default is 3. Throw an IllegalArgumentException when trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter where difference between max_size and min_size exceeds the settings value. Closes #25887 --- .../common/settings/IndexScopedSettings.java | 3 ++ .../elasticsearch/index/IndexSettings.java | 40 +++++++++++++++++++ .../index/analysis/NGramTokenizerFactory.java | 14 +++++++ .../analysis/ShingleTokenFilterFactory.java | 16 ++++++++ .../ShingleTokenFilterFactoryTests.java | 22 ++++++++++ .../search/query/SearchQueryIT.java | 2 + .../search/suggest/SuggestSearchIT.java | 3 ++ .../index/analysis/shingle_analysis2.json | 15 +++++++ .../tokenfilters/ngram-tokenfilter.asciidoc | 3 ++ .../tokenfilters/shingle-tokenfilter.asciidoc | 2 + .../tokenizers/ngram-tokenizer.asciidoc | 3 ++ docs/reference/index-modules.asciidoc | 10 +++++ .../common/NGramTokenFilterFactory.java | 15 +++++++ .../common/HighlighterWithAnalyzersTests.java | 2 + .../common/NGramTokenizerFactoryTests.java | 28 ++++++++++++- .../test/analysis-common/30_tokenizers.yml | 15 +++++++ .../search.query/30_ngram_highligthing.yml | 1 + 17 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 core/src/test/resources/org/elasticsearch/index/analysis/shingle_analysis2.json diff --git a/core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java b/core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java index b37fbb0dce6..962e61b5c3c 
100644 --- a/core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java +++ b/core/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java @@ -114,6 +114,8 @@ public final class IndexScopedSettings extends AbstractScopedSettings { IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING, IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING, IndexSettings.MAX_SCRIPT_FIELDS_SETTING, + IndexSettings.MAX_NGRAM_DIFF_SETTING, + IndexSettings.MAX_SHINGLE_DIFF_SETTING, IndexSettings.MAX_RESCORE_WINDOW_SETTING, IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING, IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING, @@ -150,6 +152,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings { EngineConfig.INDEX_CODEC_SETTING, EngineConfig.INDEX_OPTIMIZE_AUTO_GENERATED_IDS, IndexMetaData.SETTING_WAIT_FOR_ACTIVE_SHARDS, + // validate that built-in similarities don't get redefined Setting.groupSetting("index.similarity.", (s) -> { Map groups = s.getAsGroups(); diff --git a/core/src/main/java/org/elasticsearch/index/IndexSettings.java b/core/src/main/java/org/elasticsearch/index/IndexSettings.java index 08d0e6c0cce..9e390fb5b22 100644 --- a/core/src/main/java/org/elasticsearch/index/IndexSettings.java +++ b/core/src/main/java/org/elasticsearch/index/IndexSettings.java @@ -107,6 +107,26 @@ public final class IndexSettings { public static final Setting MAX_SCRIPT_FIELDS_SETTING = Setting.intSetting("index.max_script_fields", 32, 0, Property.Dynamic, Property.IndexScope); + /** + * Index setting describing for NGramTokenizer and NGramTokenFilter + * the maximum difference between + * max_gram (maximum length of characters in a gram) and + * min_gram (minimum length of characters in a gram). + * The default value is 1 as this is default difference in NGramTokenizer, + * and is defensive as it prevents generating too many index terms. 
+ */ + public static final Setting MAX_NGRAM_DIFF_SETTING = + Setting.intSetting("index.max_ngram_diff", 1, 0, Property.Dynamic, Property.IndexScope); + + /** + * Index setting describing for ShingleTokenFilter + * the maximum difference between + * max_shingle_size and min_shingle_size. + * The default value of 3 is defensive as it prevents generating too many tokens. + */ + public static final Setting MAX_SHINGLE_DIFF_SETTING = + Setting.intSetting("index.max_shingle_diff", 3, 0, Property.Dynamic, Property.IndexScope); + /** * Index setting describing the maximum value of allowed `docvalue_fields`that can be retrieved * per search request. The default maximum of 100 is defensive for the reason that retrieving @@ -239,6 +259,8 @@ public final class IndexSettings { private volatile int maxRescoreWindow; private volatile int maxDocvalueFields; private volatile int maxScriptFields; + private volatile int maxNgramDiff; + private volatile int maxShingleDiff; private volatile boolean TTLPurgeDisabled; /** * The maximum number of refresh listeners allows on this shard. 
@@ -342,6 +364,8 @@ public final class IndexSettings { maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING); maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING); maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING); + maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING); + maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING); TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING); maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD); maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL); @@ -373,6 +397,8 @@ public final class IndexSettings { scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow); scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields); scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields); + scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff); + scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff); scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer); scopedSettings.addSettingsUpdateConsumer(INDEX_GC_DELETES_SETTING, this::setGCDeletes); scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING, this::setTranslogFlushThresholdSize); @@ -641,6 +667,20 @@ public final class IndexSettings { this.maxDocvalueFields = maxDocvalueFields; } + /** + * Returns the maximum allowed difference between max and min length of ngram + */ + public int getMaxNgramDiff() { return this.maxNgramDiff; } + + private void setMaxNgramDiff(int maxNgramDiff) { this.maxNgramDiff = maxNgramDiff; } + + /** + * Returns the maximum allowed difference between max and min shingle_size + */ + public int getMaxShingleDiff() { return this.maxShingleDiff; } + + private void setMaxShingleDiff(int maxShingleDiff) { 
this.maxShingleDiff = maxShingleDiff; } + /** * Returns the maximum number of allowed script_fields to retrieve in a search request */ diff --git a/core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java index 2a31f1eb26a..a5774cd9ce3 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java @@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.elasticsearch.Version; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -84,8 +85,21 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory { public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); + int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff(); this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); + int ngramDiff = maxGram - minGram; + if (ngramDiff > maxAllowedNgramDiff) { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) { + throw new IllegalArgumentException( + "The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + + maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. 
This limit can be set by changing the [" + + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting."); + } else { + deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer, " + + "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]"); + } + } this.matcher = parseTokenChars(settings.getAsList("token_chars")); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java index a3eecfbdcbf..422941acc73 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java @@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.elasticsearch.Version; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -32,9 +33,24 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); + int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff(); Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE); Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE); Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true); + + int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 
1 : 0); + if (shingleDiff > maxAllowedShingleDiff) { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) { + throw new IllegalArgumentException( + "In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)" + + " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit" + + " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting."); + } else { + deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter, " + + "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]"); + } + } + Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false); String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR); String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN); diff --git a/core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java index 3997ece1361..3af58d4ef73 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactoryTests.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; @@ -102,4 +103,25 @@ public class ShingleTokenFilterFactoryTests extends ESTokenStreamTestCase { 
assertFalse(stream.hasAttribute(DisableGraphAttribute.class)); } } + + /** + * test that throws an error when trying to get a ShingleTokenFilter where difference between max_shingle_size and min_shingle_size + * is greater than the allowed value of max_shingle_diff + */ + public void testMaxShingleDiffException() throws Exception{ + String RESOURCE2 = "/org/elasticsearch/index/analysis/shingle_analysis2.json"; + int maxAllowedShingleDiff = 3; + int shingleDiff = 8; + try { + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE2); + analysis.tokenFilter.get("shingle"); + fail(); + } catch (IllegalArgumentException ex) { + assertEquals( + "In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)" + + " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit" + + " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.", + ex.getMessage()); + } + } } diff --git a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java index 3ad7a83ef19..a94f499d0ba 100644 --- a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java +++ b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java @@ -30,6 +30,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchQueryBuilder; import org.elasticsearch.index.query.MultiMatchQueryBuilder; @@ -1802,6 +1803,7 @@ public class SearchQueryIT extends ESIntegTestCase { public void testNGramCopyField() { CreateIndexRequestBuilder 
builder = prepareCreate("test").setSettings(Settings.builder() .put(indexSettings()) + .put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9) .put("index.analysis.analyzer.my_ngram_analyzer.type", "custom") .put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer") .put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram") diff --git a/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java b/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java index b0b655b0f8b..3b1c88cfc57 100644 --- a/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/suggest/SuggestSearchIT.java @@ -28,6 +28,7 @@ import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.plugins.ScriptPlugin; import org.elasticsearch.script.ScriptContext; @@ -683,6 +684,7 @@ public class SuggestSearchIT extends ESIntegTestCase { public void testShardFailures() throws IOException, InterruptedException { CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder() .put(indexSettings()) + .put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4) .put("index.analysis.analyzer.suggest.tokenizer", "standard") .putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler") .put("index.analysis.filter.shingler.type", "shingle") @@ -743,6 +745,7 @@ public class SuggestSearchIT extends ESIntegTestCase { endObject(); assertAcked(prepareCreate("test").setSettings(Settings.builder() .put(indexSettings()) + .put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4) .put("index.analysis.analyzer.suggest.tokenizer", "standard") 
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler") .put("index.analysis.filter.shingler.type", "shingle") diff --git a/core/src/test/resources/org/elasticsearch/index/analysis/shingle_analysis2.json b/core/src/test/resources/org/elasticsearch/index/analysis/shingle_analysis2.json new file mode 100644 index 00000000000..a81ea538f19 --- /dev/null +++ b/core/src/test/resources/org/elasticsearch/index/analysis/shingle_analysis2.json @@ -0,0 +1,15 @@ +{ + "index":{ + "analysis":{ + "filter":{ + "shingle_filler":{ + "type":"shingle", + "max_shingle_size" : 10, + "min_shingle_size" : 2, + "output_unigrams" : false, + "filler_token" : "FILLER" + } + } + } + } +} \ No newline at end of file diff --git a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc index 5f911360085..acc178a2741 100644 --- a/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/ngram-tokenfilter.asciidoc @@ -13,3 +13,6 @@ type: |`max_gram` |Defaults to `2`. |============================ +The index level setting `index.max_ngram_diff` controls the maximum allowed +difference between `max_gram` and `min_gram`. + diff --git a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc index 5e3565cf83c..386b45559fd 100644 --- a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc @@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used together with the `shingle` filter. Defaults to `"_"` |======================================================================= +The index level setting `index.max_shingle_diff` controls the maximum allowed +difference between `max_shingle_size` and `min_shingle_size`. 
diff --git a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc index ae3183f0fd1..c182ffacd1c 100644 --- a/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc +++ b/docs/reference/analysis/tokenizers/ngram-tokenizer.asciidoc @@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower the quality of the matches. The longer the length, the more specific the matches. A tri-gram (length `3`) is a good place to start. +The index level setting `index.max_ngram_diff` controls the maximum allowed +difference between `max_gram` and `min_gram`. + [float] === Example configuration diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc index d42587eafaf..bf93f62847f 100644 --- a/docs/reference/index-modules.asciidoc +++ b/docs/reference/index-modules.asciidoc @@ -144,6 +144,16 @@ specific index module: The maximum number of `script_fields` that are allowed in a query. Defaults to `32`. +`index.max_ngram_diff`:: + + The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter. + Defaults to `1`. + +`index.max_shingle_diff`:: + + The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter. + Defaults to `3`. 
+ `index.blocks.read_only`:: Set to `true` to make the index and index metadata read only, `false` to diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java index 2d7a8c52fd6..22b06061316 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/NGramTokenFilterFactory.java @@ -25,6 +25,8 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.Version; + public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { @@ -36,8 +38,21 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory { NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); + int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff(); this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); + int ngramDiff = maxGram - minGram; + if (ngramDiff > maxAllowedNgramDiff) { + if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) { + throw new IllegalArgumentException( + "The difference between max_gram and min_gram in NGram TokenFilter must be less than or equal to: [" + + maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. 
This limit can be set by changing the [" + + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting."); + } else { + deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram TokenFilter, " + + "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]"); + } + } } @Override diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java index 6b4682d04a1..96e8043570d 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java @@ -21,6 +21,7 @@ package org.elasticsearch.analysis.common; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.query.Operator; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; @@ -66,6 +67,7 @@ public class HighlighterWithAnalyzersTests extends ESIntegTestCase { .endObject()) .setSettings(Settings.builder() .put(indexSettings()) + .put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19) .put("analysis.tokenizer.autocomplete.max_gram", 20) .put("analysis.tokenizer.autocomplete.min_gram", 1) .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit") diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java index 3f4641c7c18..3c6250eacfa 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java +++ 
b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/NGramTokenizerFactoryTests.java @@ -76,7 +76,8 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { public void testNoTokenChars() throws IOException { final Index index = new Index("test", "_na_"); final String name = "ngr"; - final Settings indexSettings = newAnalysisSettingsBuilder().build(); + final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build(); + final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4) .putList("token_chars", new String[0]).build(); Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings) @@ -152,6 +153,31 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase { } + /** + * test that throws an error when trying to get a NGramTokenizer where difference between max_gram and min_gram + * is greater than the allowed value of max_ngram_diff + */ + public void testMaxNGramDiffException() throws Exception{ + final Index index = new Index("test", "_na_"); + final String name = "ngr"; + final Settings indexSettings = newAnalysisSettingsBuilder().build(); + IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings); + + int maxAllowedNgramDiff = indexProperties.getMaxNgramDiff(); + int ngramDiff = maxAllowedNgramDiff + 1; + int min_gram = 2; + int max_gram = min_gram + ngramDiff; + + final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build(); + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> + new NGramTokenizerFactory(indexProperties, null, name, settings).create()); + assertEquals( + "The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: [" + + maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. 
This limit can be set by changing the [" + + IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.", + ex.getMessage()); + } + private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException { Field[] declaredFields = Version.class.getFields(); List versionFields = new ArrayList<>(); diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml index c0945e047c5..e6b69db8a0e 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml @@ -27,6 +27,21 @@ - match: { detail.tokenizer.tokens.2.token: od } --- +"nGram_exception": + - skip: + version: " - 6.99.99" + reason: only starting from version 7.x this throws an error + - do: + catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./ + indices.analyze: + body: + text: good + explain: true + tokenizer: + type: nGram + min_gram: 2 + max_gram: 4 +--- "simple_pattern": - do: indices.analyze: diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml index b04496965eb..c1dca047f60 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/search.query/30_ngram_highligthing.yml @@ -6,6 +6,7 @@ settings: number_of_shards: 1 number_of_replicas: 0 + index.max_ngram_diff: 19 analysis: tokenizer: my_ngramt: