Add limits for ngram and shingle settings (#27211)
* Add limits for ngram and shingle settings (#27211) Create index-level settings: max_ngram_diff - maximum allowed difference between max_gram and min_gram in NGramTokenFilter/NGramTokenizer. Default is 1. max_shingle_diff - maximum allowed difference between max_shingle_size and min_shingle_size in ShingleTokenFilter. Default is 3. Throw an IllegalArgumentException when trying to create NGramTokenFilter, NGramTokenizer, ShingleTokenFilter where difference between max_size and min_size exceeds the settings value. Closes #25887
This commit is contained in:
parent
2fc6c64c82
commit
148376c2c5
|
@ -114,6 +114,8 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
|
|||
IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
|
||||
IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
|
||||
IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
|
||||
IndexSettings.MAX_NGRAM_DIFF_SETTING,
|
||||
IndexSettings.MAX_SHINGLE_DIFF_SETTING,
|
||||
IndexSettings.MAX_RESCORE_WINDOW_SETTING,
|
||||
IndexSettings.MAX_ADJACENCY_MATRIX_FILTERS_SETTING,
|
||||
IndexSettings.INDEX_TRANSLOG_SYNC_INTERVAL_SETTING,
|
||||
|
@ -150,6 +152,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
|
|||
EngineConfig.INDEX_CODEC_SETTING,
|
||||
EngineConfig.INDEX_OPTIMIZE_AUTO_GENERATED_IDS,
|
||||
IndexMetaData.SETTING_WAIT_FOR_ACTIVE_SHARDS,
|
||||
|
||||
// validate that built-in similarities don't get redefined
|
||||
Setting.groupSetting("index.similarity.", (s) -> {
|
||||
Map<String, Settings> groups = s.getAsGroups();
|
||||
|
|
|
@ -107,6 +107,26 @@ public final class IndexSettings {
|
|||
public static final Setting<Integer> MAX_SCRIPT_FIELDS_SETTING =
|
||||
Setting.intSetting("index.max_script_fields", 32, 0, Property.Dynamic, Property.IndexScope);
|
||||
|
||||
/**
|
||||
* Index setting describing for NGramTokenizer and NGramTokenFilter
|
||||
* the maximum difference between
|
||||
* max_gram (maximum length of characters in a gram) and
|
||||
* min_gram (minimum length of characters in a gram).
|
||||
 * The default value is 1 as this is the default difference in NGramTokenizer,
|
||||
* and is defensive as it prevents generating too many index terms.
|
||||
*/
|
||||
public static final Setting<Integer> MAX_NGRAM_DIFF_SETTING =
|
||||
Setting.intSetting("index.max_ngram_diff", 1, 0, Property.Dynamic, Property.IndexScope);
|
||||
|
||||
/**
|
||||
* Index setting describing for ShingleTokenFilter
|
||||
* the maximum difference between
|
||||
* max_shingle_size and min_shingle_size.
|
||||
 * The default value of 3 is defensive as it prevents generating too many tokens.
|
||||
*/
|
||||
public static final Setting<Integer> MAX_SHINGLE_DIFF_SETTING =
|
||||
Setting.intSetting("index.max_shingle_diff", 3, 0, Property.Dynamic, Property.IndexScope);
|
||||
|
||||
/**
|
||||
 * Index setting describing the maximum value of allowed `docvalue_fields` that can be retrieved
|
||||
* per search request. The default maximum of 100 is defensive for the reason that retrieving
|
||||
|
@ -239,6 +259,8 @@ public final class IndexSettings {
|
|||
private volatile int maxRescoreWindow;
|
||||
private volatile int maxDocvalueFields;
|
||||
private volatile int maxScriptFields;
|
||||
private volatile int maxNgramDiff;
|
||||
private volatile int maxShingleDiff;
|
||||
private volatile boolean TTLPurgeDisabled;
|
||||
/**
|
||||
* The maximum number of refresh listeners allows on this shard.
|
||||
|
@ -342,6 +364,8 @@ public final class IndexSettings {
|
|||
maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
|
||||
maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
|
||||
maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
|
||||
maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
|
||||
maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
|
||||
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
|
||||
maxRefreshListeners = scopedSettings.get(MAX_REFRESH_LISTENERS_PER_SHARD);
|
||||
maxSlicesPerScroll = scopedSettings.get(MAX_SLICES_PER_SCROLL);
|
||||
|
@ -373,6 +397,8 @@ public final class IndexSettings {
|
|||
scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
|
||||
scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
|
||||
scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
|
||||
scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
|
||||
scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
|
||||
scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
|
||||
scopedSettings.addSettingsUpdateConsumer(INDEX_GC_DELETES_SETTING, this::setGCDeletes);
|
||||
scopedSettings.addSettingsUpdateConsumer(INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING, this::setTranslogFlushThresholdSize);
|
||||
|
@ -641,6 +667,20 @@ public final class IndexSettings {
|
|||
this.maxDocvalueFields = maxDocvalueFields;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the maximum allowed difference between max and min length of ngram
|
||||
*/
|
||||
public int getMaxNgramDiff() { return this.maxNgramDiff; }
|
||||
|
||||
private void setMaxNgramDiff(int maxNgramDiff) { this.maxNgramDiff = maxNgramDiff; }
|
||||
|
||||
/**
|
||||
* Returns the maximum allowed difference between max and min shingle_size
|
||||
*/
|
||||
public int getMaxShingleDiff() { return this.maxShingleDiff; }
|
||||
|
||||
private void setMaxShingleDiff(int maxShingleDiff) { this.maxShingleDiff = maxShingleDiff; }
|
||||
|
||||
/**
|
||||
* Returns the maximum number of allowed script_fields to retrieve in a search request
|
||||
*/
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
|
|||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
@ -84,8 +85,21 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
|
|||
|
||||
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
|
||||
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
|
||||
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
|
||||
int ngramDiff = maxGram - minGram;
|
||||
if (ngramDiff > maxAllowedNgramDiff) {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
throw new IllegalArgumentException(
|
||||
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
|
||||
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
|
||||
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
|
||||
} else {
|
||||
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
|
||||
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
|
||||
}
|
||||
}
|
||||
this.matcher = parseTokenChars(settings.getAsList("token_chars"));
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.index.analysis;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
@ -32,9 +33,24 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
|
||||
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
|
||||
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
||||
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
|
||||
|
||||
int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
|
||||
if (shingleDiff > maxAllowedShingleDiff) {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
throw new IllegalArgumentException(
|
||||
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
|
||||
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
|
||||
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.");
|
||||
} else {
|
||||
deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
|
||||
+ "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
|
||||
}
|
||||
}
|
||||
|
||||
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
||||
String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
|
||||
String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||
|
||||
|
@ -102,4 +103,25 @@ public class ShingleTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
|||
assertFalse(stream.hasAttribute(DisableGraphAttribute.class));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* test that throws an error when trying to get a ShingleTokenFilter where difference between max_shingle_size and min_shingle_size
|
||||
* is greater than the allowed value of max_shingle_diff
|
||||
*/
|
||||
public void testMaxShingleDiffException() throws Exception{
|
||||
String RESOURCE2 = "/org/elasticsearch/index/analysis/shingle_analysis2.json";
|
||||
int maxAllowedShingleDiff = 3;
|
||||
int shingleDiff = 8;
|
||||
try {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE2);
|
||||
analysis.tokenFilter.get("shingle");
|
||||
fail();
|
||||
} catch (IllegalArgumentException ex) {
|
||||
assertEquals(
|
||||
"In Shingle TokenFilter the difference between max_shingle_size and min_shingle_size (and +1 if outputting unigrams)"
|
||||
+ " must be less than or equal to: [" + maxAllowedShingleDiff + "] but was [" + shingleDiff + "]. This limit"
|
||||
+ " can be set by changing the [" + IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey() + "] index level setting.",
|
||||
ex.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ import org.elasticsearch.common.Strings;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.common.xcontent.XContentType;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.query.BoolQueryBuilder;
|
||||
import org.elasticsearch.index.query.MatchQueryBuilder;
|
||||
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
|
||||
|
@ -1802,6 +1803,7 @@ public class SearchQueryIT extends ESIntegTestCase {
|
|||
public void testNGramCopyField() {
|
||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
|
||||
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
|
||||
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
|
||||
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.elasticsearch.action.search.SearchResponse;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.plugins.ScriptPlugin;
|
||||
import org.elasticsearch.script.ScriptContext;
|
||||
|
@ -683,6 +684,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
|
|||
public void testShardFailures() throws IOException, InterruptedException {
|
||||
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
|
||||
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
|
||||
.put("index.analysis.filter.shingler.type", "shingle")
|
||||
|
@ -743,6 +745,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
|
|||
endObject();
|
||||
assertAcked(prepareCreate("test").setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put(IndexSettings.MAX_SHINGLE_DIFF_SETTING.getKey(), 4)
|
||||
.put("index.analysis.analyzer.suggest.tokenizer", "standard")
|
||||
.putList("index.analysis.analyzer.suggest.filter", "standard", "lowercase", "shingler")
|
||||
.put("index.analysis.filter.shingler.type", "shingle")
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"index":{
|
||||
"analysis":{
|
||||
"filter":{
|
||||
"shingle_filler":{
|
||||
"type":"shingle",
|
||||
"max_shingle_size" : 10,
|
||||
"min_shingle_size" : 2,
|
||||
"output_unigrams" : false,
|
||||
"filler_token" : "FILLER"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -13,3 +13,6 @@ type:
|
|||
|`max_gram` |Defaults to `2`.
|
||||
|============================
|
||||
|
||||
The index level setting `index.max_ngram_diff` controls the maximum allowed
|
||||
difference between `max_gram` and `min_gram`.
|
||||
|
||||
|
|
|
@ -38,3 +38,5 @@ used if the position increment is greater than one when a `stop` filter is used
|
|||
together with the `shingle` filter. Defaults to `"_"`
|
||||
|=======================================================================
|
||||
|
||||
The index level setting `index.max_shingle_diff` controls the maximum allowed
|
||||
difference between `max_shingle_size` and `min_shingle_size`.
|
||||
|
|
|
@ -198,6 +198,9 @@ value. The smaller the length, the more documents will match but the lower
|
|||
the quality of the matches. The longer the length, the more specific the
|
||||
matches. A tri-gram (length `3`) is a good place to start.
|
||||
|
||||
The index level setting `index.max_ngram_diff` controls the maximum allowed
|
||||
difference between `max_gram` and `min_gram`.
|
||||
|
||||
[float]
|
||||
=== Example configuration
|
||||
|
||||
|
|
|
@ -144,6 +144,16 @@ specific index module:
|
|||
The maximum number of `script_fields` that are allowed in a query.
|
||||
Defaults to `32`.
|
||||
|
||||
`index.max_ngram_diff`::
|
||||
|
||||
The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter.
|
||||
Defaults to `1`.
|
||||
|
||||
`index.max_shingle_diff`::
|
||||
|
||||
The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter.
|
||||
Defaults to `3`.
|
||||
|
||||
`index.blocks.read_only`::
|
||||
|
||||
Set to `true` to make the index and index metadata read only, `false` to
|
||||
|
|
|
@ -25,6 +25,8 @@ import org.elasticsearch.common.settings.Settings;
|
|||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.Version;
|
||||
|
||||
|
||||
|
||||
public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
@ -36,8 +38,21 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
|
||||
NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
|
||||
this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
|
||||
this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
|
||||
int ngramDiff = maxGram - minGram;
|
||||
if (ngramDiff > maxAllowedNgramDiff) {
|
||||
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
throw new IllegalArgumentException(
|
||||
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
|
||||
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
|
||||
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.");
|
||||
} else {
|
||||
deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
|
||||
+ "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.elasticsearch.analysis.common;
|
|||
|
||||
import org.elasticsearch.action.search.SearchResponse;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.query.Operator;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
|
||||
|
@ -66,6 +67,7 @@ public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
|
|||
.endObject())
|
||||
.setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 19)
|
||||
.put("analysis.tokenizer.autocomplete.max_gram", 20)
|
||||
.put("analysis.tokenizer.autocomplete.min_gram", 1)
|
||||
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
|
||||
|
|
|
@ -76,7 +76,8 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase {
|
|||
public void testNoTokenChars() throws IOException {
|
||||
final Index index = new Index("test", "_na_");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 2).build();
|
||||
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4)
|
||||
.putList("token_chars", new String[0]).build();
|
||||
Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
|
||||
|
@ -152,6 +153,31 @@ public class NGramTokenizerFactoryTests extends ESTokenStreamTestCase {
|
|||
}
|
||||
|
||||
|
||||
/*
|
||||
* test that throws an error when trying to get a NGramTokenizer where difference between max_gram and min_gram
|
||||
* is greater than the allowed value of max_ngram_diff
|
||||
*/
|
||||
public void testMaxNGramDiffException() throws Exception{
|
||||
final Index index = new Index("test", "_na_");
|
||||
final String name = "ngr";
|
||||
final Settings indexSettings = newAnalysisSettingsBuilder().build();
|
||||
IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
|
||||
|
||||
int maxAllowedNgramDiff = indexProperties.getMaxNgramDiff();
|
||||
int ngramDiff = maxAllowedNgramDiff + 1;
|
||||
int min_gram = 2;
|
||||
int max_gram = min_gram + ngramDiff;
|
||||
|
||||
final Settings settings = newAnalysisSettingsBuilder().put("min_gram", min_gram).put("max_gram", max_gram).build();
|
||||
IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () ->
|
||||
new NGramTokenizerFactory(indexProperties, null, name, settings).create());
|
||||
assertEquals(
|
||||
"The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to: ["
|
||||
+ maxAllowedNgramDiff + "] but was [" + ngramDiff + "]. This limit can be set by changing the ["
|
||||
+ IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey() + "] index level setting.",
|
||||
ex.getMessage());
|
||||
}
|
||||
|
||||
private Version randomVersion(Random random) throws IllegalArgumentException, IllegalAccessException {
|
||||
Field[] declaredFields = Version.class.getFields();
|
||||
List<Field> versionFields = new ArrayList<>();
|
||||
|
|
|
@ -27,6 +27,21 @@
|
|||
- match: { detail.tokenizer.tokens.2.token: od }
|
||||
|
||||
---
|
||||
"nGram_exception":
|
||||
- skip:
|
||||
version: " - 6.99.99"
|
||||
reason: only starting from version 7.x this throws an error
|
||||
- do:
|
||||
catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
|
||||
indices.analyze:
|
||||
body:
|
||||
text: good
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: nGram
|
||||
min_gram: 2
|
||||
max_gram: 4
|
||||
---
|
||||
"simple_pattern":
|
||||
- do:
|
||||
indices.analyze:
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
settings:
|
||||
number_of_shards: 1
|
||||
number_of_replicas: 0
|
||||
index.max_ngram_diff: 19
|
||||
analysis:
|
||||
tokenizer:
|
||||
my_ngramt:
|
||||
|
|
Loading…
Reference in New Issue