Limit the number of tokens produced by _analyze (#27529)

Add an index-level setting, `index.analyze.max_token_count`, to control
the number of tokens generated by the _analyze endpoint.
Defaults to 10000.

Throw an error if the number of generated tokens exceeds this limit.

Closes #27038
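
For illustration, a minimal console-style sketch of the behavior this change introduces
(the index name and the limit of 3 are assumptions chosen for the example; the docs and
YAML tests below use the same pattern):

[source,js]
--------------------------------------------------
PUT analyze_sample
{
  "settings" : {
    "index.analyze.max_token_count" : 3
  }
}

GET analyze_sample/_analyze
{
  "analyzer" : "standard",
  "text" : "one token too many here"
}
--------------------------------------------------

The second request produces five tokens against a limit of three, so it is expected to
fail with an `illegal_state_exception` whose message names the
`[index.analyze.max_token_count]` setting.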
Author: Mayya Sharipova, 2017-11-30 11:54:39 -05:00 (committed by GitHub)
Parent: 92a24de509
Commit: c6b73239ae
8 changed files with 241 additions and 30 deletions

File: TransportAnalyzeAction.java

@@ -158,15 +158,18 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
}
}
final AnalysisRegistry analysisRegistry = indicesService.getAnalysis();
- return analyze(request, field, analyzer, indexService != null ? indexService.getIndexAnalyzers() : null, analysisRegistry, environment);
+ final int maxTokenCount = indexService == null ?
+ IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings) : indexService.getIndexSettings().getMaxTokenCount();
+ return analyze(request, field, analyzer, indexService != null ? indexService.getIndexAnalyzers() : null,
+ analysisRegistry, environment, maxTokenCount);
} catch (IOException e) {
throw new ElasticsearchException("analysis failed", e);
}
}
- public static AnalyzeResponse analyze(AnalyzeRequest request, String field, Analyzer analyzer, IndexAnalyzers indexAnalyzers, AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
+ public static AnalyzeResponse analyze(AnalyzeRequest request, String field, Analyzer analyzer, IndexAnalyzers indexAnalyzers,
+ AnalysisRegistry analysisRegistry, Environment environment, int maxTokenCount) throws IOException {
boolean closeAnalyzer = false;
if (analyzer == null && request.analyzer() != null) {
if (indexAnalyzers == null) {
@@ -235,9 +238,9 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
DetailAnalyzeResponse detail = null;
if (request.explain()) {
- detail = detailAnalyze(request, analyzer, field);
+ detail = detailAnalyze(request, analyzer, field, maxTokenCount);
} else {
- tokens = simpleAnalyze(request, analyzer, field);
+ tokens = simpleAnalyze(request, analyzer, field, maxTokenCount);
}
if (closeAnalyzer) {
@@ -247,7 +250,9 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return new AnalyzeResponse(tokens, detail);
}
- private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
+ private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request,
+ Analyzer analyzer, String field, int maxTokenCount) {
+ TokenCounter tc = new TokenCounter(maxTokenCount);
List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
int lastPosition = -1;
int lastOffset = 0;
@@ -267,7 +272,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), null));
+ tc.increment();
}
stream.end();
lastOffset += offset.endOffset();
@@ -282,7 +287,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return tokens;
}
- private static DetailAnalyzeResponse detailAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
+ private static DetailAnalyzeResponse detailAnalyze(AnalyzeRequest request, Analyzer analyzer, String field, int maxTokenCount) {
DetailAnalyzeResponse detailResponse;
final Set<String> includeAttributes = new HashSet<>();
if (request.attributes() != null) {
@@ -307,7 +312,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
String[][] charFiltersTexts = new String[charFilterFactories != null ? charFilterFactories.length : 0][request.text().length];
TokenListCreator[] tokenFiltersTokenListCreator = new TokenListCreator[tokenFilterFactories != null ? tokenFilterFactories.length : 0];
- TokenListCreator tokenizerTokenListCreator = new TokenListCreator();
+ TokenListCreator tokenizerTokenListCreator = new TokenListCreator(maxTokenCount);
for (int textIndex = 0; textIndex < request.text().length; textIndex++) {
String charFilteredSource = request.text()[textIndex];
@@ -333,7 +338,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
if (tokenFilterFactories != null) {
for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFilterFactories.length; tokenFilterIndex++) {
if (tokenFiltersTokenListCreator[tokenFilterIndex] == null) {
- tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator();
+ tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator(maxTokenCount);
}
TokenStream stream = createStackedTokenStream(request.text()[textIndex],
charFilterFactories, tokenizerFactory, tokenFilterFactories, tokenFilterIndex + 1);
@@ -366,7 +371,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
name = analyzer.getClass().getName();
}
- TokenListCreator tokenListCreator = new TokenListCreator();
+ TokenListCreator tokenListCreator = new TokenListCreator(maxTokenCount);
for (String text : request.text()) {
tokenListCreator.analyze(analyzer.tokenStream(field, text), analyzer, field,
includeAttributes);
@@ -408,13 +413,32 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
return sb.toString();
}
+ private static class TokenCounter{
+ private int tokenCount = 0;
+ private int maxTokenCount;
+ private TokenCounter(int maxTokenCount){
+ this.maxTokenCount = maxTokenCount;
+ }
+ private void increment(){
+ tokenCount++;
+ if (tokenCount > maxTokenCount) {
+ throw new IllegalStateException(
+ "The number of tokens produced by calling _analyze has exceeded the allowed maximum of [" + maxTokenCount + "]."
+ + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+ }
+ }
+ }
private static class TokenListCreator {
int lastPosition = -1;
int lastOffset = 0;
List<AnalyzeResponse.AnalyzeToken> tokens;
+ private TokenCounter tc;
- TokenListCreator() {
+ TokenListCreator(int maxTokenCount) {
tokens = new ArrayList<>();
+ tc = new TokenCounter(maxTokenCount);
}
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
@@ -433,7 +457,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
+ tc.increment();
}
stream.end();
lastOffset += offset.endOffset();

File: IndexScopedSettings.java

@@ -111,6 +111,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
IndexSettings.INDEX_REFRESH_INTERVAL_SETTING,
IndexSettings.MAX_RESULT_WINDOW_SETTING,
IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
+ IndexSettings.MAX_TOKEN_COUNT_SETTING,
IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
IndexSettings.MAX_NGRAM_DIFF_SETTING,

File: IndexSettings.java

@@ -110,6 +110,14 @@ public final class IndexSettings {
public static final Setting<Integer> MAX_SCRIPT_FIELDS_SETTING =
Setting.intSetting("index.max_script_fields", 32, 0, Property.Dynamic, Property.IndexScope);
+ /**
+ * A setting describing the maximum number of tokens that can be
+ * produced using _analyze API. The default maximum of 10000 is defensive
+ * to prevent generating too many token objects.
+ */
+ public static final Setting<Integer> MAX_TOKEN_COUNT_SETTING =
+ Setting.intSetting("index.analyze.max_token_count", 10000, 1, Property.Dynamic, Property.IndexScope);
/**
* Index setting describing for NGramTokenizer and NGramTokenFilter
* the maximum difference between
@@ -262,6 +270,7 @@ public final class IndexSettings {
private volatile int maxRescoreWindow;
private volatile int maxDocvalueFields;
private volatile int maxScriptFields;
+ private volatile int maxTokenCount;
private volatile int maxNgramDiff;
private volatile int maxShingleDiff;
private volatile boolean TTLPurgeDisabled;
@@ -369,6 +378,7 @@ public final class IndexSettings {
maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
+ maxTokenCount = scopedSettings.get(MAX_TOKEN_COUNT_SETTING);
maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);
@@ -403,6 +413,7 @@ public final class IndexSettings {
scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
+ scopedSettings.addSettingsUpdateConsumer(MAX_TOKEN_COUNT_SETTING, this::setMaxTokenCount);
scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
@@ -676,6 +687,18 @@ public final class IndexSettings {
this.maxDocvalueFields = maxDocvalueFields;
}
+ /**
+ * Returns the maximum number of tokens that can be produced
+ */
+ public int getMaxTokenCount() {
+ return maxTokenCount;
+ }
+ private void setMaxTokenCount(int maxTokenCount) {
+ this.maxTokenCount = maxTokenCount;
+ }
/**
* Returns the maximum allowed difference between max and min length of ngram
*/

File: TransportAnalyzeActionTests.java

@@ -61,6 +61,8 @@ public class TransportAnalyzeActionTests extends ESTestCase {
private IndexAnalyzers indexAnalyzers;
private AnalysisRegistry registry;
private Environment environment;
+ private int maxTokenCount;
+ private int idxMaxTokenCount;
@Override
public void setUp() throws Exception {
@@ -73,6 +75,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.put("index.analysis.analyzer.custom_analyzer.filter", "mock")
.put("index.analysis.normalizer.my_normalizer.type", "custom")
.put("index.analyze.max_token_count", 100)
.putList("index.analysis.normalizer.my_normalizer.filter", "lowercase").build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
environment = TestEnvironment.newEnvironment(settings);
@@ -116,6 +119,8 @@ public class TransportAnalyzeActionTests extends ESTestCase {
};
registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
indexAnalyzers = registry.build(idxSettings);
+ maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.getDefault(settings);
+ idxMaxTokenCount = idxSettings.getMaxTokenCount();
}
/**
@@ -126,7 +131,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
AnalyzeRequest request = new AnalyzeRequest();
request.text("the quick brown fox");
request.analyzer("standard");
- AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment);
+ AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment, maxTokenCount);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(4, tokens.size());
@@ -135,7 +140,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.text("the qu1ck brown fox");
request.tokenizer("standard");
request.addTokenFilter("mock");
- analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+ analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment, maxTokenCount);
tokens = analyze.getTokens();
assertEquals(3, tokens.size());
assertEquals("qu1ck", tokens.get(0).getTerm());
@@ -147,7 +152,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.text("the qu1ck brown fox");
request.tokenizer("standard");
request.addCharFilter("append_foo");
- analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+ analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment, maxTokenCount);
tokens = analyze.getTokens();
assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
@@ -161,7 +166,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.tokenizer("standard");
request.addCharFilter("append");
request.text("the qu1ck brown fox");
- analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+ analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment, maxTokenCount);
tokens = analyze.getTokens();
assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
@@ -174,7 +179,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
AnalyzeRequest request = new AnalyzeRequest();
request.analyzer("standard");
request.text("the 1 brown fox");
- AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment);
+ AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment, maxTokenCount);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
@@ -206,7 +211,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
AnalyzeRequest request = new AnalyzeRequest();
request.text("the quick brown fox");
request.analyzer("custom_analyzer");
- AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+ AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
@@ -214,7 +219,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
assertEquals("fox", tokens.get(2).getTerm());
request.analyzer("standard");
- analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+ analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
tokens = analyze.getTokens();
assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
@@ -225,7 +230,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
// Switch the analyzer out for just a tokenizer
request.analyzer(null);
request.tokenizer("standard");
- analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+ analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
tokens = analyze.getTokens();
assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
@@ -235,7 +240,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
// Now try applying our token filter
request.addTokenFilter("mock");
- analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+ analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
tokens = analyze.getTokens();
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
@@ -249,7 +254,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
new AnalyzeRequest()
.analyzer("custom_analyzer")
.text("the qu1ck brown fox-dog"),
"text", null, null, registry, environment));
"text", null, null, registry, environment, maxTokenCount));
assertEquals(e.getMessage(), "failed to find global analyzer [custom_analyzer]");
}
@@ -260,7 +265,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
new AnalyzeRequest()
.analyzer("foobar")
.text("the qu1ck brown fox"),
"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find analyzer [foobar]");
} else {
@@ -272,7 +277,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
new AnalyzeRequest()
.tokenizer("foobar")
.text("the qu1ck brown fox"),
"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find tokenizer under [foobar]");
} else {
@@ -285,7 +290,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
.tokenizer("whitespace")
.addTokenFilter("foobar")
.text("the qu1ck brown fox"),
"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find token filter under [foobar]");
} else {
@@ -299,7 +304,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
.addTokenFilter("lowercase")
.addCharFilter("foobar")
.text("the qu1ck brown fox"),
"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
if (notGlobal) {
assertEquals(e.getMessage(), "failed to find char filter under [foobar]");
} else {
@@ -311,7 +316,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
new AnalyzeRequest()
.normalizer("foobar")
.text("the qu1ck brown fox"),
"text", null, indexAnalyzers, registry, environment));
"text", null, indexAnalyzers, registry, environment, maxTokenCount));
assertEquals(e.getMessage(), "failed to find normalizer under [foobar]");
}
@@ -320,7 +325,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
request.tokenizer("whitespace");
request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
request.text("the quick brown fox");
- AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+ AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
@@ -332,10 +337,68 @@ public class TransportAnalyzeActionTests extends ESTestCase {
AnalyzeRequest request = new AnalyzeRequest("index");
request.normalizer("my_normalizer");
request.text("ABc");
- AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+ AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(1, tokens.size());
assertEquals("abc", tokens.get(0).getTerm());
}
+ /**
+ * This test is the equivalent of calling _analyze without a specific index.
+ * The default value for the maximum token count is used.
+ */
+ public void testExceedDefaultMaxTokenLimit() throws IOException {
+ // create a string whose number of words is greater than maxTokenCount
+ StringBuilder sbText = new StringBuilder();
+ for (int i = 0; i <= maxTokenCount; i++){
+ sbText.append('a');
+ sbText.append(' ');
+ }
+ String text = sbText.toString();
+ // request with explain=false to test simpleAnalyze path in TransportAnalyzeAction
+ AnalyzeRequest request = new AnalyzeRequest();
+ request.text(text);
+ request.analyzer("standard");
+ IllegalStateException e = expectThrows(IllegalStateException.class,
+ () -> TransportAnalyzeAction.analyze(
+ request, "text", null, null, registry, environment, maxTokenCount));
+ assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ + maxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+ // request with explain=true to test detailAnalyze path in TransportAnalyzeAction
+ AnalyzeRequest request2 = new AnalyzeRequest();
+ request2.text(text);
+ request2.analyzer("standard");
+ request2.explain(true);
+ IllegalStateException e2 = expectThrows(IllegalStateException.class,
+ () -> TransportAnalyzeAction.analyze(
+ request2, "text", null, null, registry, environment, maxTokenCount));
+ assertEquals(e2.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ + maxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+ }
+ /**
+ * This test is the equivalent of calling _analyze against a specific index.
+ * The index-specific value for the maximum token count is used.
+ */
+ public void testExceedSetMaxTokenLimit() throws IOException {
+ // create a string whose number of words is greater than idxMaxTokenCount
+ StringBuilder sbText = new StringBuilder();
+ for (int i = 0; i <= idxMaxTokenCount; i++){
+ sbText.append('a');
+ sbText.append(' ');
+ }
+ String text = sbText.toString();
+ AnalyzeRequest request = new AnalyzeRequest();
+ request.text(text);
+ request.analyzer("standard");
+ IllegalStateException e = expectThrows(IllegalStateException.class,
+ () -> TransportAnalyzeAction.analyze(
+ request, "text", null, indexAnalyzers, registry, environment, idxMaxTokenCount));
+ assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
+ + idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+ }
}

File: index modules reference docs (asciidoc)

@@ -193,6 +193,11 @@ specific index module:
Maximum number of refresh listeners available on each shard of the index.
These listeners are used to implement <<docs-refresh,`refresh=wait_for`>>.
+ `index.analyze.max_token_count`::
+ The maximum number of tokens that can be produced using the `_analyze` API.
+ Defaults to `10000`.
[float]
=== Settings in other index modules

File: _analyze API docs (asciidoc)

@@ -207,3 +207,39 @@ The request returns the following result:
--------------------------------------------------
// TESTRESPONSE
<1> Output only the "keyword" attribute, since "attributes" is specified in the request.
+ [[tokens-limit-settings]]
+ [float]
+ == Settings to prevent token explosion
+ Generating an excessive number of tokens may cause a node to run out of memory.
+ The following setting allows you to limit the number of tokens that can be produced:
+ `index.analyze.max_token_count`::
+ The maximum number of tokens that can be produced using the `_analyze` API.
+ The default value is `10000`. If more tokens than this limit are
+ generated, an error is thrown. The `_analyze` endpoint without a specified
+ index always uses `10000` as the limit. This setting allows you to control
+ the limit for a specific index:
+ [source,js]
+ --------------------------------------------------
+ PUT analyze_sample
+ {
+ "settings" : {
+ "index.analyze.max_token_count" : 20000
+ }
+ }
+ --------------------------------------------------
+ // CONSOLE
+ [source,js]
+ --------------------------------------------------
+ GET analyze_sample/_analyze
+ {
+ "text" : "this is a test"
+ }
+ --------------------------------------------------
+ // CONSOLE
+ // TEST[setup:analyze_sample]
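
When no index is specified, the `10000` default always applies; a minimal sketch of
that path (analyzer and text are illustrative):

[source,js]
--------------------------------------------------
GET _analyze
{
  "analyzer" : "standard",
  "text" : "this is a test"
}
--------------------------------------------------
// CONSOLE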

File: 7.0 migration notes, analysis section (asciidoc)

@@ -6,3 +6,10 @@
The `delimited_payload_filter` is renamed to `delimited_payload`, the old name is
deprecated and will be removed at some point, so it should be replaced by
`delimited_payload`.
+ ==== Limiting the number of tokens produced by _analyze
+ To safeguard against out-of-memory errors, the number of tokens that can be produced
+ using the `_analyze` endpoint has been limited to 10000. This default limit can be changed
+ for a particular index with the index setting `index.analyze.max_token_count`.
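
Because the setting is declared with `Property.Dynamic` (see the `IndexSettings` change
above), it should also be adjustable on a live index through the update-settings API; a
sketch, assuming an existing index named `analyze_sample`:

[source,js]
--------------------------------------------------
PUT analyze_sample/_settings
{
  "index.analyze.max_token_count" : 20000
}
--------------------------------------------------
// CONSOLE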

File: rest-api-spec test for indices.analyze (new YAML file)

@@ -0,0 +1,52 @@
+ ---
+ setup:
+   - do:
+       indices.create:
+         index: test_1
+         body:
+           settings:
+             index.analyze.max_token_count: 3
+ ---
+ "_analyze with No. generated tokens less than or equal to index.analyze.max_token_count should succeed":
+   - skip:
+       version: " - 6.99.99"
+       reason: index.analyze.max_token_count setting has been added in 7.0.0
+   - do:
+       indices.analyze:
+         index: test_1
+         body:
+           text: This should succeed
+           analyzer: standard
+   - length: { tokens: 3 }
+   - match: { tokens.0.token: this }
+   - match: { tokens.1.token: should }
+   - match: { tokens.2.token: succeed }
+ ---
+ "_analyze with No. generated tokens more than index.analyze.max_token_count should fail":
+   - skip:
+       version: " - 6.99.99"
+       reason: index.analyze.max_token_count setting has been added in 7.0.0
+   - do:
+       catch: /The number of tokens produced by calling _analyze has exceeded the allowed maximum of \[3\]. This limit can be set by changing the \[index.analyze.max_token_count\] index level setting\./
+       indices.analyze:
+         index: test_1
+         body:
+           text: This should fail as it exceeds limit
+           analyzer: standard
+ ---
+ "_analyze with explain with No. generated tokens more than index.analyze.max_token_count should fail":
+   - skip:
+       version: " - 6.99.99"
+       reason: index.analyze.max_token_count setting has been added in 7.0.0
+   - do:
+       catch: /The number of tokens produced by calling _analyze has exceeded the allowed maximum of \[3\]. This limit can be set by changing the \[index.analyze.max_token_count\] index level setting\./
+       indices.analyze:
+         index: test_1
+         body:
+           text: This should fail as it exceeds limit
+           analyzer: standard
+           explain: true