Limit the number of tokens produced by _analyze (#27529)
Add an index-level setting, `index.analyze.max_token_count`, to limit the number of tokens that the _analyze endpoint may generate. It defaults to 10000; if a request produces more tokens than this limit, an error is thrown. Closes #27038
parent 92a24de509
commit c6b73239ae
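For illustration, here is a minimal sketch of how the new limit behaves (the index name `my_index` and the limit of 5 are arbitrary values chosen for this example, not part of the commit):

PUT my_index
{
  "settings": {
    "index.analyze.max_token_count": 5
  }
}

GET my_index/_analyze
{
  "analyzer": "standard",
  "text": "one two three four five six"
}

The second request produces six tokens, one more than the configured limit, so it is rejected with an error stating that the number of tokens produced by calling _analyze has exceeded the allowed maximum of [5]; calling _analyze without an index keeps the default limit of 10000. Because the setting is registered as dynamic, it can also be changed on an existing index through the update index settings API.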
@@ -158,15 +158,18 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 }
 }
 final AnalysisRegistry analysisRegistry = indicesService.getAnalysis();
-return analyze(request, field, analyzer, indexService != null ? indexService.getIndexAnalyzers() : null, analysisRegistry, environment);
+final int maxTokenCount = indexService == null ?
+IndexSettings.MAX_TOKEN_COUNT_SETTING.get(settings) : indexService.getIndexSettings().getMaxTokenCount();
+return analyze(request, field, analyzer, indexService != null ? indexService.getIndexAnalyzers() : null,
+analysisRegistry, environment, maxTokenCount);
 } catch (IOException e) {
 throw new ElasticsearchException("analysis failed", e);
 }
 }
-public static AnalyzeResponse analyze(AnalyzeRequest request, String field, Analyzer analyzer, IndexAnalyzers indexAnalyzers, AnalysisRegistry analysisRegistry, Environment environment) throws IOException {
+public static AnalyzeResponse analyze(AnalyzeRequest request, String field, Analyzer analyzer, IndexAnalyzers indexAnalyzers,
+AnalysisRegistry analysisRegistry, Environment environment, int maxTokenCount) throws IOException {
 boolean closeAnalyzer = false;
 if (analyzer == null && request.analyzer() != null) {
 if (indexAnalyzers == null) {

@@ -235,9 +238,9 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 DetailAnalyzeResponse detail = null;
 if (request.explain()) {
-detail = detailAnalyze(request, analyzer, field);
+detail = detailAnalyze(request, analyzer, field, maxTokenCount);
 } else {
-tokens = simpleAnalyze(request, analyzer, field);
+tokens = simpleAnalyze(request, analyzer, field, maxTokenCount);
 }
 if (closeAnalyzer) {

@@ -247,7 +250,9 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 return new AnalyzeResponse(tokens, detail);
 }
-private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
+private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request,
+Analyzer analyzer, String field, int maxTokenCount) {
+TokenCounter tc = new TokenCounter(maxTokenCount);
 List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
 int lastPosition = -1;
 int lastOffset = 0;
@@ -267,7 +272,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 }
 tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
 lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), null));
+tc.increment();
 }
 stream.end();
 lastOffset += offset.endOffset();

@@ -282,7 +287,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 return tokens;
 }
-private static DetailAnalyzeResponse detailAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
+private static DetailAnalyzeResponse detailAnalyze(AnalyzeRequest request, Analyzer analyzer, String field, int maxTokenCount) {
 DetailAnalyzeResponse detailResponse;
 final Set<String> includeAttributes = new HashSet<>();
 if (request.attributes() != null) {

@@ -307,7 +312,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 String[][] charFiltersTexts = new String[charFilterFactories != null ? charFilterFactories.length : 0][request.text().length];
 TokenListCreator[] tokenFiltersTokenListCreator = new TokenListCreator[tokenFilterFactories != null ? tokenFilterFactories.length : 0];
-TokenListCreator tokenizerTokenListCreator = new TokenListCreator();
+TokenListCreator tokenizerTokenListCreator = new TokenListCreator(maxTokenCount);
 for (int textIndex = 0; textIndex < request.text().length; textIndex++) {
 String charFilteredSource = request.text()[textIndex];

@@ -333,7 +338,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 if (tokenFilterFactories != null) {
 for (int tokenFilterIndex = 0; tokenFilterIndex < tokenFilterFactories.length; tokenFilterIndex++) {
 if (tokenFiltersTokenListCreator[tokenFilterIndex] == null) {
-tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator();
+tokenFiltersTokenListCreator[tokenFilterIndex] = new TokenListCreator(maxTokenCount);
 }
 TokenStream stream = createStackedTokenStream(request.text()[textIndex],
 charFilterFactories, tokenizerFactory, tokenFilterFactories, tokenFilterIndex + 1);

@@ -366,7 +371,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 name = analyzer.getClass().getName();
 }
-TokenListCreator tokenListCreator = new TokenListCreator();
+TokenListCreator tokenListCreator = new TokenListCreator(maxTokenCount);
 for (String text : request.text()) {
 tokenListCreator.analyze(analyzer.tokenStream(field, text), analyzer, field,
 includeAttributes);
@@ -408,13 +413,32 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 return sb.toString();
 }
+private static class TokenCounter{
+private int tokenCount = 0;
+private int maxTokenCount;
+private TokenCounter(int maxTokenCount){
+this.maxTokenCount = maxTokenCount;
+}
+private void increment(){
+tokenCount++;
+if (tokenCount > maxTokenCount) {
+throw new IllegalStateException(
+"The number of tokens produced by calling _analyze has exceeded the allowed maximum of [" + maxTokenCount + "]."
++ " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+}
+}
+}
 private static class TokenListCreator {
 int lastPosition = -1;
 int lastOffset = 0;
 List<AnalyzeResponse.AnalyzeToken> tokens;
+private TokenCounter tc;
-TokenListCreator() {
+TokenListCreator(int maxTokenCount) {
 tokens = new ArrayList<>();
+tc = new TokenCounter(maxTokenCount);
 }
 private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {

@@ -433,7 +457,7 @@ public class TransportAnalyzeAction extends TransportSingleShardAction<AnalyzeRe
 }
 tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(),
 lastOffset + offset.endOffset(), posLen.getPositionLength(), type.type(), extractExtendedAttributes(stream, includeAttributes)));
+tc.increment();
 }
 stream.end();
 lastOffset += offset.endOffset();
@@ -111,6 +111,7 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
 IndexSettings.INDEX_REFRESH_INTERVAL_SETTING,
 IndexSettings.MAX_RESULT_WINDOW_SETTING,
 IndexSettings.MAX_INNER_RESULT_WINDOW_SETTING,
+IndexSettings.MAX_TOKEN_COUNT_SETTING,
 IndexSettings.MAX_DOCVALUE_FIELDS_SEARCH_SETTING,
 IndexSettings.MAX_SCRIPT_FIELDS_SETTING,
 IndexSettings.MAX_NGRAM_DIFF_SETTING,
@@ -110,6 +110,14 @@ public final class IndexSettings {
 public static final Setting<Integer> MAX_SCRIPT_FIELDS_SETTING =
 Setting.intSetting("index.max_script_fields", 32, 0, Property.Dynamic, Property.IndexScope);
+/**
+* A setting describing the maximum number of tokens that can be
+* produced using _analyze API. The default maximum of 10000 is defensive
+* to prevent generating too many token objects.
+*/
+public static final Setting<Integer> MAX_TOKEN_COUNT_SETTING =
+Setting.intSetting("index.analyze.max_token_count", 10000, 1, Property.Dynamic, Property.IndexScope);
 /**
 * Index setting describing for NGramTokenizer and NGramTokenFilter
 * the maximum difference between

@@ -262,6 +270,7 @@ public final class IndexSettings {
 private volatile int maxRescoreWindow;
 private volatile int maxDocvalueFields;
 private volatile int maxScriptFields;
+private volatile int maxTokenCount;
 private volatile int maxNgramDiff;
 private volatile int maxShingleDiff;
 private volatile boolean TTLPurgeDisabled;

@@ -369,6 +378,7 @@ public final class IndexSettings {
 maxRescoreWindow = scopedSettings.get(MAX_RESCORE_WINDOW_SETTING);
 maxDocvalueFields = scopedSettings.get(MAX_DOCVALUE_FIELDS_SEARCH_SETTING);
 maxScriptFields = scopedSettings.get(MAX_SCRIPT_FIELDS_SETTING);
+maxTokenCount = scopedSettings.get(MAX_TOKEN_COUNT_SETTING);
 maxNgramDiff = scopedSettings.get(MAX_NGRAM_DIFF_SETTING);
 maxShingleDiff = scopedSettings.get(MAX_SHINGLE_DIFF_SETTING);
 TTLPurgeDisabled = scopedSettings.get(INDEX_TTL_DISABLE_PURGE_SETTING);

@@ -403,6 +413,7 @@ public final class IndexSettings {
 scopedSettings.addSettingsUpdateConsumer(MAX_RESCORE_WINDOW_SETTING, this::setMaxRescoreWindow);
 scopedSettings.addSettingsUpdateConsumer(MAX_DOCVALUE_FIELDS_SEARCH_SETTING, this::setMaxDocvalueFields);
 scopedSettings.addSettingsUpdateConsumer(MAX_SCRIPT_FIELDS_SETTING, this::setMaxScriptFields);
+scopedSettings.addSettingsUpdateConsumer(MAX_TOKEN_COUNT_SETTING, this::setMaxTokenCount);
 scopedSettings.addSettingsUpdateConsumer(MAX_NGRAM_DIFF_SETTING, this::setMaxNgramDiff);
 scopedSettings.addSettingsUpdateConsumer(MAX_SHINGLE_DIFF_SETTING, this::setMaxShingleDiff);
 scopedSettings.addSettingsUpdateConsumer(INDEX_WARMER_ENABLED_SETTING, this::setEnableWarmer);
@@ -676,6 +687,18 @@ public final class IndexSettings {
 this.maxDocvalueFields = maxDocvalueFields;
 }
+/**
+* Returns the maximum number of tokens that can be produced
+*/
+public int getMaxTokenCount() {
+return maxTokenCount;
+}
+private void setMaxTokenCount(int maxTokenCount) {
+this.maxTokenCount = maxTokenCount;
+}
 /**
 * Returns the maximum allowed difference between max and min length of ngram
 */
@@ -61,6 +61,8 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 private IndexAnalyzers indexAnalyzers;
 private AnalysisRegistry registry;
 private Environment environment;
+private int maxTokenCount;
+private int idxMaxTokenCount;
 @Override
 public void setUp() throws Exception {

@@ -73,6 +75,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
 .put("index.analysis.analyzer.custom_analyzer.filter", "mock")
 .put("index.analysis.normalizer.my_normalizer.type", "custom")
+.put("index.analyze.max_token_count", 100)
 .putList("index.analysis.normalizer.my_normalizer.filter", "lowercase").build();
 IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
 environment = TestEnvironment.newEnvironment(settings);

@@ -116,6 +119,8 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 };
 registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
 indexAnalyzers = registry.build(idxSettings);
+maxTokenCount = IndexSettings.MAX_TOKEN_COUNT_SETTING.getDefault(settings);
+idxMaxTokenCount = idxSettings.getMaxTokenCount();
 }
 /**
@@ -126,7 +131,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 AnalyzeRequest request = new AnalyzeRequest();
 request.text("the quick brown fox");
 request.analyzer("standard");
-AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment);
+AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment, maxTokenCount);
 List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
 assertEquals(4, tokens.size());

@@ -135,7 +140,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 request.text("the qu1ck brown fox");
 request.tokenizer("standard");
 request.addTokenFilter("mock");
-analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment, maxTokenCount);
 tokens = analyze.getTokens();
 assertEquals(3, tokens.size());
 assertEquals("qu1ck", tokens.get(0).getTerm());

@@ -147,7 +152,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 request.text("the qu1ck brown fox");
 request.tokenizer("standard");
 request.addCharFilter("append_foo");
-analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment, maxTokenCount);
 tokens = analyze.getTokens();
 assertEquals(4, tokens.size());
 assertEquals("the", tokens.get(0).getTerm());

@@ -161,7 +166,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 request.tokenizer("standard");
 request.addCharFilter("append");
 request.text("the qu1ck brown fox");
-analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment);
+analyze = TransportAnalyzeAction.analyze(request, "text", null, randomBoolean() ? indexAnalyzers : null, registry, environment, maxTokenCount);
 tokens = analyze.getTokens();
 assertEquals(4, tokens.size());
 assertEquals("the", tokens.get(0).getTerm());

@@ -174,7 +179,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 AnalyzeRequest request = new AnalyzeRequest();
 request.analyzer("standard");
 request.text("the 1 brown fox");
-AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment);
+AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, null, registry, environment, maxTokenCount);
 List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
 assertEquals(4, tokens.size());
 assertEquals("the", tokens.get(0).getTerm());
@@ -206,7 +211,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 AnalyzeRequest request = new AnalyzeRequest();
 request.text("the quick brown fox");
 request.analyzer("custom_analyzer");
-AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
 List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
 assertEquals(3, tokens.size());
 assertEquals("quick", tokens.get(0).getTerm());

@@ -214,7 +219,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 assertEquals("fox", tokens.get(2).getTerm());
 request.analyzer("standard");
-analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
 tokens = analyze.getTokens();
 assertEquals(4, tokens.size());
 assertEquals("the", tokens.get(0).getTerm());

@@ -225,7 +230,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 // Switch the analyzer out for just a tokenizer
 request.analyzer(null);
 request.tokenizer("standard");
-analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
 tokens = analyze.getTokens();
 assertEquals(4, tokens.size());
 assertEquals("the", tokens.get(0).getTerm());

@@ -235,7 +240,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 // Now try applying our token filter
 request.addTokenFilter("mock");
-analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
 tokens = analyze.getTokens();
 assertEquals(3, tokens.size());
 assertEquals("quick", tokens.get(0).getTerm());
@@ -249,7 +254,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 new AnalyzeRequest()
 .analyzer("custom_analyzer")
 .text("the qu1ck brown fox-dog"),
-"text", null, null, registry, environment));
+"text", null, null, registry, environment, maxTokenCount));
 assertEquals(e.getMessage(), "failed to find global analyzer [custom_analyzer]");
 }

@@ -260,7 +265,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 new AnalyzeRequest()
 .analyzer("foobar")
 .text("the qu1ck brown fox"),
-"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
+"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
 if (notGlobal) {
 assertEquals(e.getMessage(), "failed to find analyzer [foobar]");
 } else {

@@ -272,7 +277,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 new AnalyzeRequest()
 .tokenizer("foobar")
 .text("the qu1ck brown fox"),
-"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
+"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
 if (notGlobal) {
 assertEquals(e.getMessage(), "failed to find tokenizer under [foobar]");
 } else {

@@ -285,7 +290,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 .tokenizer("whitespace")
 .addTokenFilter("foobar")
 .text("the qu1ck brown fox"),
-"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
+"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
 if (notGlobal) {
 assertEquals(e.getMessage(), "failed to find token filter under [foobar]");
 } else {

@@ -299,7 +304,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 .addTokenFilter("lowercase")
 .addCharFilter("foobar")
 .text("the qu1ck brown fox"),
-"text", null, notGlobal ? indexAnalyzers : null, registry, environment));
+"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
 if (notGlobal) {
 assertEquals(e.getMessage(), "failed to find char filter under [foobar]");
 } else {
@@ -311,7 +316,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 new AnalyzeRequest()
 .normalizer("foobar")
 .text("the qu1ck brown fox"),
-"text", null, indexAnalyzers, registry, environment));
+"text", null, indexAnalyzers, registry, environment, maxTokenCount));
 assertEquals(e.getMessage(), "failed to find normalizer under [foobar]");
 }

@@ -320,7 +325,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 request.tokenizer("whitespace");
 request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
 request.text("the quick brown fox");
-AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
 List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
 assertEquals(3, tokens.size());
 assertEquals("quick", tokens.get(0).getTerm());
@@ -332,10 +337,68 @@ public class TransportAnalyzeActionTests extends ESTestCase {
 AnalyzeRequest request = new AnalyzeRequest("index");
 request.normalizer("my_normalizer");
 request.text("ABc");
-AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment);
+AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
 List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
 assertEquals(1, tokens.size());
 assertEquals("abc", tokens.get(0).getTerm());
 }
+/**
+* This test is equivalent of calling _analyze without a specific index.
+* The default value for the maximum token count is used.
+*/
+public void testExceedDefaultMaxTokenLimit() throws IOException{
+// create a string with No. words more than maxTokenCount
+StringBuilder sbText = new StringBuilder();
+for (int i = 0; i <= maxTokenCount; i++){
+sbText.append('a');
+sbText.append(' ');
+}
+String text = sbText.toString();
+// request with explain=false to test simpleAnalyze path in TransportAnalyzeAction
+AnalyzeRequest request = new AnalyzeRequest();
+request.text(text);
+request.analyzer("standard");
+IllegalStateException e = expectThrows(IllegalStateException.class,
+() -> TransportAnalyzeAction.analyze(
+request, "text", null, null, registry, environment, maxTokenCount));
+assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
++ maxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+// request with explain=true to test detailAnalyze path in TransportAnalyzeAction
+AnalyzeRequest request2 = new AnalyzeRequest();
+request2.text(text);
+request2.analyzer("standard");
+request2.explain(true);
+IllegalStateException e2 = expectThrows(IllegalStateException.class,
+() -> TransportAnalyzeAction.analyze(
+request2, "text", null, null, registry, environment, maxTokenCount));
+assertEquals(e2.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
++ maxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+}
+/**
+* This test is equivalent of calling _analyze against a specific index.
+* The index specific value for the maximum token count is used.
+*/
+public void testExceedSetMaxTokenLimit() throws IOException{
+// create a string with No. words more than idxMaxTokenCount
+StringBuilder sbText = new StringBuilder();
+for (int i = 0; i <= idxMaxTokenCount; i++){
+sbText.append('a');
+sbText.append(' ');
+}
+String text = sbText.toString();
+AnalyzeRequest request = new AnalyzeRequest();
+request.text(text);
+request.analyzer("standard");
+IllegalStateException e = expectThrows(IllegalStateException.class,
+() -> TransportAnalyzeAction.analyze(
+request, "text", null, indexAnalyzers, registry, environment, idxMaxTokenCount));
+assertEquals(e.getMessage(), "The number of tokens produced by calling _analyze has exceeded the allowed maximum of ["
++ idxMaxTokenCount + "]." + " This limit can be set by changing the [index.analyze.max_token_count] index level setting.");
+}
 }
@@ -193,6 +193,11 @@ specific index module:
 Maximum number of refresh listeners available on each shard of the index.
 These listeners are used to implement <<docs-refresh,`refresh=wait_for`>>.
 
+`index.analyze.max_token_count`::
+
+The maximum number of tokens that can be produced using _analyze API.
+Defaults to `10000`.
+
 [float]
 === Settings in other index modules
@@ -207,3 +207,39 @@ The request returns the following result:
 --------------------------------------------------
 // TESTRESPONSE
 <1> Output only "keyword" attribute, since specify "attributes" in the request.
+
+[[tokens-limit-settings]]
+[float]
+== Settings to prevent tokens explosion
+Generating excessive amount of tokens may cause a node to run out of memory.
+The following setting allows to limit the number of tokens that can be produced:
+
+`index.analyze.max_token_count`::
+The maximum number of tokens that can be produced using `_analyze` API.
+The default value is `10000`. If more than this limit of tokens gets
+generated, an error will be thrown. The `_analyze` endpoint without a specified
+index will always use `10000` value as a limit. This setting allows you to control
+the limit for a specific index:
+
+[source,js]
+--------------------------------------------------
+PUT analyze_sample
+{
+  "settings" : {
+    "index.analyze.max_token_count" : 20000
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[source,js]
+--------------------------------------------------
+GET analyze_sample/_analyze
+{
+  "text" : "this is a test"
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[setup:analyze_sample]
@@ -6,3 +6,10 @@
 The `delimited_payload_filter` is renamed to `delimited_payload`, the old name is
 deprecated and will be removed at some point, so it should be replaced by
 `delimited_payload`.
+
+==== Limiting the number of tokens produced by _analyze
+
+To safeguard against out of memory errors, the number of tokens that can be produced
+using the `_analyze` endpoint has been limited to 10000. This default limit can be changed
+for a particular index with the index setting `index.analyze.max_token_count`.
@@ -0,0 +1,52 @@
+---
+setup:
+  - do:
+      indices.create:
+        index: test_1
+        body:
+          settings:
+            index.analyze.max_token_count: 3
+
+---
+"_analyze with No. generated tokens less than or equal to index.analyze.max_token_count should succeed":
+  - skip:
+      version: " - 6.99.99"
+      reason: index.analyze.max_token_count setting has been added in 7.0.0
+  - do:
+      indices.analyze:
+        index: test_1
+        body:
+          text: This should succeed
+          analyzer: standard
+  - length: { tokens: 3 }
+  - match: { tokens.0.token: this }
+  - match: { tokens.1.token: should }
+  - match: { tokens.2.token: succeed }
+
+---
+"_analyze with No. generated tokens more than index.analyze.max_token_count should fail":
+  - skip:
+      version: " - 6.99.99"
+      reason: index.analyze.max_token_count setting has been added in 7.0.0
+  - do:
+      catch: /The number of tokens produced by calling _analyze has exceeded the allowed maximum of \[3\]. This limit can be set by changing the \[index.analyze.max_token_count\] index level setting\./
+      indices.analyze:
+        index: test_1
+        body:
+          text: This should fail as it exceeds limit
+          analyzer: standard
+
+---
+"_analyze with explain with No. generated tokens more than index.analyze.max_token_count should fail":
+  - skip:
+      version: " - 6.99.99"
+      reason: index.analyze.max_token_count setting has been added in 7.0.0
+  - do:
+      catch: /The number of tokens produced by calling _analyze has exceeded the allowed maximum of \[3\]. This limit can be set by changing the \[index.analyze.max_token_count\] index level setting\./
+      indices.analyze:
+        index: test_1
+        body:
+          text: This should fail as it exceeds limit
+          analyzer: standard
+          explain: true