From eb2fed85f1f9e7105b576a0c379878c7bb4b40ee Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 2 Sep 2013 15:55:47 +0200 Subject: [PATCH] Add 'min_input_len' to completion suggester Restrict the size of the input length to a reasonable size otherwise very long strings can cause StackOverflowExceptions deep down in lucene land. Yet, this is simply a saftly limit set to `50` UTF-16 codepoints by default. This limit is only present at index time and not at query time. If prefix completions > 50 UTF-16 codepoints are expected / desired this limit should be raised. Critical string sizes are beyone the 1k UTF-16 Codepoints limit. Closes #3596 --- .../suggesters/completion-suggest.asciidoc | 10 +- .../mapper/core/CompletionFieldMapper.java | 92 +++++++++++++++---- .../hamcrest/ElasticsearchAssertions.java | 11 +++ .../suggest/CompletionPostingsFormatTest.java | 6 +- .../suggest/CompletionSuggestSearchTests.java | 58 ++++++++++++ .../CompletionFieldMapperTests.java | 4 +- 6 files changed, 160 insertions(+), 21 deletions(-) diff --git a/docs/reference/search/suggesters/completion-suggest.asciidoc b/docs/reference/search/suggesters/completion-suggest.asciidoc index 98dd1768732..32d70f50c22 100644 --- a/docs/reference/search/suggesters/completion-suggest.asciidoc +++ b/docs/reference/search/suggesters/completion-suggest.asciidoc @@ -65,7 +65,7 @@ Mapping supports the following parameters: `payloads`:: Enables the storing of payloads, defaults to `false` -`preserve_separators`: +`preserve_separators`:: Preserves the separators, defaults to `true`. If disabled, you could find a field starting with `Foo Fighters`, if you suggest for `foof`. @@ -78,6 +78,14 @@ Mapping supports the following parameters: `The Beatles`, no need to change a simple analyzer, if you are able to enrich your data. +`max_input_len`:: + Limits the length of a single input, defaults to `50` UTF-16 code points. + This limit is only used at index time to reduce the total number of + characters per input string in order to prevent massive inputs from + bloating the underlying datastructure. The most usecases won't be influenced + by the default value since prefix completions hardly grow beyond prefixes longer + than a handful of characters. + ==== Indexing [source,js] diff --git a/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java index d89956e0aa8..e4e4a36ae18 100644 --- a/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/core/CompletionFieldMapper.java @@ -31,16 +31,14 @@ import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; import org.elasticsearch.index.fielddata.FieldDataType; -import org.elasticsearch.index.mapper.Mapper; -import org.elasticsearch.index.mapper.MapperException; -import org.elasticsearch.index.mapper.MapperParsingException; -import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.*; import org.elasticsearch.index.similarity.SimilarityProvider; import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider; import org.elasticsearch.search.suggest.completion.CompletionPostingsFormatProvider; import org.elasticsearch.search.suggest.completion.CompletionTokenStream; import java.io.IOException; +import java.io.Reader; import java.util.List; import java.util.Map; @@ -62,6 +60,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper { public static final boolean DEFAULT_PRESERVE_SEPARATORS = true; public static final boolean DEFAULT_POSITION_INCREMENTS = true; public static final boolean DEFAULT_HAS_PAYLOADS = false; + public static final int DEFAULT_MAX_INPUT_LENGTH = 50; } public static class Fields { @@ -72,6 +71,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper { public static final String PRESERVE_POSITION_INCREMENTS = "preserve_position_increments"; public static final String PAYLOADS = "payloads"; public static final String TYPE = "type"; + public static final String MAX_INPUT_LENGTH = "max_input_len"; } public static class Builder extends AbstractFieldMapper.OpenBuilder { @@ -81,6 +81,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper { private boolean preserveSeparators = Defaults.DEFAULT_PRESERVE_SEPARATORS; private boolean payloads = Defaults.DEFAULT_HAS_PAYLOADS; private boolean preservePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS; + private int maxInputLength = Defaults.DEFAULT_MAX_INPUT_LENGTH; public Builder(String name) { super(name, Defaults.FIELD_TYPE); @@ -110,10 +111,19 @@ public class CompletionFieldMapper extends AbstractFieldMapper { this.preservePositionIncrements = preservePositionIncrements; return this; } + + public Builder maxInputLength(int maxInputLength) { + if (maxInputLength <= 0) { + throw new ElasticSearchIllegalArgumentException(Fields.MAX_INPUT_LENGTH + " must be > 0 but was [" + maxInputLength + "]"); + } + this.maxInputLength = maxInputLength; + return this; + } @Override public CompletionFieldMapper build(Mapper.BuilderContext context) { - return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads, preserveSeparators, preservePositionIncrements); + return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads, + preserveSeparators, preservePositionIncrements, maxInputLength); } } @@ -129,18 +139,23 @@ public class CompletionFieldMapper extends AbstractFieldMapper { continue; } if (fieldName.equals("analyzer")) { - builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString())); - builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString())); + NamedAnalyzer analyzer = getNamedAnalyzer(parserContext, fieldNode.toString()); + builder.indexAnalyzer(analyzer); + builder.searchAnalyzer(analyzer); } else if (fieldName.equals(Fields.INDEX_ANALYZER) || fieldName.equals("indexAnalyzer")) { - builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString())); + builder.indexAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString())); } else if (fieldName.equals(Fields.SEARCH_ANALYZER) || fieldName.equals("searchAnalyzer")) { - builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString())); + builder.searchAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString())); } else if (fieldName.equals(Fields.PAYLOADS)) { builder.payloads(Boolean.parseBoolean(fieldNode.toString())); } else if (fieldName.equals(Fields.PRESERVE_SEPARATORS) || fieldName.equals("preserveSeparators")) { builder.preserveSeparators(Boolean.parseBoolean(fieldNode.toString())); } else if (fieldName.equals(Fields.PRESERVE_POSITION_INCREMENTS) || fieldName.equals("preservePositionIncrements")) { builder.preservePositionIncrements(Boolean.parseBoolean(fieldNode.toString())); + } else if (fieldName.equals(Fields.MAX_INPUT_LENGTH) || fieldName.equals("maxInputLen")) { + builder.maxInputLength(Integer.parseInt(fieldNode.toString())); + } else { + throw new MapperParsingException("Unknown field [" + fieldName + "]"); } } @@ -155,6 +170,14 @@ public class CompletionFieldMapper extends AbstractFieldMapper { builder.postingsFormat(parserContext.postingFormatService().get("default")); return builder; } + + private NamedAnalyzer getNamedAnalyzer(ParserContext parserContext, String name) { + NamedAnalyzer analyzer = parserContext.analysisService().analyzer(name); + if (analyzer == null) { + throw new ElasticSearchIllegalArgumentException("Can't find default or mapped analyzer with name [" + name +"]"); + } + return analyzer; + } } private static final BytesRef EMPTY = new BytesRef(); @@ -164,15 +187,17 @@ public class CompletionFieldMapper extends AbstractFieldMapper { private final boolean payloads; private final boolean preservePositionIncrements; private final boolean preserveSeparators; + private int maxInputLength; public CompletionFieldMapper(Names names, NamedAnalyzer indexAnalyzer, NamedAnalyzer searchAnalyzer, PostingsFormatProvider provider, SimilarityProvider similarity, boolean payloads, - boolean preserveSeparators, boolean preservePositionIncrements) { + boolean preserveSeparators, boolean preservePositionIncrements, int maxInputLength) { super(names, 1.0f, Defaults.FIELD_TYPE, indexAnalyzer, searchAnalyzer, provider, similarity, null); analyzingSuggestLookupProvider = new AnalyzingCompletionLookupProvider(preserveSeparators, false, preservePositionIncrements, payloads); this.completionPostingsFormatProvider = new CompletionPostingsFormatProvider("completion", provider, analyzingSuggestLookupProvider); this.preserveSeparators = preserveSeparators; this.payloads = payloads; this.preservePositionIncrements = preservePositionIncrements; + this.maxInputLength = maxInputLength; } @@ -251,8 +276,20 @@ public class CompletionFieldMapper extends AbstractFieldMapper { } public Field getCompletionField(String input, BytesRef payload) { + if (input.length() > maxInputLength) { + final int len = correctSubStringLen(input, Math.min(maxInputLength, input.length())); + input = input.substring(0, len); + } return new SuggestField(names().fullName(), input, this.fieldType, payload, analyzingSuggestLookupProvider); } + + public static int correctSubStringLen(String input, int len) { + if (Character.isHighSurrogate(input.charAt(len-1))) { + assert input.length() >= len+1 && Character.isLowSurrogate(input.charAt(len)); + return len + 1; + } + return len; + } public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException { return analyzingSuggestLookupProvider.buildPayload( @@ -262,6 +299,12 @@ public class CompletionFieldMapper extends AbstractFieldMapper { private static final class SuggestField extends Field { private final BytesRef payload; private final CompletionTokenStream.ToFiniteStrings toFiniteStrings; + + public SuggestField(String name, Reader value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) { + super(name, value, type); + this.payload = payload; + this.toFiniteStrings = toFiniteStrings; + } public SuggestField(String name, String value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) { super(name, value, type); @@ -287,12 +330,11 @@ public class CompletionFieldMapper extends AbstractFieldMapper { builder.field(Fields.INDEX_ANALYZER, indexAnalyzer.name()) .field(Fields.SEARCH_ANALYZER, searchAnalyzer.name()); } - builder.field(Fields.PAYLOADS, this.payloads) - .field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators) - .field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements) - .endObject(); - - return builder; + builder.field(Fields.PAYLOADS, this.payloads); + builder.field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators); + builder.field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements); + builder.field(Fields.MAX_INPUT_LENGTH, this.maxInputLength); + return builder.endObject(); } @Override @@ -328,4 +370,22 @@ public class CompletionFieldMapper extends AbstractFieldMapper { public boolean isStoringPayloads() { return payloads; } + + @Override + public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException { + super.merge(mergeWith, mergeContext); + CompletionFieldMapper fieldMergeWith = (CompletionFieldMapper) mergeWith; + if (payloads != fieldMergeWith.payloads) { + mergeContext.addConflict("mapper [" + names.fullName() + "] has different payload values"); + } + if (preservePositionIncrements != fieldMergeWith.preservePositionIncrements) { + mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_position_increments' values"); + } + if (preserveSeparators != fieldMergeWith.preserveSeparators) { + mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_separators' values"); + } + if (!mergeContext.mergeFlags().simulate()) { + this.maxInputLength = fieldMergeWith.maxInputLength; + } + } } diff --git a/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java b/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java index e8477d95616..ae0c1198a1b 100644 --- a/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java +++ b/src/test/java/org/elasticsearch/test/hamcrest/ElasticsearchAssertions.java @@ -28,6 +28,8 @@ import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder; import org.elasticsearch.action.admin.indices.create.CreateIndexResponse; import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequestBuilder; import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse; +import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequestBuilder; +import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse; import org.elasticsearch.action.count.CountResponse; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.action.search.ShardSearchFailure; @@ -50,6 +52,15 @@ import static org.junit.Assert.fail; public class ElasticsearchAssertions { + public static void assertAcked(PutMappingRequestBuilder builder) { + assertAcked(builder.get()); + } + + private static void assertAcked(PutMappingResponse response) { + assertThat("Put Mapping failed - not acked", response.isAcknowledged(), equalTo(true)); + + } + public static void assertAcked(DeleteIndexRequestBuilder builder) { assertAcked(builder.get()); } diff --git a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java index d6f4771f2da..40768c73fcc 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java +++ b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionPostingsFormatTest.java @@ -91,7 +91,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase { LookupFactory load = provider.load(input); PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat()); NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT)); - Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null)); + Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null)); List result = lookup.lookup("ge", false, 10); assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters")); assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10")); @@ -176,7 +176,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase { NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT)); final CompletionFieldMapper mapper = new CompletionFieldMapper(new Names("foo"), namedAnalzyer, namedAnalzyer, provider, null, usePayloads, - preserveSeparators, preservePositionIncrements); + preserveSeparators, preservePositionIncrements, Integer.MAX_VALUE); Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights); Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput"); field.setAccessible(true); @@ -265,7 +265,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase { LookupFactory load = provider.load(input); PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat()); NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT)); - assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null))); + assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null))); dir.close(); } diff --git a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java index 282e0143da9..4d21134f146 100644 --- a/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java +++ b/src/test/java/org/elasticsearch/test/integration/search/suggest/CompletionSuggestSearchTests.java @@ -32,11 +32,13 @@ import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.mapper.MapperException; +import org.elasticsearch.index.mapper.core.CompletionFieldMapper; import org.elasticsearch.search.suggest.Suggest; import org.elasticsearch.search.suggest.completion.CompletionStats; import org.elasticsearch.search.suggest.completion.CompletionSuggestion; import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder; import org.elasticsearch.search.suggest.completion.CompletionSuggestionFuzzyBuilder; +import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; import org.elasticsearch.test.integration.AbstractSharedClusterTest; import org.junit.Test; @@ -717,4 +719,60 @@ public class CompletionSuggestSearchTests extends AbstractSharedClusterTest { } } } + + @Test + public void testMaxFieldLength() throws IOException { + client().admin().indices().prepareCreate(INDEX).get(); + int iters = atLeast(10); + for (int i = 0; i < iters; i++) { + int len = between(3, 50); + String str = randomRealisticUnicodeOfCodepointLengthBetween(len+1, atLeast(len + 2)); + ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject() + .startObject(TYPE).startObject("properties") + .startObject(FIELD) + .field("type", "completion") + .field("max_input_len", len) + // upgrade mapping each time + .field("analyzer", "keyword") + .endObject() + .endObject().endObject() + .endObject())); + ensureYellow(); + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value(str).endArray() + .field("output", "foobar") + .endObject().endObject() + ).setRefresh(true).get(); + int prefixLen = CompletionFieldMapper.correctSubStringLen(str, between(1, len - 1)); + assertSuggestions(str.substring(0, prefixLen), "foobar"); + if (len + 1 < str.length()) { + assertSuggestions(str.substring(0, CompletionFieldMapper.correctSubStringLen(str, + len + (Character.isHighSurrogate(str.charAt(len-1)) ? 2 : 1)))); + } + } + } + + @Test + // see #3596 + public void testVeryLongInput() throws IOException { + client().admin().indices().prepareCreate(INDEX).get(); + ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject() + .startObject(TYPE).startObject("properties") + .startObject(FIELD) + .field("type", "completion") + .endObject() + .endObject().endObject() + .endObject())); + ensureYellow(); + // can cause stack overflow without the default max_input_len + String longString = randomRealisticUnicodeOfLength(atLeast(5000)); + client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value(longString).endArray() + .field("output", "foobar") + .endObject().endObject() + ).setRefresh(true).get(); + + } } diff --git a/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java b/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java index fbf9905f657..361bdd03ebc 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java +++ b/src/test/java/org/elasticsearch/test/unit/index/mapper/completion/CompletionFieldMapperTests.java @@ -19,7 +19,6 @@ package org.elasticsearch.test.unit.index.mapper.completion; import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.json.JsonXContent; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.FieldMapper; @@ -64,6 +63,8 @@ public class CompletionFieldMapperTests { .field("payloads", true) .field("preserve_separators", false) .field("preserve_position_increments", true) + .field("max_input_len", 14) + .endObject().endObject() .endObject().endObject().string(); @@ -83,6 +84,7 @@ public class CompletionFieldMapperTests { assertThat(Boolean.valueOf(configMap.get("payloads").toString()), is(true)); assertThat(Boolean.valueOf(configMap.get("preserve_separators").toString()), is(false)); assertThat(Boolean.valueOf(configMap.get("preserve_position_increments").toString()), is(true)); + assertThat(Integer.valueOf(configMap.get("max_input_len").toString()), is(14)); } @Test