Add 'min_input_len' to completion suggester
Restrict the size of the input length to a reasonable size otherwise very long strings can cause StackOverflowExceptions deep down in lucene land. Yet, this is simply a saftly limit set to `50` UTF-16 codepoints by default. This limit is only present at index time and not at query time. If prefix completions > 50 UTF-16 codepoints are expected / desired this limit should be raised. Critical string sizes are beyone the 1k UTF-16 Codepoints limit. Closes #3596
This commit is contained in:
parent
eed7f0bdb3
commit
eb2fed85f1
|
@ -65,7 +65,7 @@ Mapping supports the following parameters:
|
|||
`payloads`::
|
||||
Enables the storing of payloads, defaults to `false`
|
||||
|
||||
`preserve_separators`:
|
||||
`preserve_separators`::
|
||||
Preserves the separators, defaults to `true`.
|
||||
If disabled, you could find a field starting with `Foo Fighters`, if you
|
||||
suggest for `foof`.
|
||||
|
@ -78,6 +78,14 @@ Mapping supports the following parameters:
|
|||
`The Beatles`, no need to change a simple analyzer, if you are able to
|
||||
enrich your data.
|
||||
|
||||
`max_input_len`::
|
||||
Limits the length of a single input, defaults to `50` UTF-16 code points.
|
||||
This limit is only used at index time to reduce the total number of
|
||||
characters per input string in order to prevent massive inputs from
|
||||
bloating the underlying datastructure. The most usecases won't be influenced
|
||||
by the default value since prefix completions hardly grow beyond prefixes longer
|
||||
than a handful of characters.
|
||||
|
||||
==== Indexing
|
||||
|
||||
[source,js]
|
||||
|
|
|
@ -31,16 +31,14 @@ import org.elasticsearch.common.xcontent.XContentParser;
|
|||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
|
||||
import org.elasticsearch.index.fielddata.FieldDataType;
|
||||
import org.elasticsearch.index.mapper.Mapper;
|
||||
import org.elasticsearch.index.mapper.MapperException;
|
||||
import org.elasticsearch.index.mapper.MapperParsingException;
|
||||
import org.elasticsearch.index.mapper.ParseContext;
|
||||
import org.elasticsearch.index.mapper.*;
|
||||
import org.elasticsearch.index.similarity.SimilarityProvider;
|
||||
import org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionPostingsFormatProvider;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionTokenStream;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -62,6 +60,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
public static final boolean DEFAULT_PRESERVE_SEPARATORS = true;
|
||||
public static final boolean DEFAULT_POSITION_INCREMENTS = true;
|
||||
public static final boolean DEFAULT_HAS_PAYLOADS = false;
|
||||
public static final int DEFAULT_MAX_INPUT_LENGTH = 50;
|
||||
}
|
||||
|
||||
public static class Fields {
|
||||
|
@ -72,6 +71,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
public static final String PRESERVE_POSITION_INCREMENTS = "preserve_position_increments";
|
||||
public static final String PAYLOADS = "payloads";
|
||||
public static final String TYPE = "type";
|
||||
public static final String MAX_INPUT_LENGTH = "max_input_len";
|
||||
}
|
||||
|
||||
public static class Builder extends AbstractFieldMapper.OpenBuilder<Builder, CompletionFieldMapper> {
|
||||
|
@ -81,6 +81,7 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
private boolean preserveSeparators = Defaults.DEFAULT_PRESERVE_SEPARATORS;
|
||||
private boolean payloads = Defaults.DEFAULT_HAS_PAYLOADS;
|
||||
private boolean preservePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;
|
||||
private int maxInputLength = Defaults.DEFAULT_MAX_INPUT_LENGTH;
|
||||
|
||||
public Builder(String name) {
|
||||
super(name, Defaults.FIELD_TYPE);
|
||||
|
@ -111,9 +112,18 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Builder maxInputLength(int maxInputLength) {
|
||||
if (maxInputLength <= 0) {
|
||||
throw new ElasticSearchIllegalArgumentException(Fields.MAX_INPUT_LENGTH + " must be > 0 but was [" + maxInputLength + "]");
|
||||
}
|
||||
this.maxInputLength = maxInputLength;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CompletionFieldMapper build(Mapper.BuilderContext context) {
|
||||
return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads, preserveSeparators, preservePositionIncrements);
|
||||
return new CompletionFieldMapper(buildNames(context), indexAnalyzer, searchAnalyzer, provider, similarity, payloads,
|
||||
preserveSeparators, preservePositionIncrements, maxInputLength);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -129,18 +139,23 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
continue;
|
||||
}
|
||||
if (fieldName.equals("analyzer")) {
|
||||
builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
|
||||
builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
|
||||
NamedAnalyzer analyzer = getNamedAnalyzer(parserContext, fieldNode.toString());
|
||||
builder.indexAnalyzer(analyzer);
|
||||
builder.searchAnalyzer(analyzer);
|
||||
} else if (fieldName.equals(Fields.INDEX_ANALYZER) || fieldName.equals("indexAnalyzer")) {
|
||||
builder.indexAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
|
||||
builder.indexAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString()));
|
||||
} else if (fieldName.equals(Fields.SEARCH_ANALYZER) || fieldName.equals("searchAnalyzer")) {
|
||||
builder.searchAnalyzer(parserContext.analysisService().analyzer(fieldNode.toString()));
|
||||
builder.searchAnalyzer(getNamedAnalyzer(parserContext, fieldNode.toString()));
|
||||
} else if (fieldName.equals(Fields.PAYLOADS)) {
|
||||
builder.payloads(Boolean.parseBoolean(fieldNode.toString()));
|
||||
} else if (fieldName.equals(Fields.PRESERVE_SEPARATORS) || fieldName.equals("preserveSeparators")) {
|
||||
builder.preserveSeparators(Boolean.parseBoolean(fieldNode.toString()));
|
||||
} else if (fieldName.equals(Fields.PRESERVE_POSITION_INCREMENTS) || fieldName.equals("preservePositionIncrements")) {
|
||||
builder.preservePositionIncrements(Boolean.parseBoolean(fieldNode.toString()));
|
||||
} else if (fieldName.equals(Fields.MAX_INPUT_LENGTH) || fieldName.equals("maxInputLen")) {
|
||||
builder.maxInputLength(Integer.parseInt(fieldNode.toString()));
|
||||
} else {
|
||||
throw new MapperParsingException("Unknown field [" + fieldName + "]");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -155,6 +170,14 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
builder.postingsFormat(parserContext.postingFormatService().get("default"));
|
||||
return builder;
|
||||
}
|
||||
|
||||
private NamedAnalyzer getNamedAnalyzer(ParserContext parserContext, String name) {
|
||||
NamedAnalyzer analyzer = parserContext.analysisService().analyzer(name);
|
||||
if (analyzer == null) {
|
||||
throw new ElasticSearchIllegalArgumentException("Can't find default or mapped analyzer with name [" + name +"]");
|
||||
}
|
||||
return analyzer;
|
||||
}
|
||||
}
|
||||
|
||||
private static final BytesRef EMPTY = new BytesRef();
|
||||
|
@ -164,15 +187,17 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
private final boolean payloads;
|
||||
private final boolean preservePositionIncrements;
|
||||
private final boolean preserveSeparators;
|
||||
private int maxInputLength;
|
||||
|
||||
public CompletionFieldMapper(Names names, NamedAnalyzer indexAnalyzer, NamedAnalyzer searchAnalyzer, PostingsFormatProvider provider, SimilarityProvider similarity, boolean payloads,
|
||||
boolean preserveSeparators, boolean preservePositionIncrements) {
|
||||
boolean preserveSeparators, boolean preservePositionIncrements, int maxInputLength) {
|
||||
super(names, 1.0f, Defaults.FIELD_TYPE, indexAnalyzer, searchAnalyzer, provider, similarity, null);
|
||||
analyzingSuggestLookupProvider = new AnalyzingCompletionLookupProvider(preserveSeparators, false, preservePositionIncrements, payloads);
|
||||
this.completionPostingsFormatProvider = new CompletionPostingsFormatProvider("completion", provider, analyzingSuggestLookupProvider);
|
||||
this.preserveSeparators = preserveSeparators;
|
||||
this.payloads = payloads;
|
||||
this.preservePositionIncrements = preservePositionIncrements;
|
||||
this.maxInputLength = maxInputLength;
|
||||
}
|
||||
|
||||
|
||||
|
@ -251,9 +276,21 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
}
|
||||
|
||||
public Field getCompletionField(String input, BytesRef payload) {
|
||||
if (input.length() > maxInputLength) {
|
||||
final int len = correctSubStringLen(input, Math.min(maxInputLength, input.length()));
|
||||
input = input.substring(0, len);
|
||||
}
|
||||
return new SuggestField(names().fullName(), input, this.fieldType, payload, analyzingSuggestLookupProvider);
|
||||
}
|
||||
|
||||
public static int correctSubStringLen(String input, int len) {
|
||||
if (Character.isHighSurrogate(input.charAt(len-1))) {
|
||||
assert input.length() >= len+1 && Character.isLowSurrogate(input.charAt(len));
|
||||
return len + 1;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
public BytesRef buildPayload(BytesRef surfaceForm, long weight, BytesRef payload) throws IOException {
|
||||
return analyzingSuggestLookupProvider.buildPayload(
|
||||
surfaceForm, weight, payload);
|
||||
|
@ -263,6 +300,12 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
private final BytesRef payload;
|
||||
private final CompletionTokenStream.ToFiniteStrings toFiniteStrings;
|
||||
|
||||
public SuggestField(String name, Reader value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
|
||||
super(name, value, type);
|
||||
this.payload = payload;
|
||||
this.toFiniteStrings = toFiniteStrings;
|
||||
}
|
||||
|
||||
public SuggestField(String name, String value, FieldType type, BytesRef payload, CompletionTokenStream.ToFiniteStrings toFiniteStrings) {
|
||||
super(name, value, type);
|
||||
this.payload = payload;
|
||||
|
@ -287,12 +330,11 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
builder.field(Fields.INDEX_ANALYZER, indexAnalyzer.name())
|
||||
.field(Fields.SEARCH_ANALYZER, searchAnalyzer.name());
|
||||
}
|
||||
builder.field(Fields.PAYLOADS, this.payloads)
|
||||
.field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators)
|
||||
.field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements)
|
||||
.endObject();
|
||||
|
||||
return builder;
|
||||
builder.field(Fields.PAYLOADS, this.payloads);
|
||||
builder.field(Fields.PRESERVE_SEPARATORS, this.preserveSeparators);
|
||||
builder.field(Fields.PRESERVE_POSITION_INCREMENTS, this.preservePositionIncrements);
|
||||
builder.field(Fields.MAX_INPUT_LENGTH, this.maxInputLength);
|
||||
return builder.endObject();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -328,4 +370,22 @@ public class CompletionFieldMapper extends AbstractFieldMapper<String> {
|
|||
public boolean isStoringPayloads() {
|
||||
return payloads;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException {
|
||||
super.merge(mergeWith, mergeContext);
|
||||
CompletionFieldMapper fieldMergeWith = (CompletionFieldMapper) mergeWith;
|
||||
if (payloads != fieldMergeWith.payloads) {
|
||||
mergeContext.addConflict("mapper [" + names.fullName() + "] has different payload values");
|
||||
}
|
||||
if (preservePositionIncrements != fieldMergeWith.preservePositionIncrements) {
|
||||
mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_position_increments' values");
|
||||
}
|
||||
if (preserveSeparators != fieldMergeWith.preserveSeparators) {
|
||||
mergeContext.addConflict("mapper [" + names.fullName() + "] has different 'preserve_separators' values");
|
||||
}
|
||||
if (!mergeContext.mergeFlags().simulate()) {
|
||||
this.maxInputLength = fieldMergeWith.maxInputLength;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,6 +28,8 @@ import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
|
|||
import org.elasticsearch.action.admin.indices.create.CreateIndexResponse;
|
||||
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequestBuilder;
|
||||
import org.elasticsearch.action.admin.indices.delete.DeleteIndexResponse;
|
||||
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequestBuilder;
|
||||
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
|
||||
import org.elasticsearch.action.count.CountResponse;
|
||||
import org.elasticsearch.action.search.SearchResponse;
|
||||
import org.elasticsearch.action.search.ShardSearchFailure;
|
||||
|
@ -50,6 +52,15 @@ import static org.junit.Assert.fail;
|
|||
public class ElasticsearchAssertions {
|
||||
|
||||
|
||||
public static void assertAcked(PutMappingRequestBuilder builder) {
|
||||
assertAcked(builder.get());
|
||||
}
|
||||
|
||||
private static void assertAcked(PutMappingResponse response) {
|
||||
assertThat("Put Mapping failed - not acked", response.isAcknowledged(), equalTo(true));
|
||||
|
||||
}
|
||||
|
||||
public static void assertAcked(DeleteIndexRequestBuilder builder) {
|
||||
assertAcked(builder.get());
|
||||
}
|
||||
|
|
|
@ -91,7 +91,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
|
|||
LookupFactory load = provider.load(input);
|
||||
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
|
||||
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
|
||||
Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null));
|
||||
Lookup lookup = load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null));
|
||||
List<LookupResult> result = lookup.lookup("ge", false, 10);
|
||||
assertThat(result.get(0).key.toString(), equalTo("Generator - Foo Fighters"));
|
||||
assertThat(result.get(0).payload.utf8ToString(), equalTo("id:10"));
|
||||
|
@ -176,7 +176,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
|
|||
|
||||
NamedAnalyzer namedAnalzyer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
|
||||
final CompletionFieldMapper mapper = new CompletionFieldMapper(new Names("foo"), namedAnalzyer, namedAnalzyer, provider, null, usePayloads,
|
||||
preserveSeparators, preservePositionIncrements);
|
||||
preserveSeparators, preservePositionIncrements, Integer.MAX_VALUE);
|
||||
Lookup buildAnalyzingLookup = buildAnalyzingLookup(mapper, titles, titles, weights);
|
||||
Field field = buildAnalyzingLookup.getClass().getDeclaredField("maxAnalyzedPathsForOneInput");
|
||||
field.setAccessible(true);
|
||||
|
@ -265,7 +265,7 @@ public class CompletionPostingsFormatTest extends ElasticsearchTestCase {
|
|||
LookupFactory load = provider.load(input);
|
||||
PostingsFormatProvider format = new PreBuiltPostingsFormatProvider(new ElasticSearch090PostingsFormat());
|
||||
NamedAnalyzer analyzer = new NamedAnalyzer("foo", new StandardAnalyzer(TEST_VERSION_CURRENT));
|
||||
assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true), new CompletionSuggestionContext(null)));
|
||||
assertNull(load.getLookup(new CompletionFieldMapper(new Names("foo"), analyzer, analyzer, format, null, true, true, true, Integer.MAX_VALUE), new CompletionSuggestionContext(null)));
|
||||
dir.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -32,11 +32,13 @@ import org.elasticsearch.common.settings.ImmutableSettings;
|
|||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.index.mapper.MapperException;
|
||||
import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
|
||||
import org.elasticsearch.search.suggest.Suggest;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionStats;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionSuggestionBuilder;
|
||||
import org.elasticsearch.search.suggest.completion.CompletionSuggestionFuzzyBuilder;
|
||||
import org.elasticsearch.test.hamcrest.ElasticsearchAssertions;
|
||||
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -717,4 +719,60 @@ public class CompletionSuggestSearchTests extends AbstractSharedClusterTest {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMaxFieldLength() throws IOException {
|
||||
client().admin().indices().prepareCreate(INDEX).get();
|
||||
int iters = atLeast(10);
|
||||
for (int i = 0; i < iters; i++) {
|
||||
int len = between(3, 50);
|
||||
String str = randomRealisticUnicodeOfCodepointLengthBetween(len+1, atLeast(len + 2));
|
||||
ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
|
||||
.startObject(TYPE).startObject("properties")
|
||||
.startObject(FIELD)
|
||||
.field("type", "completion")
|
||||
.field("max_input_len", len)
|
||||
// upgrade mapping each time
|
||||
.field("analyzer", "keyword")
|
||||
.endObject()
|
||||
.endObject().endObject()
|
||||
.endObject()));
|
||||
ensureYellow();
|
||||
client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
|
||||
.startObject().startObject(FIELD)
|
||||
.startArray("input").value(str).endArray()
|
||||
.field("output", "foobar")
|
||||
.endObject().endObject()
|
||||
).setRefresh(true).get();
|
||||
int prefixLen = CompletionFieldMapper.correctSubStringLen(str, between(1, len - 1));
|
||||
assertSuggestions(str.substring(0, prefixLen), "foobar");
|
||||
if (len + 1 < str.length()) {
|
||||
assertSuggestions(str.substring(0, CompletionFieldMapper.correctSubStringLen(str,
|
||||
len + (Character.isHighSurrogate(str.charAt(len-1)) ? 2 : 1))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
// see #3596
|
||||
public void testVeryLongInput() throws IOException {
|
||||
client().admin().indices().prepareCreate(INDEX).get();
|
||||
ElasticsearchAssertions.assertAcked(client().admin().indices().preparePutMapping(INDEX).setType(TYPE).setSource(jsonBuilder().startObject()
|
||||
.startObject(TYPE).startObject("properties")
|
||||
.startObject(FIELD)
|
||||
.field("type", "completion")
|
||||
.endObject()
|
||||
.endObject().endObject()
|
||||
.endObject()));
|
||||
ensureYellow();
|
||||
// can cause stack overflow without the default max_input_len
|
||||
String longString = randomRealisticUnicodeOfLength(atLeast(5000));
|
||||
client().prepareIndex(INDEX, TYPE, "1").setSource(jsonBuilder()
|
||||
.startObject().startObject(FIELD)
|
||||
.startArray("input").value(longString).endArray()
|
||||
.field("output", "foobar")
|
||||
.endObject().endObject()
|
||||
).setRefresh(true).get();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,7 +19,6 @@
|
|||
package org.elasticsearch.test.unit.index.mapper.completion;
|
||||
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.common.xcontent.json.JsonXContent;
|
||||
import org.elasticsearch.index.mapper.DocumentMapper;
|
||||
import org.elasticsearch.index.mapper.FieldMapper;
|
||||
|
@ -64,6 +63,8 @@ public class CompletionFieldMapperTests {
|
|||
.field("payloads", true)
|
||||
.field("preserve_separators", false)
|
||||
.field("preserve_position_increments", true)
|
||||
.field("max_input_len", 14)
|
||||
|
||||
.endObject().endObject()
|
||||
.endObject().endObject().string();
|
||||
|
||||
|
@ -83,6 +84,7 @@ public class CompletionFieldMapperTests {
|
|||
assertThat(Boolean.valueOf(configMap.get("payloads").toString()), is(true));
|
||||
assertThat(Boolean.valueOf(configMap.get("preserve_separators").toString()), is(false));
|
||||
assertThat(Boolean.valueOf(configMap.get("preserve_position_increments").toString()), is(true));
|
||||
assertThat(Integer.valueOf(configMap.get("max_input_len").toString()), is(14));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Reference in New Issue