Use index-prefix fields for terms of length min_chars - 1 (#36703)

The default index_prefix settings will index prefixes of between 2 and 5 characters in length. 
Currently, if a prefix search falls outside of this range at either end we fall back to a standard prefix 
expansion, which is still very expensive for single character prefixes. However, we have an option 
here to use a wildcard expansion rather than a prefix expansion, so that a query of a* gets remapped 
to a? against the _index_prefix field - likely to be a very small set of terms, and certain to be much
smaller than a* against the whole index.

This commit adds this extra level of mapping for any prefix term whose length is one less than
the min_chars parameter of the index_prefixes field.
This commit is contained in:
Alan Woodward 2018-12-19 08:55:05 +00:00 committed by GitHub
parent 132ccbec2f
commit dd540ef618
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 68 additions and 37 deletions

View File

@ -58,6 +58,21 @@ setup:
- match: {hits.max_score: 2}
- match: {hits.hits.0._score: 2}
- do:
search:
rest_total_hits_as_int: true
index: test
body:
query:
query_string:
default_field: text
query: s*
boost: 2
- match: {hits.total: 1}
- match: {hits.max_score: 2}
- match: {hits.hits.0._score: 2}
- do:
search:
rest_total_hits_as_int: true

View File

@ -32,6 +32,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.MultiTermQuery;
@ -40,6 +41,9 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.intervals.IntervalsSource;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.Version;
import org.elasticsearch.common.collect.Iterators;
import org.elasticsearch.common.settings.Settings;
@ -360,7 +364,7 @@ public class TextFieldMapper extends FieldMapper {
}
boolean accept(int length) {
return length >= minChars && length <= maxChars;
return length >= minChars - 1 && length <= maxChars;
}
void doXContent(XContentBuilder builder) throws IOException {
@ -370,6 +374,22 @@ public class TextFieldMapper extends FieldMapper {
builder.endObject();
}
@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
if (value.length() >= minChars) {
return super.termQuery(value, context);
}
List<Automaton> automata = new ArrayList<>();
automata.add(Automata.makeString(value));
for (int i = value.length(); i < minChars; i++) {
automata.add(Automata.makeAnyChar());
}
Automaton automaton = Operations.concatenate(automata);
AutomatonQuery query = new AutomatonQuery(new Term(name(), value + "*"), automaton);
query.setRewriteMethod(method);
return query;
}
@Override
public PrefixFieldType clone() {
return new PrefixFieldType(name(), minChars, maxChars);
@ -402,7 +422,6 @@ public class TextFieldMapper extends FieldMapper {
@Override
public int hashCode() {
return Objects.hash(super.hashCode(), minChars, maxChars);
}
}
@ -564,7 +583,7 @@ public class TextFieldMapper extends FieldMapper {
if (prefixFieldType == null || prefixFieldType.accept(value.length()) == false) {
return super.prefixQuery(value, method, context);
}
Query tq = prefixFieldType.termQuery(value, context);
Query tq = prefixFieldType.prefixQuery(value, method, context);
if (method == null || method == MultiTermQuery.CONSTANT_SCORE_REWRITE
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
return new ConstantScoreQuery(tq);

View File

@ -31,10 +31,8 @@ import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
@ -71,10 +69,8 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import static org.apache.lucene.search.MultiTermQuery.CONSTANT_SCORE_REWRITE;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.core.Is.is;
public class TextFieldMapperTests extends ESSingleNodeTestCase {
@ -817,18 +813,13 @@ public class TextFieldMapperTests extends ESSingleNodeTestCase {
public void testIndexPrefixMapping() throws IOException {
QueryShardContext queryShardContext = indexService.newQueryShardContext(
randomInt(20), null, () -> {
throw new UnsupportedOperationException();
}, null);
{
String mapping = Strings.toString(XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "text")
.field("analyzer", "standard")
.startObject("index_prefixes")
.field("min_chars", 1)
.field("min_chars", 2)
.field("max_chars", 10)
.endObject()
.endObject().endObject()
@ -837,16 +828,7 @@ public class TextFieldMapperTests extends ESSingleNodeTestCase {
DocumentMapper mapper = parser.parse("type", new CompressedXContent(mapping));
assertEquals(mapping, mapper.mappingSource().toString());
assertThat(mapper.mappers().getMapper("field._index_prefix").toString(), containsString("prefixChars=1:10"));
FieldMapper fieldMapper = (FieldMapper) mapper.mappers().getMapper("field");
MappedFieldType fieldType = fieldMapper.fieldType;
Query q = fieldType.prefixQuery("goin", CONSTANT_SCORE_REWRITE, queryShardContext);
assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field._index_prefix", "goin"))), q);
q = fieldType.prefixQuery("internationalisatio", CONSTANT_SCORE_REWRITE, queryShardContext);
assertEquals(new PrefixQuery(new Term("field", "internationalisatio")), q);
assertThat(mapper.mappers().getMapper("field._index_prefix").toString(), containsString("prefixChars=2:10"));
ParsedDocument doc = mapper.parse(SourceToParse.source("test", "type", "1", BytesReference
.bytes(XContentFactory.jsonBuilder()
@ -870,17 +852,8 @@ public class TextFieldMapperTests extends ESSingleNodeTestCase {
CompressedXContent json = new CompressedXContent(mapping);
DocumentMapper mapper = parser.parse("type", json);
FieldMapper fieldMapper = (FieldMapper) mapper.mappers().getMapper("field");
MappedFieldType fieldType = fieldMapper.fieldType;
assertThat(mapper.mappers().getMapper("field._index_prefix").toString(), containsString("prefixChars=2:5"));
Query q1 = fieldType.prefixQuery("g", CONSTANT_SCORE_REWRITE, queryShardContext);
assertThat(q1, instanceOf(PrefixQuery.class));
Query q2 = fieldType.prefixQuery("go", CONSTANT_SCORE_REWRITE, queryShardContext);
assertThat(q2, instanceOf(ConstantScoreQuery.class));
Query q5 = fieldType.prefixQuery("going", CONSTANT_SCORE_REWRITE, queryShardContext);
assertThat(q5, instanceOf(ConstantScoreQuery.class));
Query q6 = fieldType.prefixQuery("goings", CONSTANT_SCORE_REWRITE, queryShardContext);
assertThat(q6, instanceOf(PrefixQuery.class));
}
{
@ -898,10 +871,8 @@ public class TextFieldMapperTests extends ESSingleNodeTestCase {
.endObject().endObject()
.endObject().endObject());
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> {
indexService.mapperService()
.merge("type", new CompressedXContent(illegalMapping), MergeReason.MAPPING_UPDATE);
});
IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () ->
indexService.mapperService().merge("type", new CompressedXContent(illegalMapping), MergeReason.MAPPING_UPDATE));
assertThat(e.getMessage(), containsString("Field [field._index_prefix] is defined twice in [type]"));
}

View File

@ -20,11 +20,18 @@ package org.elasticsearch.index.mapper;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RegexpQuery;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.unit.Fuzziness;
import org.junit.Before;
@ -32,6 +39,8 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.apache.lucene.search.MultiTermQuery.CONSTANT_SCORE_REWRITE;
public class TextFieldTypeTests extends FieldTypeTestCase {
@Override
protected MappedFieldType createDefaultFieldType() {
@ -143,4 +152,21 @@ public class TextFieldTypeTests extends FieldTypeTestCase {
() -> ft.fuzzyQuery("foo", Fuzziness.fromEdits(2), 1, 50, true));
assertEquals("Cannot search on field [field] since it is not indexed.", e.getMessage());
}
public void testIndexPrefixes() {
TextFieldMapper.TextFieldType ft = new TextFieldMapper.TextFieldType();
ft.setName("field");
ft.setPrefixFieldType(new TextFieldMapper.PrefixFieldType("field._index_prefix", 2, 10));
Query q = ft.prefixQuery("goin", CONSTANT_SCORE_REWRITE, null);
assertEquals(new ConstantScoreQuery(new TermQuery(new Term("field._index_prefix", "goin"))), q);
q = ft.prefixQuery("internationalisatio", CONSTANT_SCORE_REWRITE, null);
assertEquals(new PrefixQuery(new Term("field", "internationalisatio")), q);
q = ft.prefixQuery("g", CONSTANT_SCORE_REWRITE, null);
Automaton automaton
= Operations.concatenate(Arrays.asList(Automata.makeChar('g'), Automata.makeAnyChar()));
assertEquals(new ConstantScoreQuery(new AutomatonQuery(new Term("field._index_prefix", "g*"), automaton)), q);
}
}