Add fuzzy intervals source ()

This intervals source will return terms that are similar to an input term, up to
an edit distance defined by fuzziness, similar to FuzzyQuery.

Closes 
This commit is contained in:
Alan Woodward 2020-01-03 09:55:53 +00:00
parent a34b3f133c
commit 8b362c657b
6 changed files with 276 additions and 5 deletions
docs/reference/query-dsl
rest-api-spec/src/main/resources/rest-api-spec/test/search
server/src
main/java/org
apache/lucene/queries
elasticsearch
test/java/org/elasticsearch/index/query

@ -73,6 +73,7 @@ Valid rules include:
* <<intervals-match,`match`>>
* <<intervals-prefix,`prefix`>>
* <<intervals-wildcard,`wildcard`>>
* <<intervals-fuzzy,`fuzzy`>>
* <<intervals-all_of,`all_of`>>
* <<intervals-any_of,`any_of`>>
--
@ -177,6 +178,42 @@ The `pattern` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--
[[intervals-fuzzy]]
==== `fuzzy` rule parameters
The `fuzzy` rule matches terms that are similar to the provided term, within an
edit distance defined by <<fuzziness>>. If the fuzzy expansion matches more than
128 terms, {es} returns an error.
`term`::
(Required, string) The term to match
`prefix_length`::
(Optional, integer) Number of beginning characters left unchanged when creating
expansions. Defaults to `0`.
`transpositions`::
(Optional, boolean) Indicates whether edits include transpositions of two
adjacent characters (ab → ba). Defaults to `true`.
`fuzziness`::
(Optional, string) Maximum edit distance allowed for matching. See <<fuzziness>>
for valid values and more information. Defaults to `auto`.
`analyzer`::
(Optional, string) <<analysis, analyzer>> used to normalize the `term`.
Defaults to the top-level `<field>`'s analyzer.
`use_field`::
+
--
(Optional, string) If specified, match intervals from this field rather than the
top-level `<field>`.
The `term` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--
[[intervals-all_of]]
==== `all_of` rule parameters

@ -390,8 +390,8 @@ setup:
---
"Test prefix":
- skip:
version: " - 8.0.0"
reason: "TODO: change to 7.3 in backport"
version: " - 7.2.99"
reason: "Implemented in 7.3"
- do:
search:
index: test
@ -410,8 +410,8 @@ setup:
---
"Test wildcard":
- skip:
version: " - 8.0.0"
reason: "TODO: change to 7.3 in backport"
version: " - 7.2.99"
reason: "Implemented in 7.3"
- do:
search:
index: test
@ -427,3 +427,24 @@ setup:
pattern: out?ide
- match: { hits.total.value: 3 }
---
"Test fuzzy match":
- skip:
version: " - 7.5.99"
reason: "Implemented in 7.6"
- do:
search:
index: test
body:
query:
intervals:
text:
all_of:
intervals:
- fuzzy:
term: cald
- prefix:
prefix: out
- match: { hits.total.value: 3 }

@ -67,6 +67,10 @@ public final class XIntervals {
return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
}
public static IntervalsSource multiterm(CompiledAutomaton ca, String label) {
return new MultiTermIntervalsSource(ca, 128, label);
}
static class MultiTermIntervalsSource extends IntervalsSource {
private final CompiledAutomaton automaton;

@ -20,12 +20,15 @@
package org.elasticsearch.index.query;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
@ -33,7 +36,9 @@ import org.elasticsearch.common.io.stream.NamedWriteable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
@ -85,6 +90,8 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
return Prefix.fromXContent(parser);
case "wildcard":
return Wildcard.fromXContent(parser);
case "fuzzy":
return Fuzzy.fromXContent(parser);
}
throw new ParsingException(parser.getTokenLocation(),
"Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]");
@ -691,6 +698,148 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
}
}
public static class Fuzzy extends IntervalsSourceProvider {
public static final String NAME = "fuzzy";
private final String term;
private final int prefixLength;
private final boolean transpositions;
private final Fuzziness fuzziness;
private final String analyzer;
private final String useField;
public Fuzzy(String term, int prefixLength, boolean transpositions, Fuzziness fuzziness, String analyzer, String useField) {
this.term = term;
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.fuzziness = fuzziness;
this.analyzer = analyzer;
this.useField = useField;
}
public Fuzzy(StreamInput in) throws IOException {
this.term = in.readString();
this.prefixLength = in.readVInt();
this.transpositions = in.readBoolean();
this.fuzziness = new Fuzziness(in);
this.analyzer = in.readOptionalString();
this.useField = in.readOptionalString();
}
@Override
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
NamedAnalyzer analyzer = fieldType.searchAnalyzer();
if (this.analyzer != null) {
analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
}
IntervalsSource source;
if (useField != null) {
fieldType = context.fieldMapper(useField);
assert fieldType != null;
checkPositions(fieldType);
if (this.analyzer == null) {
analyzer = fieldType.searchAnalyzer();
}
}
checkPositions(fieldType);
BytesRef normalizedTerm = analyzer.normalize(fieldType.name(), term);
FuzzyQuery fq = new FuzzyQuery(new Term(fieldType.name(), normalizedTerm),
fuzziness.asDistance(term), prefixLength, 128, transpositions);
CompiledAutomaton ca = new CompiledAutomaton(fq.toAutomaton());
source = XIntervals.multiterm(ca, term);
if (useField != null) {
source = Intervals.fixField(useField, source);
}
return source;
}
private void checkPositions(MappedFieldType type) {
if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
}
}
@Override
public void extractFields(Set<String> fields) {
if (useField != null) {
fields.add(useField);
}
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Fuzzy fuzzy = (Fuzzy) o;
return prefixLength == fuzzy.prefixLength &&
transpositions == fuzzy.transpositions &&
Objects.equals(term, fuzzy.term) &&
Objects.equals(fuzziness, fuzzy.fuzziness) &&
Objects.equals(analyzer, fuzzy.analyzer) &&
Objects.equals(useField, fuzzy.useField);
}
@Override
public int hashCode() {
return Objects.hash(term, prefixLength, transpositions, fuzziness, analyzer, useField);
}
@Override
public String getWriteableName() {
return NAME;
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(term);
out.writeVInt(prefixLength);
out.writeBoolean(transpositions);
fuzziness.writeTo(out);
out.writeOptionalString(analyzer);
out.writeOptionalString(useField);
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject(NAME);
builder.field("term", term);
builder.field("prefix_length", prefixLength);
builder.field("transpositions", transpositions);
fuzziness.toXContent(builder, params);
if (analyzer != null) {
builder.field("analyzer", analyzer);
}
if (useField != null) {
builder.field("use_field", useField);
}
builder.endObject();
return builder;
}
private static final ConstructingObjectParser<Fuzzy, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
String term = (String) args[0];
int prefixLength = (args[1] == null) ? FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH : (int) args[1];
boolean transpositions = (args[2] == null) ? FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS : (boolean) args[2];
Fuzziness fuzziness = (args[3] == null) ? FuzzyQueryBuilder.DEFAULT_FUZZINESS : (Fuzziness) args[3];
String analyzer = (String) args[4];
String useField = (String) args[5];
return new Fuzzy(term, prefixLength, transpositions, fuzziness, analyzer, useField);
});
static {
PARSER.declareString(constructorArg(), new ParseField("term"));
PARSER.declareInt(optionalConstructorArg(), new ParseField("prefix_length"));
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("transpositions"));
PARSER.declareField(optionalConstructorArg(), (p, c) -> Fuzziness.parse(p), Fuzziness.FIELD, ObjectParser.ValueType.VALUE);
PARSER.declareString(optionalConstructorArg(), new ParseField("analyzer"));
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
}
public static Fuzzy fromXContent(XContentParser parser) throws IOException {
return PARSER.parse(parser, null);
}
}
static class ScriptFilterSource extends FilteredIntervalsSource {
final IntervalFilterScript script;

@ -861,6 +861,8 @@ public class SearchModule {
IntervalsSourceProvider.Prefix.NAME, IntervalsSourceProvider.Prefix::new));
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
IntervalsSourceProvider.Wildcard.NAME, IntervalsSourceProvider.Wildcard::new));
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
IntervalsSourceProvider.Fuzzy.NAME, IntervalsSourceProvider.Fuzzy::new));
}
private void registerQuery(QuerySpec<?> spec) {

@ -19,17 +19,22 @@
package org.elasticsearch.index.query;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.mapper.MapperService;
@ -529,4 +534,57 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
assertEquals(expected, builder.toQuery(createShardContext()));
}
private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
return XIntervals.multiterm(new CompiledAutomaton(fq.toAutomaton()), label);
}
public void testFuzzy() throws IOException {
String json = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\" } } } }";
IntervalQueryBuilder builder = (IntervalQueryBuilder) parseQuery(json);
Query expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));
String json_with_prefix = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2 } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_prefix);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));
String json_with_fuzziness = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_fuzziness);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));
String json_no_transpositions = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"transpositions\" : false } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_no_transpositions);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("term", "Term", 2, false, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));
String json_with_analyzer = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"analyzer\" : \"keyword\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_analyzer);
expected = new IntervalQuery(STRING_FIELD_NAME,
buildFuzzySource("Term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
assertEquals(expected, builder.toQuery(createShardContext()));
String json_with_fixfield = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\", " +
"\"use_field\" : \"" + MASKED_FIELD + "\" } } } }";
builder = (IntervalQueryBuilder) parseQuery(json_with_fixfield);
expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD,
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term"))));
assertEquals(expected, builder.toQuery(createShardContext()));
}
}