Add fuzzy intervals source (#49762)

This intervals source returns terms that are similar to an input term, up to
an edit distance defined by fuzziness, in the same way as FuzzyQuery.

Closes #49595
Alan Woodward 2020-01-03 09:55:53 +00:00
parent a34b3f133c
commit 8b362c657b
6 changed files with 276 additions and 5 deletions

View File

@ -73,6 +73,7 @@ Valid rules include:
* <<intervals-match,`match`>>
* <<intervals-prefix,`prefix`>>
* <<intervals-wildcard,`wildcard`>>
* <<intervals-fuzzy,`fuzzy`>>
* <<intervals-all_of,`all_of`>>
* <<intervals-any_of,`any_of`>>
--
@ -177,6 +178,42 @@ The `pattern` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--
[[intervals-fuzzy]]
==== `fuzzy` rule parameters
The `fuzzy` rule matches terms that are similar to the provided term, within an
edit distance defined by <<fuzziness>>. If the fuzzy expansion matches more than
128 terms, {es} returns an error.
`term`::
(Required, string) The term to match.
`prefix_length`::
(Optional, integer) Number of beginning characters left unchanged when creating
expansions. Defaults to `0`.
`transpositions`::
(Optional, boolean) Indicates whether edits include transpositions of two
adjacent characters (ab → ba). Defaults to `true`.
`fuzziness`::
(Optional, string) Maximum edit distance allowed for matching. See <<fuzziness>>
for valid values and more information. Defaults to `auto`.
`analyzer`::
(Optional, string) <<analysis, analyzer>> used to normalize the `term`.
Defaults to the top-level `<field>`'s analyzer.
`use_field`::
+
--
(Optional, string) If specified, match intervals from this field rather than the
top-level `<field>`.
The `term` is normalized using the search analyzer from this field, unless
`analyzer` is specified separately.
--
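As a minimal sketch (the `text` field and the terms mirror the REST test added
in this commit), a request combining the new `fuzzy` rule with a `prefix` rule
inside an `all_of` rule could look like:

[source,console]
--------------------------------------------------
POST _search
{
  "query": {
    "intervals" : {
      "text" : {
        "all_of" : {
          "intervals" : [
            { "fuzzy" : { "term" : "cald" } },
            { "prefix" : { "prefix" : "out" } }
          ]
        }
      }
    }
  }
}
--------------------------------------------------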
[[intervals-all_of]]
==== `all_of` rule parameters

View File

@ -390,8 +390,8 @@ setup:
---
"Test prefix":
  - skip:
      version: " - 8.0.0"
      reason: "TODO: change to 7.3 in backport"
      version: " - 7.2.99"
      reason: "Implemented in 7.3"
  - do:
      search:
        index: test
@ -410,8 +410,8 @@ setup:
---
"Test wildcard":
  - skip:
      version: " - 8.0.0"
      reason: "TODO: change to 7.3 in backport"
      version: " - 7.2.99"
      reason: "Implemented in 7.3"
  - do:
      search:
        index: test
@ -427,3 +427,24 @@ setup:
                  pattern: out?ide
  - match: { hits.total.value: 3 }

---
"Test fuzzy match":
  - skip:
      version: " - 7.5.99"
      reason: "Implemented in 7.6"
  - do:
      search:
        index: test
        body:
          query:
            intervals:
              text:
                all_of:
                  intervals:
                    - fuzzy:
                        term: cald
                    - prefix:
                        prefix: out
  - match: { hits.total.value: 3 }

View File

@ -67,6 +67,10 @@ public final class XIntervals {
        return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
    }

    public static IntervalsSource multiterm(CompiledAutomaton ca, String label) {
        return new MultiTermIntervalsSource(ca, 128, label);
    }

    static class MultiTermIntervalsSource extends IntervalsSource {

        private final CompiledAutomaton automaton;

View File

@ -20,12 +20,15 @@
package org.elasticsearch.index.query;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
import org.apache.lucene.queries.intervals.IntervalIterator;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.ParsingException;
@ -33,7 +36,9 @@ import org.elasticsearch.common.io.stream.NamedWriteable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentFragment;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
@ -85,6 +90,8 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
                return Prefix.fromXContent(parser);
            case "wildcard":
                return Wildcard.fromXContent(parser);
            case "fuzzy":
                return Fuzzy.fromXContent(parser);
        }
        throw new ParsingException(parser.getTokenLocation(),
            "Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]");
@ -691,6 +698,148 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
}
}
public static class Fuzzy extends IntervalsSourceProvider {

    public static final String NAME = "fuzzy";

    private final String term;
    private final int prefixLength;
    private final boolean transpositions;
    private final Fuzziness fuzziness;
    private final String analyzer;
    private final String useField;

    public Fuzzy(String term, int prefixLength, boolean transpositions, Fuzziness fuzziness, String analyzer, String useField) {
        this.term = term;
        this.prefixLength = prefixLength;
        this.transpositions = transpositions;
        this.fuzziness = fuzziness;
        this.analyzer = analyzer;
        this.useField = useField;
    }

    public Fuzzy(StreamInput in) throws IOException {
        this.term = in.readString();
        this.prefixLength = in.readVInt();
        this.transpositions = in.readBoolean();
        this.fuzziness = new Fuzziness(in);
        this.analyzer = in.readOptionalString();
        this.useField = in.readOptionalString();
    }
    @Override
    public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
        NamedAnalyzer analyzer = fieldType.searchAnalyzer();
        if (this.analyzer != null) {
            analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
        }
        IntervalsSource source;
        if (useField != null) {
            fieldType = context.fieldMapper(useField);
            assert fieldType != null;
            checkPositions(fieldType);
            if (this.analyzer == null) {
                analyzer = fieldType.searchAnalyzer();
            }
        }
        checkPositions(fieldType);
        BytesRef normalizedTerm = analyzer.normalize(fieldType.name(), term);
        FuzzyQuery fq = new FuzzyQuery(new Term(fieldType.name(), normalizedTerm),
            fuzziness.asDistance(term), prefixLength, 128, transpositions);
        CompiledAutomaton ca = new CompiledAutomaton(fq.toAutomaton());
        source = XIntervals.multiterm(ca, term);
        if (useField != null) {
            source = Intervals.fixField(useField, source);
        }
        return source;
    }

    private void checkPositions(MappedFieldType type) {
        if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
            throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
        }
    }
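An illustrative note on the edit distance used above (based on the documented `Fuzziness.AUTO` behaviour, not on anything added in this commit): the distance is resolved from the raw input term via `Fuzziness.asDistance`, while the automaton itself is built over the normalized term.

    // Fuzziness.AUTO maps term length 0..2 to 0 edits, 3..5 to 1 edit, and 6+ to 2 edits,
    // so the four-character term used in the REST test resolves to a single edit.
    int maxEdits = Fuzziness.AUTO.asDistance("cald"); // 1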
    @Override
    public void extractFields(Set<String> fields) {
        if (useField != null) {
            fields.add(useField);
        }
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Fuzzy fuzzy = (Fuzzy) o;
        return prefixLength == fuzzy.prefixLength &&
            transpositions == fuzzy.transpositions &&
            Objects.equals(term, fuzzy.term) &&
            Objects.equals(fuzziness, fuzzy.fuzziness) &&
            Objects.equals(analyzer, fuzzy.analyzer) &&
            Objects.equals(useField, fuzzy.useField);
    }

    @Override
    public int hashCode() {
        return Objects.hash(term, prefixLength, transpositions, fuzziness, analyzer, useField);
    }

    @Override
    public String getWriteableName() {
        return NAME;
    }

    @Override
    public void writeTo(StreamOutput out) throws IOException {
        out.writeString(term);
        out.writeVInt(prefixLength);
        out.writeBoolean(transpositions);
        fuzziness.writeTo(out);
        out.writeOptionalString(analyzer);
        out.writeOptionalString(useField);
    }

    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject(NAME);
        builder.field("term", term);
        builder.field("prefix_length", prefixLength);
        builder.field("transpositions", transpositions);
        fuzziness.toXContent(builder, params);
        if (analyzer != null) {
            builder.field("analyzer", analyzer);
        }
        if (useField != null) {
            builder.field("use_field", useField);
        }
        builder.endObject();
        return builder;
    }
    private static final ConstructingObjectParser<Fuzzy, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
        String term = (String) args[0];
        int prefixLength = (args[1] == null) ? FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH : (int) args[1];
        boolean transpositions = (args[2] == null) ? FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS : (boolean) args[2];
        Fuzziness fuzziness = (args[3] == null) ? FuzzyQueryBuilder.DEFAULT_FUZZINESS : (Fuzziness) args[3];
        String analyzer = (String) args[4];
        String useField = (String) args[5];
        return new Fuzzy(term, prefixLength, transpositions, fuzziness, analyzer, useField);
    });
    static {
        PARSER.declareString(constructorArg(), new ParseField("term"));
        PARSER.declareInt(optionalConstructorArg(), new ParseField("prefix_length"));
        PARSER.declareBoolean(optionalConstructorArg(), new ParseField("transpositions"));
        PARSER.declareField(optionalConstructorArg(), (p, c) -> Fuzziness.parse(p), Fuzziness.FIELD, ObjectParser.ValueType.VALUE);
        PARSER.declareString(optionalConstructorArg(), new ParseField("analyzer"));
        PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
    }

    public static Fuzzy fromXContent(XContentParser parser) throws IOException {
        return PARSER.parse(parser, null);
    }
}
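A usage sketch, assuming the existing IntervalQueryBuilder(String, IntervalsSourceProvider) constructor and defaults taken from FuzzyQueryBuilder; the field name "text" and term "cald" mirror the REST test in this commit:

    IntervalsSourceProvider fuzzy = new IntervalsSourceProvider.Fuzzy(
        "cald",                                   // term to expand
        FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH,  // prefix_length (0)
        FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS, // transpositions (true)
        Fuzziness.AUTO,                           // maximum edit distance
        null,                                     // analyzer: fall back to the field's search analyzer
        null);                                    // use_field: stay on the top-level field
    IntervalQueryBuilder query = new IntervalQueryBuilder("text", fuzzy);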
static class ScriptFilterSource extends FilteredIntervalsSource {
final IntervalFilterScript script;

View File

@ -861,6 +861,8 @@ public class SearchModule {
            IntervalsSourceProvider.Prefix.NAME, IntervalsSourceProvider.Prefix::new));
        namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
            IntervalsSourceProvider.Wildcard.NAME, IntervalsSourceProvider.Wildcard::new));
        namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
            IntervalsSourceProvider.Fuzzy.NAME, IntervalsSourceProvider.Fuzzy::new));
    }
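A minimal round-trip sketch of what this registration enables (assuming a constructed SearchModule instance named searchModule; the stream wrappers are the standard ones, nothing added by this commit):

    // The registry built from the module's entries now knows how to read a
    // provider that was written under the name "fuzzy".
    NamedWriteableRegistry registry = new NamedWriteableRegistry(searchModule.getNamedWriteables());
    BytesStreamOutput out = new BytesStreamOutput();
    out.writeNamedWriteable(new IntervalsSourceProvider.Fuzzy("cald", 0, true, Fuzziness.AUTO, null, null));
    StreamInput in = new NamedWriteableAwareStreamInput(out.bytes().streamInput(), registry);
    IntervalsSourceProvider roundTripped = in.readNamedWriteable(IntervalsSourceProvider.class);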
private void registerQuery(QuerySpec<?> spec) {

View File

@ -19,17 +19,22 @@
package org.elasticsearch.index.query;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.XIntervals;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.compress.CompressedXContent;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.mapper.MapperService;
@ -529,4 +534,57 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
        assertEquals(expected, builder.toQuery(createShardContext()));
    }

    private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
        FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
        return XIntervals.multiterm(new CompiledAutomaton(fq.toAutomaton()), label);
    }

    public void testFuzzy() throws IOException {
        String json = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
            "\"fuzzy\" : { \"term\" : \"Term\" } } } }";
        IntervalQueryBuilder builder = (IntervalQueryBuilder) parseQuery(json);
        Query expected = new IntervalQuery(STRING_FIELD_NAME,
            buildFuzzySource("term", "Term", FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH, true, Fuzziness.AUTO.asDistance("term")));
        assertEquals(expected, builder.toQuery(createShardContext()));

        String json_with_prefix = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2 } } } }";
        builder = (IntervalQueryBuilder) parseQuery(json_with_prefix);
        expected = new IntervalQuery(STRING_FIELD_NAME,
            buildFuzzySource("term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
        assertEquals(expected, builder.toQuery(createShardContext()));

        String json_with_fuzziness = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\" } } } }";
        builder = (IntervalQueryBuilder) parseQuery(json_with_fuzziness);
        expected = new IntervalQuery(STRING_FIELD_NAME,
            buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term")));
        assertEquals(expected, builder.toQuery(createShardContext()));

        String json_no_transpositions = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"transpositions\" : false } } } }";
        builder = (IntervalQueryBuilder) parseQuery(json_no_transpositions);
        expected = new IntervalQuery(STRING_FIELD_NAME,
            buildFuzzySource("term", "Term", 2, false, Fuzziness.AUTO.asDistance("term")));
        assertEquals(expected, builder.toQuery(createShardContext()));

        String json_with_analyzer = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"analyzer\" : \"keyword\" } } } }";
        builder = (IntervalQueryBuilder) parseQuery(json_with_analyzer);
        expected = new IntervalQuery(STRING_FIELD_NAME,
            buildFuzzySource("Term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
        assertEquals(expected, builder.toQuery(createShardContext()));

        String json_with_fixfield = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
            "\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\", " +
            "\"use_field\" : \"" + MASKED_FIELD + "\" } } } }";
        builder = (IntervalQueryBuilder) parseQuery(json_with_fixfield);
        expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD,
            buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term"))));
        assertEquals(expected, builder.toQuery(createShardContext()));
    }
}