Add fuzzy intervals source (#49762)
This intervals source will return terms that are similar to an input term, up to an edit distance defined by fuzziness, similar to FuzzyQuery. Closes #49595
This commit is contained in:
parent
a34b3f133c
commit
8b362c657b
|
@ -73,6 +73,7 @@ Valid rules include:
|
|||
* <<intervals-match,`match`>>
|
||||
* <<intervals-prefix,`prefix`>>
|
||||
* <<intervals-wildcard,`wildcard`>>
|
||||
* <<intervals-fuzzy,`fuzzy`>>
|
||||
* <<intervals-all_of,`all_of`>>
|
||||
* <<intervals-any_of,`any_of`>>
|
||||
--
|
||||
|
@ -97,7 +98,7 @@ set to `0`, the terms must appear next to each other.
|
|||
--
|
||||
|
||||
`ordered`::
|
||||
(Optional, boolean)
|
||||
(Optional, boolean)
|
||||
If `true`, matching terms must appear in their specified order. Defaults to
|
||||
`false`.
|
||||
|
||||
|
@ -177,6 +178,42 @@ The `pattern` is normalized using the search analyzer from this field, unless
|
|||
`analyzer` is specified separately.
|
||||
--
|
||||
|
||||
[[intervals-fuzzy]]
|
||||
==== `fuzzy` rule parameters
|
||||
|
||||
The `fuzzy` rule matches terms that are similar to the provided term, within an
|
||||
edit distance defined by <<fuzziness>>. If the fuzzy expansion matches more than
|
||||
128 terms, {es} returns an error.
|
||||
|
||||
`term`::
|
||||
(Required, string) The term to match.
|
||||
|
||||
`prefix_length`::
|
||||
(Optional, integer) Number of beginning characters left unchanged when creating
|
||||
expansions. Defaults to `0`.
|
||||
|
||||
`transpositions`::
|
||||
(Optional, boolean) Indicates whether edits include transpositions of two
|
||||
adjacent characters (ab → ba). Defaults to `true`.
|
||||
|
||||
`fuzziness`::
|
||||
(Optional, string) Maximum edit distance allowed for matching. See <<fuzziness>>
|
||||
for valid values and more information. Defaults to `auto`.
|
||||
|
||||
`analyzer`::
|
||||
(Optional, string) <<analysis, analyzer>> used to normalize the `term`.
|
||||
Defaults to the top-level `<field>` 's analyzer.
|
||||
|
||||
`use_field`::
|
||||
+
|
||||
--
|
||||
(Optional, string) If specified, match intervals from this field rather than the
|
||||
top-level `<field>`.
|
||||
|
||||
The `term` is normalized using the search analyzer from this field, unless
|
||||
`analyzer` is specified separately.
|
||||
--
|
||||
|
||||
[[intervals-all_of]]
|
||||
==== `all_of` rule parameters
|
||||
|
||||
|
|
|
@ -390,8 +390,8 @@ setup:
|
|||
---
|
||||
"Test prefix":
|
||||
- skip:
|
||||
version: " - 8.0.0"
|
||||
reason: "TODO: change to 7.3 in backport"
|
||||
version: " - 7.2.99"
|
||||
reason: "Implemented in 7.3"
|
||||
- do:
|
||||
search:
|
||||
index: test
|
||||
|
@ -410,8 +410,8 @@ setup:
|
|||
---
|
||||
"Test wildcard":
|
||||
- skip:
|
||||
version: " - 8.0.0"
|
||||
reason: "TODO: change to 7.3 in backport"
|
||||
version: " - 7.2.99"
|
||||
reason: "Implemented in 7.3"
|
||||
- do:
|
||||
search:
|
||||
index: test
|
||||
|
@ -427,3 +427,24 @@ setup:
|
|||
pattern: out?ide
|
||||
- match: { hits.total.value: 3 }
|
||||
|
||||
---
"Test fuzzy match":
  # The `fuzzy` intervals rule takes its input under the `term` key
  # (not `query`, which belongs to the `match` rule).
  - skip:
      version: " - 7.5.99"
      reason: "Implemented in 7.6"
  - do:
      search:
        index: test
        body:
          query:
            intervals:
              text:
                all_of:
                  intervals:
                    - fuzzy:
                        term: cald
                    - prefix:
                        prefix: out
  - match: { hits.total.value: 3 }
|
||||
|
||||
|
||||
|
|
|
@ -67,6 +67,10 @@ public final class XIntervals {
|
|||
return new MultiTermIntervalsSource(ca, 128, prefix.utf8ToString());
|
||||
}
|
||||
|
||||
public static IntervalsSource multiterm(CompiledAutomaton ca, String label) {
|
||||
return new MultiTermIntervalsSource(ca, 128, label);
|
||||
}
|
||||
|
||||
static class MultiTermIntervalsSource extends IntervalsSource {
|
||||
|
||||
private final CompiledAutomaton automaton;
|
||||
|
|
|
@ -20,12 +20,15 @@
|
|||
package org.elasticsearch.index.query;
|
||||
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.XIntervals;
|
||||
import org.apache.lucene.queries.intervals.FilteredIntervalsSource;
|
||||
import org.apache.lucene.queries.intervals.IntervalIterator;
|
||||
import org.apache.lucene.queries.intervals.Intervals;
|
||||
import org.apache.lucene.queries.intervals.IntervalsSource;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.ParsingException;
|
||||
|
@ -33,7 +36,9 @@ import org.elasticsearch.common.io.stream.NamedWriteable;
|
|||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
import org.elasticsearch.common.io.stream.Writeable;
|
||||
import org.elasticsearch.common.unit.Fuzziness;
|
||||
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
|
||||
import org.elasticsearch.common.xcontent.ObjectParser;
|
||||
import org.elasticsearch.common.xcontent.ToXContentFragment;
|
||||
import org.elasticsearch.common.xcontent.ToXContentObject;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
|
@ -85,6 +90,8 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
|
|||
return Prefix.fromXContent(parser);
|
||||
case "wildcard":
|
||||
return Wildcard.fromXContent(parser);
|
||||
case "fuzzy":
|
||||
return Fuzzy.fromXContent(parser);
|
||||
}
|
||||
throw new ParsingException(parser.getTokenLocation(),
|
||||
"Unknown interval type [" + parser.currentName() + "], expecting one of [match, any_of, all_of, prefix, wildcard]");
|
||||
|
@ -691,6 +698,148 @@ public abstract class IntervalsSourceProvider implements NamedWriteable, ToXCont
|
|||
}
|
||||
}
|
||||
|
||||
public static class Fuzzy extends IntervalsSourceProvider {
|
||||
|
||||
public static final String NAME = "fuzzy";
|
||||
|
||||
private final String term;
|
||||
private final int prefixLength;
|
||||
private final boolean transpositions;
|
||||
private final Fuzziness fuzziness;
|
||||
private final String analyzer;
|
||||
private final String useField;
|
||||
|
||||
public Fuzzy(String term, int prefixLength, boolean transpositions, Fuzziness fuzziness, String analyzer, String useField) {
|
||||
this.term = term;
|
||||
this.prefixLength = prefixLength;
|
||||
this.transpositions = transpositions;
|
||||
this.fuzziness = fuzziness;
|
||||
this.analyzer = analyzer;
|
||||
this.useField = useField;
|
||||
}
|
||||
|
||||
public Fuzzy(StreamInput in) throws IOException {
|
||||
this.term = in.readString();
|
||||
this.prefixLength = in.readVInt();
|
||||
this.transpositions = in.readBoolean();
|
||||
this.fuzziness = new Fuzziness(in);
|
||||
this.analyzer = in.readOptionalString();
|
||||
this.useField = in.readOptionalString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntervalsSource getSource(QueryShardContext context, MappedFieldType fieldType) {
|
||||
NamedAnalyzer analyzer = fieldType.searchAnalyzer();
|
||||
if (this.analyzer != null) {
|
||||
analyzer = context.getMapperService().getIndexAnalyzers().get(this.analyzer);
|
||||
}
|
||||
IntervalsSource source;
|
||||
if (useField != null) {
|
||||
fieldType = context.fieldMapper(useField);
|
||||
assert fieldType != null;
|
||||
checkPositions(fieldType);
|
||||
if (this.analyzer == null) {
|
||||
analyzer = fieldType.searchAnalyzer();
|
||||
}
|
||||
}
|
||||
checkPositions(fieldType);
|
||||
BytesRef normalizedTerm = analyzer.normalize(fieldType.name(), term);
|
||||
FuzzyQuery fq = new FuzzyQuery(new Term(fieldType.name(), normalizedTerm),
|
||||
fuzziness.asDistance(term), prefixLength, 128, transpositions);
|
||||
CompiledAutomaton ca = new CompiledAutomaton(fq.toAutomaton());
|
||||
source = XIntervals.multiterm(ca, term);
|
||||
if (useField != null) {
|
||||
source = Intervals.fixField(useField, source);
|
||||
}
|
||||
return source;
|
||||
}
|
||||
|
||||
private void checkPositions(MappedFieldType type) {
|
||||
if (type.indexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
|
||||
throw new IllegalArgumentException("Cannot create intervals over field [" + type.name() + "] with no positions indexed");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractFields(Set<String> fields) {
|
||||
if (useField != null) {
|
||||
fields.add(useField);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
Fuzzy fuzzy = (Fuzzy) o;
|
||||
return prefixLength == fuzzy.prefixLength &&
|
||||
transpositions == fuzzy.transpositions &&
|
||||
Objects.equals(term, fuzzy.term) &&
|
||||
Objects.equals(fuzziness, fuzzy.fuzziness) &&
|
||||
Objects.equals(analyzer, fuzzy.analyzer) &&
|
||||
Objects.equals(useField, fuzzy.useField);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(term, prefixLength, transpositions, fuzziness, analyzer, useField);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getWriteableName() {
|
||||
return NAME;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeString(term);
|
||||
out.writeVInt(prefixLength);
|
||||
out.writeBoolean(transpositions);
|
||||
fuzziness.writeTo(out);
|
||||
out.writeOptionalString(analyzer);
|
||||
out.writeOptionalString(useField);
|
||||
}
|
||||
|
||||
@Override
|
||||
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
|
||||
builder.startObject(NAME);
|
||||
builder.field("term", term);
|
||||
builder.field("prefix_length", prefixLength);
|
||||
builder.field("transpositions", transpositions);
|
||||
fuzziness.toXContent(builder, params);
|
||||
if (analyzer != null) {
|
||||
builder.field("analyzer", analyzer);
|
||||
}
|
||||
if (useField != null) {
|
||||
builder.field("use_field", useField);
|
||||
}
|
||||
builder.endObject();
|
||||
return builder;
|
||||
}
|
||||
|
||||
private static final ConstructingObjectParser<Fuzzy, Void> PARSER = new ConstructingObjectParser<>(NAME, args -> {
|
||||
String term = (String) args[0];
|
||||
int prefixLength = (args[1] == null) ? FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH : (int) args[1];
|
||||
boolean transpositions = (args[2] == null) ? FuzzyQueryBuilder.DEFAULT_TRANSPOSITIONS : (boolean) args[2];
|
||||
Fuzziness fuzziness = (args[3] == null) ? FuzzyQueryBuilder.DEFAULT_FUZZINESS : (Fuzziness) args[3];
|
||||
String analyzer = (String) args[4];
|
||||
String useField = (String) args[5];
|
||||
return new Fuzzy(term, prefixLength, transpositions, fuzziness, analyzer, useField);
|
||||
});
|
||||
static {
|
||||
PARSER.declareString(constructorArg(), new ParseField("term"));
|
||||
PARSER.declareInt(optionalConstructorArg(), new ParseField("prefix_length"));
|
||||
PARSER.declareBoolean(optionalConstructorArg(), new ParseField("transpositions"));
|
||||
PARSER.declareField(optionalConstructorArg(), (p, c) -> Fuzziness.parse(p), Fuzziness.FIELD, ObjectParser.ValueType.VALUE);
|
||||
PARSER.declareString(optionalConstructorArg(), new ParseField("analyzer"));
|
||||
PARSER.declareString(optionalConstructorArg(), new ParseField("use_field"));
|
||||
}
|
||||
|
||||
public static Fuzzy fromXContent(XContentParser parser) throws IOException {
|
||||
return PARSER.parse(parser, null);
|
||||
}
|
||||
}
|
||||
|
||||
static class ScriptFilterSource extends FilteredIntervalsSource {
|
||||
|
||||
final IntervalFilterScript script;
|
||||
|
|
|
@ -861,6 +861,8 @@ public class SearchModule {
|
|||
IntervalsSourceProvider.Prefix.NAME, IntervalsSourceProvider.Prefix::new));
|
||||
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
|
||||
IntervalsSourceProvider.Wildcard.NAME, IntervalsSourceProvider.Wildcard::new));
|
||||
namedWriteables.add(new NamedWriteableRegistry.Entry(IntervalsSourceProvider.class,
|
||||
IntervalsSourceProvider.Fuzzy.NAME, IntervalsSourceProvider.Fuzzy::new));
|
||||
}
|
||||
|
||||
private void registerQuery(QuerySpec<?> spec) {
|
||||
|
|
|
@ -19,17 +19,22 @@
|
|||
|
||||
package org.elasticsearch.index.query;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queries.XIntervals;
|
||||
import org.apache.lucene.queries.intervals.IntervalQuery;
|
||||
import org.apache.lucene.queries.intervals.Intervals;
|
||||
import org.apache.lucene.queries.intervals.IntervalsSource;
|
||||
import org.apache.lucene.search.BoostQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.MatchNoDocsQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.elasticsearch.common.ParsingException;
|
||||
import org.elasticsearch.common.Strings;
|
||||
import org.elasticsearch.common.compress.CompressedXContent;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.unit.Fuzziness;
|
||||
import org.elasticsearch.common.util.BigArrays;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.index.mapper.MapperService;
|
||||
|
@ -529,4 +534,57 @@ public class IntervalQueryBuilderTests extends AbstractQueryTestCase<IntervalQue
|
|||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
}
|
||||
|
||||
private static IntervalsSource buildFuzzySource(String term, String label, int prefixLength, boolean transpositions, int editDistance) {
|
||||
FuzzyQuery fq = new FuzzyQuery(new Term("field", term), editDistance, prefixLength, 128, transpositions);
|
||||
return XIntervals.multiterm(new CompiledAutomaton(fq.toAutomaton()), label);
|
||||
}
|
||||
|
||||
public void testFuzzy() throws IOException {
|
||||
|
||||
String json = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
|
||||
"\"fuzzy\" : { \"term\" : \"Term\" } } } }";
|
||||
IntervalQueryBuilder builder = (IntervalQueryBuilder) parseQuery(json);
|
||||
|
||||
Query expected = new IntervalQuery(STRING_FIELD_NAME,
|
||||
buildFuzzySource("term", "Term", FuzzyQueryBuilder.DEFAULT_PREFIX_LENGTH, true, Fuzziness.AUTO.asDistance("term")));
|
||||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
|
||||
String json_with_prefix = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
|
||||
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2 } } } }";
|
||||
builder = (IntervalQueryBuilder) parseQuery(json_with_prefix);
|
||||
expected = new IntervalQuery(STRING_FIELD_NAME,
|
||||
buildFuzzySource("term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
|
||||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
|
||||
String json_with_fuzziness = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
|
||||
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\" } } } }";
|
||||
builder = (IntervalQueryBuilder) parseQuery(json_with_fuzziness);
|
||||
expected = new IntervalQuery(STRING_FIELD_NAME,
|
||||
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term")));
|
||||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
|
||||
String json_no_transpositions = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
|
||||
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"transpositions\" : false } } } }";
|
||||
builder = (IntervalQueryBuilder) parseQuery(json_no_transpositions);
|
||||
expected = new IntervalQuery(STRING_FIELD_NAME,
|
||||
buildFuzzySource("term", "Term", 2, false, Fuzziness.AUTO.asDistance("term")));
|
||||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
|
||||
String json_with_analyzer = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
|
||||
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"analyzer\" : \"keyword\" } } } }";
|
||||
builder = (IntervalQueryBuilder) parseQuery(json_with_analyzer);
|
||||
expected = new IntervalQuery(STRING_FIELD_NAME,
|
||||
buildFuzzySource("Term", "Term", 2, true, Fuzziness.AUTO.asDistance("term")));
|
||||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
|
||||
String json_with_fixfield = "{ \"intervals\" : { \"" + STRING_FIELD_NAME + "\": { " +
|
||||
"\"fuzzy\" : { \"term\" : \"Term\", \"prefix_length\" : 2, \"fuzziness\" : \"1\", " +
|
||||
"\"use_field\" : \"" + MASKED_FIELD + "\" } } } }";
|
||||
builder = (IntervalQueryBuilder) parseQuery(json_with_fixfield);
|
||||
expected = new IntervalQuery(STRING_FIELD_NAME, Intervals.fixField(MASKED_FIELD,
|
||||
buildFuzzySource("term", "Term", 2, true, Fuzziness.ONE.asDistance("term"))));
|
||||
assertEquals(expected, builder.toQuery(createShardContext()));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue