Configurable distance limit with the AUTO fuzziness. (#25731)
Make the distance thresholds configurable with the AUTO fuzziness.
This commit is contained in:
parent
e89d9400c9
commit
93cc2d0372
|
@ -18,6 +18,8 @@
|
|||
*/
|
||||
package org.elasticsearch.common.unit;
|
||||
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.ParseField;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.io.stream.StreamOutput;
|
||||
|
@ -43,8 +45,12 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
|
|||
public static final Fuzziness TWO = new Fuzziness(2);
|
||||
public static final Fuzziness AUTO = new Fuzziness("AUTO");
|
||||
public static final ParseField FIELD = new ParseField(X_FIELD_NAME);
|
||||
private static final int DEFAULT_LOW_DISTANCE = 3;
|
||||
private static final int DEFAULT_HIGH_DISTANCE = 6;
|
||||
|
||||
private final String fuzziness;
|
||||
private int lowDistance = DEFAULT_LOW_DISTANCE;
|
||||
private int highDistance = DEFAULT_HIGH_DISTANCE;
|
||||
|
||||
private Fuzziness(int fuzziness) {
|
||||
if (fuzziness != 0 && fuzziness != 1 && fuzziness != 2) {
|
||||
|
@ -54,22 +60,48 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
|
|||
}
|
||||
|
||||
private Fuzziness(String fuzziness) {
|
||||
if (fuzziness == null) {
|
||||
if (fuzziness == null || fuzziness.isEmpty()) {
|
||||
throw new IllegalArgumentException("fuzziness can't be null!");
|
||||
}
|
||||
this.fuzziness = fuzziness.toUpperCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
/**
 * Creates a fuzziness that carries custom low/high distance thresholds.
 *
 * @param fuzziness    the fuzziness name, validated and normalized by the single-argument constructor
 * @param lowDistance  the low threshold; must not be negative and must not exceed {@code highDistance}
 * @param highDistance the high threshold; must not be negative
 * @throws IllegalArgumentException if either threshold is negative or {@code lowDistance > highDistance}
 */
private Fuzziness(String fuzziness, int lowDistance, int highDistance) {
    this(fuzziness);
    // Zero is accepted by this check, so the message states ">= 0" to match what is actually enforced.
    if (lowDistance < 0 || highDistance < 0 || lowDistance > highDistance) {
        throw new IllegalArgumentException("fuzziness wrongly configured, must be: lowDistance >= 0, highDistance" +
            " >= 0 and lowDistance <= highDistance ");
    }
    this.lowDistance = lowDistance;
    this.highDistance = highDistance;
}
|
||||
|
||||
/**
|
||||
* Read from a stream.
|
||||
*/
|
||||
public Fuzziness(StreamInput in) throws IOException {
|
||||
fuzziness = in.readString();
|
||||
if (in.getVersion().onOrAfter(Version.V_6_1_0) && in.readBoolean()) {
|
||||
lowDistance = in.readVInt();
|
||||
highDistance = in.readVInt();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeString(fuzziness);
|
||||
if (out.getVersion().onOrAfter(Version.V_6_1_0)) {
|
||||
// we cannot serialize the low/high bounds since the other node does not know about them.
|
||||
// This is a best-effort to not fail queries in case the cluster is being upgraded and users
|
||||
// start using features that are not available on all nodes.
|
||||
if (isAutoWithCustomValues()) {
|
||||
out.writeBoolean(true);
|
||||
out.writeVInt(lowDistance);
|
||||
out.writeVInt(highDistance);
|
||||
} else {
|
||||
out.writeBoolean(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -88,10 +120,29 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
|
|||
String string = fuzziness.toString();
|
||||
if (AUTO.asString().equalsIgnoreCase(string)) {
|
||||
return AUTO;
|
||||
} else if (string.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":")) {
|
||||
return parseCustomAuto(string);
|
||||
}
|
||||
return new Fuzziness(string);
|
||||
}
|
||||
|
||||
private static Fuzziness parseCustomAuto( final String string) {
|
||||
assert string.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":");
|
||||
String[] fuzzinessLimit = string.substring(AUTO.asString().length() + 1).split(",");
|
||||
if (fuzzinessLimit.length == 2) {
|
||||
try {
|
||||
int lowerLimit = Integer.parseInt(fuzzinessLimit[0]);
|
||||
int highLimit = Integer.parseInt(fuzzinessLimit[1]);
|
||||
return new Fuzziness("AUTO", lowerLimit, highLimit);
|
||||
} catch (NumberFormatException e) {
|
||||
throw new ElasticsearchParseException("failed to parse [{}] as a \"auto:int,int\"", e,
|
||||
string);
|
||||
}
|
||||
} else {
|
||||
throw new ElasticsearchParseException("failed to find low and high distance values");
|
||||
}
|
||||
}
|
||||
|
||||
public static Fuzziness parse(XContentParser parser) throws IOException {
|
||||
XContentParser.Token token = parser.currentToken();
|
||||
switch (token) {
|
||||
|
@ -100,6 +151,8 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
|
|||
final String fuzziness = parser.text();
|
||||
if (AUTO.asString().equalsIgnoreCase(fuzziness)) {
|
||||
return AUTO;
|
||||
} else if (fuzziness.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":")) {
|
||||
return parseCustomAuto(fuzziness);
|
||||
}
|
||||
try {
|
||||
final int minimumSimilarity = Integer.parseInt(fuzziness);
|
||||
|
@ -135,19 +188,19 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
|
|||
public int asDistance(String text) {
|
||||
if (this.equals(AUTO)) { //AUTO
|
||||
final int len = termLen(text);
|
||||
if (len <= 2) {
|
||||
if (len < lowDistance) {
|
||||
return 0;
|
||||
} else if (len > 5) {
|
||||
return 2;
|
||||
} else {
|
||||
} else if (len < highDistance) {
|
||||
return 1;
|
||||
} else {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
return Math.min(2, (int) asFloat());
|
||||
}
|
||||
|
||||
public float asFloat() {
|
||||
if (this.equals(AUTO)) {
|
||||
if (this.equals(AUTO) || isAutoWithCustomValues()) {
|
||||
return 1f;
|
||||
}
|
||||
return Float.parseFloat(fuzziness.toString());
|
||||
|
@ -158,9 +211,17 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
|
|||
}
|
||||
|
||||
public String asString() {
|
||||
if (isAutoWithCustomValues()) {
|
||||
return fuzziness.toString() + ":" + lowDistance + "," + highDistance;
|
||||
}
|
||||
return fuzziness.toString();
|
||||
}
|
||||
|
||||
private boolean isAutoWithCustomValues() {
|
||||
return fuzziness.startsWith("AUTO") && (lowDistance != DEFAULT_LOW_DISTANCE ||
|
||||
highDistance != DEFAULT_HIGH_DISTANCE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
*/
|
||||
package org.elasticsearch.common.unit;
|
||||
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.io.stream.BytesStreamOutput;
|
||||
import org.elasticsearch.common.io.stream.StreamInput;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
|
@ -49,8 +50,8 @@ public class FuzzinessTests extends ESTestCase {
|
|||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_NUMBER));
|
||||
Fuzziness parse = Fuzziness.parse(parser);
|
||||
assertThat(parse.asFloat(), equalTo(floatValue));
|
||||
Fuzziness fuzziness = Fuzziness.parse(parser);
|
||||
assertThat(fuzziness.asFloat(), equalTo(floatValue));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
|
||||
}
|
||||
{
|
||||
|
@ -67,21 +68,21 @@ public class FuzzinessTests extends ESTestCase {
|
|||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
|
||||
assertThat(parser.nextToken(), anyOf(equalTo(XContentParser.Token.VALUE_NUMBER), equalTo(XContentParser.Token.VALUE_STRING)));
|
||||
Fuzziness parse = Fuzziness.parse(parser);
|
||||
Fuzziness fuzziness = Fuzziness.parse(parser);
|
||||
if (value.intValue() >= 1) {
|
||||
assertThat(parse.asDistance(), equalTo(Math.min(2, value.intValue())));
|
||||
assertThat(fuzziness.asDistance(), equalTo(Math.min(2, value.intValue())));
|
||||
}
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
|
||||
if (intValue.equals(value)) {
|
||||
switch (intValue) {
|
||||
case 1:
|
||||
assertThat(parse, sameInstance(Fuzziness.ONE));
|
||||
assertThat(fuzziness, sameInstance(Fuzziness.ONE));
|
||||
break;
|
||||
case 2:
|
||||
assertThat(parse, sameInstance(Fuzziness.TWO));
|
||||
assertThat(fuzziness, sameInstance(Fuzziness.TWO));
|
||||
break;
|
||||
case 0:
|
||||
assertThat(parse, sameInstance(Fuzziness.ZERO));
|
||||
assertThat(fuzziness, sameInstance(Fuzziness.ZERO));
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
@ -90,19 +91,26 @@ public class FuzzinessTests extends ESTestCase {
|
|||
}
|
||||
{
|
||||
XContentBuilder json;
|
||||
if (randomBoolean()) {
|
||||
boolean isDefaultAutoFuzzinessTested = randomBoolean();
|
||||
if (isDefaultAutoFuzzinessTested) {
|
||||
json = Fuzziness.AUTO.toXContent(jsonBuilder().startObject(), null).endObject();
|
||||
} else {
|
||||
String auto = randomBoolean() ? "AUTO" : "auto";
|
||||
if (randomBoolean()) {
|
||||
auto += ":" + randomIntBetween(1, 3) + "," + randomIntBetween(4, 10);
|
||||
}
|
||||
json = jsonBuilder().startObject()
|
||||
.field(Fuzziness.X_FIELD_NAME, randomBoolean() ? "AUTO" : "auto")
|
||||
.endObject();
|
||||
.field(Fuzziness.X_FIELD_NAME, auto)
|
||||
.endObject();
|
||||
}
|
||||
XContentParser parser = createParser(json);
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_STRING));
|
||||
Fuzziness parse = Fuzziness.parse(parser);
|
||||
assertThat(parse, sameInstance(Fuzziness.AUTO));
|
||||
Fuzziness fuzziness = Fuzziness.parse(parser);
|
||||
if (isDefaultAutoFuzzinessTested) {
|
||||
assertThat(fuzziness, sameInstance(Fuzziness.AUTO));
|
||||
}
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
|
||||
}
|
||||
}
|
||||
|
@ -132,13 +140,30 @@ public class FuzzinessTests extends ESTestCase {
|
|||
assertEquals(fuzziness, deserializedFuzziness);
|
||||
}
|
||||
|
||||
public void testSerializationAuto() throws IOException {
|
||||
public void testSerializationDefaultAuto() throws IOException {
|
||||
Fuzziness fuzziness = Fuzziness.AUTO;
|
||||
Fuzziness deserializedFuzziness = doSerializeRoundtrip(fuzziness);
|
||||
assertEquals(fuzziness, deserializedFuzziness);
|
||||
assertEquals(fuzziness.asFloat(), deserializedFuzziness.asFloat(), 0f);
|
||||
}
|
||||
|
||||
public void testSerializationCustomAuto() throws IOException {
|
||||
String auto = "AUTO:4,7";
|
||||
XContentBuilder json = jsonBuilder().startObject()
|
||||
.field(Fuzziness.X_FIELD_NAME, auto)
|
||||
.endObject();
|
||||
|
||||
XContentParser parser = createParser(json);
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
|
||||
assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_STRING));
|
||||
Fuzziness fuzziness = Fuzziness.parse(parser);
|
||||
|
||||
Fuzziness deserializedFuzziness = doSerializeRoundtrip(fuzziness);
|
||||
assertEquals(fuzziness, deserializedFuzziness);
|
||||
assertEquals(fuzziness.asString(), deserializedFuzziness.asString());
|
||||
}
|
||||
|
||||
private static Fuzziness doSerializeRoundtrip(Fuzziness in) throws IOException {
|
||||
BytesStreamOutput output = new BytesStreamOutput();
|
||||
in.writeTo(output);
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.search.BoostQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.elasticsearch.ElasticsearchParseException;
|
||||
import org.elasticsearch.common.ParsingException;
|
||||
import org.elasticsearch.common.unit.Fuzziness;
|
||||
import org.elasticsearch.search.internal.SearchContext;
|
||||
|
@ -120,6 +121,92 @@ public class FuzzyQueryBuilderTests extends AbstractQueryTestCase<FuzzyQueryBuil
|
|||
assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
|
||||
}
|
||||
|
||||
public void testToQueryWithStringFieldDefinedFuzziness() throws IOException {
|
||||
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
|
||||
String query = "{\n" +
|
||||
" \"fuzzy\":{\n" +
|
||||
" \"" + STRING_FIELD_NAME + "\":{\n" +
|
||||
" \"value\":\"sh\",\n" +
|
||||
" \"fuzziness\": \"AUTO:2,5\",\n" +
|
||||
" \"prefix_length\":1,\n" +
|
||||
" \"boost\":2.0\n" +
|
||||
" }\n" +
|
||||
" }\n" +
|
||||
"}";
|
||||
Query parsedQuery = parseQuery(query).toQuery(createShardContext());
|
||||
assertThat(parsedQuery, instanceOf(BoostQuery.class));
|
||||
BoostQuery boostQuery = (BoostQuery) parsedQuery;
|
||||
assertThat(boostQuery.getBoost(), equalTo(2.0f));
|
||||
assertThat(boostQuery.getQuery(), instanceOf(FuzzyQuery.class));
|
||||
FuzzyQuery fuzzyQuery = (FuzzyQuery) boostQuery.getQuery();
|
||||
assertThat(fuzzyQuery.getTerm(), equalTo(new Term(STRING_FIELD_NAME, "sh")));
|
||||
assertThat(fuzzyQuery.getMaxEdits(), equalTo(1));
|
||||
assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
|
||||
}
|
||||
|
||||
public void testToQueryWithStringFieldDefinedWrongFuzziness() throws IOException {
|
||||
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
|
||||
String queryMissingFuzzinessUpLimit = "{\n" +
|
||||
" \"fuzzy\":{\n" +
|
||||
" \"" + STRING_FIELD_NAME + "\":{\n" +
|
||||
" \"value\":\"sh\",\n" +
|
||||
" \"fuzziness\": \"AUTO:2\",\n" +
|
||||
" \"prefix_length\":1,\n" +
|
||||
" \"boost\":2.0\n" +
|
||||
" }\n" +
|
||||
" }\n" +
|
||||
"}";
|
||||
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class,
|
||||
() -> parseQuery(queryMissingFuzzinessUpLimit).toQuery(createShardContext()));
|
||||
String msg = "failed to find low and high distance values";
|
||||
assertTrue(e.getMessage() + " didn't contain: " + msg + " but: " + e.getMessage(), e.getMessage().contains(msg));
|
||||
|
||||
String queryHavingNegativeFuzzinessLowLimit = "{\n" +
|
||||
" \"fuzzy\":{\n" +
|
||||
" \"" + STRING_FIELD_NAME + "\":{\n" +
|
||||
" \"value\":\"sh\",\n" +
|
||||
" \"fuzziness\": \"AUTO:-1,6\",\n" +
|
||||
" \"prefix_length\":1,\n" +
|
||||
" \"boost\":2.0\n" +
|
||||
" }\n" +
|
||||
" }\n" +
|
||||
"}";
|
||||
String msg2 = "fuzziness wrongly configured";
|
||||
IllegalArgumentException e2 = expectThrows(IllegalArgumentException.class,
|
||||
() -> parseQuery(queryHavingNegativeFuzzinessLowLimit).toQuery(createShardContext()));
|
||||
assertTrue(e2.getMessage() + " didn't contain: " + msg2 + " but: " + e.getMessage(), e.getMessage().contains
|
||||
(msg));
|
||||
|
||||
String queryMissingFuzzinessUpLimit2 = "{\n" +
|
||||
" \"fuzzy\":{\n" +
|
||||
" \"" + STRING_FIELD_NAME + "\":{\n" +
|
||||
" \"value\":\"sh\",\n" +
|
||||
" \"fuzziness\": \"AUTO:1,\",\n" +
|
||||
" \"prefix_length\":1,\n" +
|
||||
" \"boost\":2.0\n" +
|
||||
" }\n" +
|
||||
" }\n" +
|
||||
"}";
|
||||
e = expectThrows(ElasticsearchParseException.class,
|
||||
() -> parseQuery(queryMissingFuzzinessUpLimit2).toQuery(createShardContext()));
|
||||
assertTrue(e.getMessage() + " didn't contain: " + msg + " but: " + e.getMessage(), e.getMessage().contains(msg));
|
||||
|
||||
String queryMissingFuzzinessLowLimit = "{\n" +
|
||||
" \"fuzzy\":{\n" +
|
||||
" \"" + STRING_FIELD_NAME + "\":{\n" +
|
||||
" \"value\":\"sh\",\n" +
|
||||
" \"fuzziness\": \"AUTO:,5\",\n" +
|
||||
" \"prefix_length\":1,\n" +
|
||||
" \"boost\":2.0\n" +
|
||||
" }\n" +
|
||||
" }\n" +
|
||||
"}";
|
||||
e = expectThrows(ElasticsearchParseException.class,
|
||||
() -> parseQuery(queryMissingFuzzinessLowLimit).toQuery(createShardContext()));
|
||||
msg = "failed to parse [AUTO:,5] as a \"auto:int,int\"";
|
||||
assertTrue(e.getMessage() + " didn't contain: " + msg + " but: " + e.getMessage(), e.getMessage().contains(msg));
|
||||
}
|
||||
|
||||
public void testToQueryWithNumericField() throws IOException {
|
||||
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
|
||||
String query = "{\n" +
|
||||
|
|
|
@ -577,7 +577,9 @@ the maximum allowed Levenshtein Edit Distance (or number of edits)
|
|||
`AUTO`::
|
||||
+
|
||||
--
|
||||
generates an edit distance based on the length of the term. For lengths:
|
||||
generates an edit distance based on the length of the term.
|
||||
Low and high distance arguments may optionally be provided as `AUTO:[low],[high]`. If not specified,
|
||||
the default values are 3 and 6, equivalent to `AUTO:3,6`, which means that for lengths:
|
||||
|
||||
`0..2`:: must match exactly
|
||||
`3..5`:: one edit allowed
|
||||
|
|
Loading…
Reference in New Issue