Configurable distance limit with the AUTO fuzziness. (#25731)

Make the distance thresholds configurable with the AUTO fuzziness.
This commit is contained in:
Antonio Matarrese 2017-08-21 12:00:20 +03:00 committed by Adrien Grand
parent e89d9400c9
commit 93cc2d0372
4 changed files with 195 additions and 20 deletions

View File

@ -18,6 +18,8 @@
*/
package org.elasticsearch.common.unit;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@ -43,8 +45,12 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
public static final Fuzziness TWO = new Fuzziness(2);
public static final Fuzziness AUTO = new Fuzziness("AUTO");
public static final ParseField FIELD = new ParseField(X_FIELD_NAME);
private static final int DEFAULT_LOW_DISTANCE = 3;
private static final int DEFAULT_HIGH_DISTANCE = 6;
private final String fuzziness;
private int lowDistance = DEFAULT_LOW_DISTANCE;
private int highDistance = DEFAULT_HIGH_DISTANCE;
private Fuzziness(int fuzziness) {
if (fuzziness != 0 && fuzziness != 1 && fuzziness != 2) {
@ -54,22 +60,48 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
}
private Fuzziness(String fuzziness) {
if (fuzziness == null) {
if (fuzziness == null || fuzziness.isEmpty()) {
throw new IllegalArgumentException("fuzziness can't be null!");
}
this.fuzziness = fuzziness.toUpperCase(Locale.ROOT);
}
/**
 * Creates a fuzziness with custom low/high distance thresholds.
 *
 * @param fuzziness    the fuzziness name; null/empty checking and upper-casing are done by the
 *                     single-argument constructor this delegates to
 * @param lowDistance  lower threshold, must be non-negative and not greater than {@code highDistance}
 * @param highDistance upper threshold, must be non-negative
 * @throws IllegalArgumentException if either threshold is negative or {@code lowDistance > highDistance}
 */
private Fuzziness(String fuzziness, int lowDistance, int highDistance) {
    this(fuzziness);
    // Zero is accepted by this check, so the message must say ">= 0" rather than "> 0".
    if (lowDistance < 0 || highDistance < 0 || lowDistance > highDistance) {
        throw new IllegalArgumentException("fuzziness wrongly configured, must be: lowDistance >= 0, highDistance" +
            " >= 0 and lowDistance <= highDistance");
    }
    this.lowDistance = lowDistance;
    this.highDistance = highDistance;
}
/**
* Read from a stream.
*/
public Fuzziness(StreamInput in) throws IOException {
fuzziness = in.readString();
if (in.getVersion().onOrAfter(Version.V_6_1_0) && in.readBoolean()) {
lowDistance = in.readVInt();
highDistance = in.readVInt();
}
}
/**
 * Writes the fuzziness string and, for receivers on {@code V_6_1_0} or later, a flag
 * telling whether custom low/high distances follow, then those distances if so.
 */
@Override
public void writeTo(StreamOutput out) throws IOException {
    out.writeString(fuzziness);
    if (out.getVersion().onOrAfter(Version.V_6_1_0)) {
        final boolean customBounds = isAutoWithCustomValues();
        out.writeBoolean(customBounds);
        if (customBounds) {
            out.writeVInt(lowDistance);
            out.writeVInt(highDistance);
        }
    }
    // For older receivers the low/high bounds are not serialized, since those nodes do not
    // know about them. This is a best-effort to not fail queries while the cluster is being
    // upgraded and users start using features that are not available on all nodes.
}
/**
@ -88,10 +120,29 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
String string = fuzziness.toString();
if (AUTO.asString().equalsIgnoreCase(string)) {
return AUTO;
} else if (string.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":")) {
return parseCustomAuto(string);
}
return new Fuzziness(string);
}
/**
 * Parses a custom AUTO fuzziness of the form {@code AUTO:<low>,<high>}.
 *
 * @param string the raw value; callers must already have checked that it starts with the AUTO prefix
 * @return a fuzziness carrying the parsed low and high distances
 * @throws ElasticsearchParseException if there are not exactly two comma-separated values,
 *                                     or if either of them is not an integer
 */
private static Fuzziness parseCustomAuto(final String string) {
    final String autoPrefix = AUTO.asString() + ":";
    assert string.toUpperCase(Locale.ROOT).startsWith(autoPrefix);
    final String[] bounds = string.substring(autoPrefix.length()).split(",");
    if (bounds.length != 2) {
        throw new ElasticsearchParseException("failed to find low and high distance values");
    }
    try {
        final int low = Integer.parseInt(bounds[0]);
        final int high = Integer.parseInt(bounds[1]);
        return new Fuzziness("AUTO", low, high);
    } catch (NumberFormatException e) {
        throw new ElasticsearchParseException("failed to parse [{}] as a \"auto:int,int\"", e,
            string);
    }
}
public static Fuzziness parse(XContentParser parser) throws IOException {
XContentParser.Token token = parser.currentToken();
switch (token) {
@ -100,6 +151,8 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
final String fuzziness = parser.text();
if (AUTO.asString().equalsIgnoreCase(fuzziness)) {
return AUTO;
} else if (fuzziness.toUpperCase(Locale.ROOT).startsWith(AUTO.asString() + ":")) {
return parseCustomAuto(fuzziness);
}
try {
final int minimumSimilarity = Integer.parseInt(fuzziness);
@ -135,19 +188,19 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
public int asDistance(String text) {
if (this.equals(AUTO)) { //AUTO
final int len = termLen(text);
if (len <= 2) {
if (len < lowDistance) {
return 0;
} else if (len > 5) {
return 2;
} else {
} else if (len < highDistance) {
return 1;
} else {
return 2;
}
}
return Math.min(2, (int) asFloat());
}
public float asFloat() {
if (this.equals(AUTO)) {
if (this.equals(AUTO) || isAutoWithCustomValues()) {
return 1f;
}
return Float.parseFloat(fuzziness.toString());
@ -158,9 +211,17 @@ public final class Fuzziness implements ToXContentFragment, Writeable {
}
/**
 * Returns the string form of this fuzziness. When custom AUTO thresholds are set,
 * they are appended as {@code :<low>,<high>}.
 */
public String asString() {
    final String base = fuzziness.toString();
    return isAutoWithCustomValues()
        ? base + ":" + lowDistance + "," + highDistance
        : base;
}
/**
 * Tells whether this is an AUTO fuzziness whose low or high distance differs from the defaults.
 */
private boolean isAutoWithCustomValues() {
    if (fuzziness.startsWith("AUTO") == false) {
        return false;
    }
    final boolean usesDefaults = lowDistance == DEFAULT_LOW_DISTANCE && highDistance == DEFAULT_HIGH_DISTANCE;
    return usesDefaults == false;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {

View File

@ -18,6 +18,7 @@
*/
package org.elasticsearch.common.unit;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.xcontent.XContentBuilder;
@ -49,8 +50,8 @@ public class FuzzinessTests extends ESTestCase {
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_NUMBER));
Fuzziness parse = Fuzziness.parse(parser);
assertThat(parse.asFloat(), equalTo(floatValue));
Fuzziness fuzziness = Fuzziness.parse(parser);
assertThat(fuzziness.asFloat(), equalTo(floatValue));
assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
}
{
@ -67,21 +68,21 @@ public class FuzzinessTests extends ESTestCase {
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
assertThat(parser.nextToken(), anyOf(equalTo(XContentParser.Token.VALUE_NUMBER), equalTo(XContentParser.Token.VALUE_STRING)));
Fuzziness parse = Fuzziness.parse(parser);
Fuzziness fuzziness = Fuzziness.parse(parser);
if (value.intValue() >= 1) {
assertThat(parse.asDistance(), equalTo(Math.min(2, value.intValue())));
assertThat(fuzziness.asDistance(), equalTo(Math.min(2, value.intValue())));
}
assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
if (intValue.equals(value)) {
switch (intValue) {
case 1:
assertThat(parse, sameInstance(Fuzziness.ONE));
assertThat(fuzziness, sameInstance(Fuzziness.ONE));
break;
case 2:
assertThat(parse, sameInstance(Fuzziness.TWO));
assertThat(fuzziness, sameInstance(Fuzziness.TWO));
break;
case 0:
assertThat(parse, sameInstance(Fuzziness.ZERO));
assertThat(fuzziness, sameInstance(Fuzziness.ZERO));
break;
default:
break;
@ -90,19 +91,26 @@ public class FuzzinessTests extends ESTestCase {
}
{
XContentBuilder json;
if (randomBoolean()) {
boolean isDefaultAutoFuzzinessTested = randomBoolean();
if (isDefaultAutoFuzzinessTested) {
json = Fuzziness.AUTO.toXContent(jsonBuilder().startObject(), null).endObject();
} else {
String auto = randomBoolean() ? "AUTO" : "auto";
if (randomBoolean()) {
auto += ":" + randomIntBetween(1, 3) + "," + randomIntBetween(4, 10);
}
json = jsonBuilder().startObject()
.field(Fuzziness.X_FIELD_NAME, randomBoolean() ? "AUTO" : "auto")
.endObject();
.field(Fuzziness.X_FIELD_NAME, auto)
.endObject();
}
XContentParser parser = createParser(json);
assertThat(parser.nextToken(), equalTo(XContentParser.Token.START_OBJECT));
assertThat(parser.nextToken(), equalTo(XContentParser.Token.FIELD_NAME));
assertThat(parser.nextToken(), equalTo(XContentParser.Token.VALUE_STRING));
Fuzziness parse = Fuzziness.parse(parser);
assertThat(parse, sameInstance(Fuzziness.AUTO));
Fuzziness fuzziness = Fuzziness.parse(parser);
if (isDefaultAutoFuzzinessTested) {
assertThat(fuzziness, sameInstance(Fuzziness.AUTO));
}
assertThat(parser.nextToken(), equalTo(XContentParser.Token.END_OBJECT));
}
}
@ -132,13 +140,30 @@ public class FuzzinessTests extends ESTestCase {
assertEquals(fuzziness, deserializedFuzziness);
}
public void testSerializationAuto() throws IOException {
public void testSerializationDefaultAuto() throws IOException {
Fuzziness fuzziness = Fuzziness.AUTO;
Fuzziness deserializedFuzziness = doSerializeRoundtrip(fuzziness);
assertEquals(fuzziness, deserializedFuzziness);
assertEquals(fuzziness.asFloat(), deserializedFuzziness.asFloat(), 0f);
}
/**
 * Parses a custom AUTO fuzziness from JSON and checks that it survives a serialization round trip.
 */
public void testSerializationCustomAuto() throws IOException {
    XContentBuilder content = jsonBuilder().startObject()
        .field(Fuzziness.X_FIELD_NAME, "AUTO:4,7")
        .endObject();
    XContentParser parser = createParser(content);

    // Advance the parser to the string value before handing it to Fuzziness.parse.
    XContentParser.Token[] expectedTokens = {
        XContentParser.Token.START_OBJECT,
        XContentParser.Token.FIELD_NAME,
        XContentParser.Token.VALUE_STRING
    };
    for (XContentParser.Token expected : expectedTokens) {
        assertThat(parser.nextToken(), equalTo(expected));
    }

    Fuzziness original = Fuzziness.parse(parser);
    Fuzziness roundTripped = doSerializeRoundtrip(original);
    assertEquals(original, roundTripped);
    assertEquals(original.asString(), roundTripped.asString());
}
private static Fuzziness doSerializeRoundtrip(Fuzziness in) throws IOException {
BytesStreamOutput output = new BytesStreamOutput();
in.writeTo(output);

View File

@ -23,6 +23,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.Query;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.unit.Fuzziness;
import org.elasticsearch.search.internal.SearchContext;
@ -120,6 +121,92 @@ public class FuzzyQueryBuilderTests extends AbstractQueryTestCase<FuzzyQueryBuil
assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
}
/**
 * Checks that a fuzzy query on the string field with fuzziness {@code AUTO:2,5} and value "sh"
 * is built as a boosted {@link FuzzyQuery} allowing one edit.
 */
public void testToQueryWithStringFieldDefinedFuzziness() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
    final String json = "{\n" +
        " \"fuzzy\":{\n" +
        " \"" + STRING_FIELD_NAME + "\":{\n" +
        " \"value\":\"sh\",\n" +
        " \"fuzziness\": \"AUTO:2,5\",\n" +
        " \"prefix_length\":1,\n" +
        " \"boost\":2.0\n" +
        " }\n" +
        " }\n" +
        "}";

    final Query built = parseQuery(json).toQuery(createShardContext());

    // The outer query carries the boost ...
    assertThat(built, instanceOf(BoostQuery.class));
    final BoostQuery boosted = (BoostQuery) built;
    assertThat(boosted.getBoost(), equalTo(2.0f));

    // ... and wraps the actual fuzzy query.
    final Query inner = boosted.getQuery();
    assertThat(inner, instanceOf(FuzzyQuery.class));
    final FuzzyQuery fuzzy = (FuzzyQuery) inner;
    final Term expectedTerm = new Term(STRING_FIELD_NAME, "sh");
    assertThat(fuzzy.getTerm(), equalTo(expectedTerm));
    assertThat(fuzzy.getMaxEdits(), equalTo(1));
    assertThat(fuzzy.getPrefixLength(), equalTo(1));
}
/**
 * Checks that malformed custom AUTO fuzziness values are rejected with the expected exception
 * type and message: a missing bound, a negative bound, an empty high bound and an empty low bound.
 */
public void testToQueryWithStringFieldDefinedWrongFuzziness() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);

    final String missingBoundsMsg = "failed to find low and high distance values";

    // Only one bound supplied.
    final String queryMissingFuzzinessUpLimit = fuzzyQueryJsonWithFuzziness("AUTO:2");
    ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class,
        () -> parseQuery(queryMissingFuzzinessUpLimit).toQuery(createShardContext()));
    assertTrue(e.getMessage() + " didn't contain: " + missingBoundsMsg, e.getMessage().contains(missingBoundsMsg));

    // Negative low bound. The assertion must inspect e2 and its own expected message,
    // not the exception and message of the previous case.
    final String wronglyConfiguredMsg = "fuzziness wrongly configured";
    final String queryHavingNegativeFuzzinessLowLimit = fuzzyQueryJsonWithFuzziness("AUTO:-1,6");
    IllegalArgumentException e2 = expectThrows(IllegalArgumentException.class,
        () -> parseQuery(queryHavingNegativeFuzzinessLowLimit).toQuery(createShardContext()));
    assertTrue(e2.getMessage() + " didn't contain: " + wronglyConfiguredMsg, e2.getMessage().contains(wronglyConfiguredMsg));

    // Trailing comma with no high bound.
    final String queryMissingFuzzinessUpLimit2 = fuzzyQueryJsonWithFuzziness("AUTO:1,");
    e = expectThrows(ElasticsearchParseException.class,
        () -> parseQuery(queryMissingFuzzinessUpLimit2).toQuery(createShardContext()));
    assertTrue(e.getMessage() + " didn't contain: " + missingBoundsMsg, e.getMessage().contains(missingBoundsMsg));

    // Leading comma with no low bound.
    final String unparsableMsg = "failed to parse [AUTO:,5] as a \"auto:int,int\"";
    final String queryMissingFuzzinessLowLimit = fuzzyQueryJsonWithFuzziness("AUTO:,5");
    e = expectThrows(ElasticsearchParseException.class,
        () -> parseQuery(queryMissingFuzzinessLowLimit).toQuery(createShardContext()));
    assertTrue(e.getMessage() + " didn't contain: " + unparsableMsg, e.getMessage().contains(unparsableMsg));
}

/**
 * Builds the JSON of a fuzzy query on the string field for value "sh", using the given fuzziness string.
 */
private String fuzzyQueryJsonWithFuzziness(String fuzziness) {
    return "{\n" +
        " \"fuzzy\":{\n" +
        " \"" + STRING_FIELD_NAME + "\":{\n" +
        " \"value\":\"sh\",\n" +
        " \"fuzziness\": \"" + fuzziness + "\",\n" +
        " \"prefix_length\":1,\n" +
        " \"boost\":2.0\n" +
        " }\n" +
        " }\n" +
        "}";
}
public void testToQueryWithNumericField() throws IOException {
assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
String query = "{\n" +

View File

@ -577,7 +577,9 @@ the maximum allowed Levenshtein Edit Distance (or number of edits)
`AUTO`::
+
--
generates an edit distance based on the length of the term. For lengths:
generates an edit distance based on the length of the term.
Low and high distance arguments may optionally be provided as `AUTO:[low],[high]`. If not specified,
the default values are 3 and 6, equivalent to `AUTO:3,6`, which gives for lengths:
`0..2`:: must match exactly
`3..5`:: one edit allowed