Don't allow fuzziness specified as a % (similarity) and require edits [0,2]

Lucene deprecated this in 4.0 and we only make a best effort to support it.
Folks should use an edit distance rather than some length-based
similarity. Yet the formula is simple enough that users can
still do the conversion in the client if they really need to.

Closes #10638
This commit is contained in:
Simon Willnauer 2015-07-14 12:17:46 +02:00
parent 738a0bc8a2
commit 09bd19b947
8 changed files with 28 additions and 116 deletions

View File

@ -128,7 +128,7 @@ public class MapperQueryParser extends QueryParser {
setLowercaseExpandedTerms(settings.lowercaseExpandedTerms()); setLowercaseExpandedTerms(settings.lowercaseExpandedTerms());
setPhraseSlop(settings.phraseSlop()); setPhraseSlop(settings.phraseSlop());
setDefaultOperator(settings.defaultOperator()); setDefaultOperator(settings.defaultOperator());
setFuzzyMinSim(settings.fuzzyMinSim()); setFuzzyMinSim(settings.getFuzziness().asFloat());
setFuzzyPrefixLength(settings.fuzzyPrefixLength()); setFuzzyPrefixLength(settings.fuzzyPrefixLength());
setLocale(settings.locale()); setLocale(settings.locale());
this.analyzeWildcard = settings.analyzeWildcard(); this.analyzeWildcard = settings.analyzeWildcard();

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.util.automaton.Operations; import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.common.unit.Fuzziness;
import org.joda.time.DateTimeZone; import org.joda.time.DateTimeZone;
import java.util.Collection; import java.util.Collection;
@ -49,7 +50,7 @@ public class QueryParserSettings {
private boolean lowercaseExpandedTerms = true; private boolean lowercaseExpandedTerms = true;
private boolean enablePositionIncrements = true; private boolean enablePositionIncrements = true;
private int phraseSlop = 0; private int phraseSlop = 0;
private float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity; private Fuzziness fuzziness = Fuzziness.AUTO;
private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions; private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions;
private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES; private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
@ -158,14 +159,6 @@ public class QueryParserSettings {
this.phraseSlop = phraseSlop; this.phraseSlop = phraseSlop;
} }
public float fuzzyMinSim() {
return fuzzyMinSim;
}
public void fuzzyMinSim(float fuzzyMinSim) {
this.fuzzyMinSim = fuzzyMinSim;
}
public int fuzzyPrefixLength() { public int fuzzyPrefixLength() {
return fuzzyPrefixLength; return fuzzyPrefixLength;
} }
@ -340,7 +333,7 @@ public class QueryParserSettings {
if (enablePositionIncrements != that.enablePositionIncrements) return false; if (enablePositionIncrements != that.enablePositionIncrements) return false;
if (escape != that.escape) return false; if (escape != that.escape) return false;
if (analyzeWildcard != that.analyzeWildcard) return false; if (analyzeWildcard != that.analyzeWildcard) return false;
if (Float.compare(that.fuzzyMinSim, fuzzyMinSim) != 0) return false; if (fuzziness != null ? fuzziness.equals(that.fuzziness) == false : fuzziness != null) return false;
if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false; if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false;
if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false; if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false;
if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null) if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null)
@ -395,7 +388,7 @@ public class QueryParserSettings {
result = 31 * result + (lowercaseExpandedTerms ? 1 : 0); result = 31 * result + (lowercaseExpandedTerms ? 1 : 0);
result = 31 * result + (enablePositionIncrements ? 1 : 0); result = 31 * result + (enablePositionIncrements ? 1 : 0);
result = 31 * result + phraseSlop; result = 31 * result + phraseSlop;
result = 31 * result + (fuzzyMinSim != +0.0f ? Float.floatToIntBits(fuzzyMinSim) : 0); result = 31 * result + (fuzziness.hashCode());
result = 31 * result + fuzzyPrefixLength; result = 31 * result + fuzzyPrefixLength;
result = 31 * result + (escape ? 1 : 0); result = 31 * result + (escape ? 1 : 0);
result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0); result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0);
@ -413,4 +406,12 @@ public class QueryParserSettings {
result = 31 * result + (timeZone != null ? timeZone.hashCode() : 0); result = 31 * result + (timeZone != null ? timeZone.hashCode() : 0);
return result; return result;
} }
public void setFuzziness(Fuzziness fuzziness) {
this.fuzziness = fuzziness;
}
public Fuzziness getFuzziness() {
return fuzziness;
}
} }

View File

@ -43,29 +43,17 @@ public final class Fuzziness implements ToXContent {
public static final Fuzziness AUTO = new Fuzziness("AUTO"); public static final Fuzziness AUTO = new Fuzziness("AUTO");
public static final ParseField FIELD = new ParseField(X_FIELD_NAME.camelCase().getValue()); public static final ParseField FIELD = new ParseField(X_FIELD_NAME.camelCase().getValue());
private final Object fuzziness; private final String fuzziness;
private Fuzziness(int fuzziness) { private Fuzziness(int fuzziness) {
Preconditions.checkArgument(fuzziness >= 0 && fuzziness <= 2, "Valid edit distances are [0, 1, 2] but was [" + fuzziness + "]"); Preconditions.checkArgument(fuzziness >= 0 && fuzziness <= 2, "Valid edit distances are [0, 1, 2] but was [" + fuzziness + "]");
this.fuzziness = fuzziness; this.fuzziness = Integer.toString(fuzziness);
}
private Fuzziness(float fuzziness) {
Preconditions.checkArgument(fuzziness >= 0.0 && fuzziness < 1.0f, "Valid similarities must be in the interval [0..1] but was [" + fuzziness + "]");
this.fuzziness = fuzziness;
} }
private Fuzziness(String fuzziness) { private Fuzziness(String fuzziness) {
this.fuzziness = fuzziness; this.fuzziness = fuzziness;
} }
/**
* Creates a {@link Fuzziness} instance from a similarity. The value must be in the range <tt>[0..1)</tt>
*/
public static Fuzziness fromSimilarity(float similarity) {
return new Fuzziness(similarity);
}
/** /**
* Creates a {@link Fuzziness} instance from an edit distance. The value must be one of <tt>[0, 1, 2]</tt> * Creates a {@link Fuzziness} instance from an edit distance. The value must be one of <tt>[0, 1, 2]</tt>
*/ */
@ -133,7 +121,6 @@ public final class Fuzziness implements ToXContent {
} }
public int asDistance(String text) { public int asDistance(String text) {
if (fuzziness instanceof String) {
if (this == AUTO) { //AUTO if (this == AUTO) { //AUTO
final int len = termLen(text); final int len = termLen(text);
if (len <= 2) { if (len <= 2) {
@ -144,8 +131,7 @@ public final class Fuzziness implements ToXContent {
return 1; return 1;
} }
} }
} return Math.min(2, asInt());
return FuzzyQuery.floatToEdits(asFloat(), termLen(text));
} }
public TimeValue asTimeValue() { public TimeValue asTimeValue() {
@ -214,37 +200,6 @@ public final class Fuzziness implements ToXContent {
return Float.parseFloat(fuzziness.toString()); return Float.parseFloat(fuzziness.toString());
} }
public float asSimilarity() {
return asSimilarity(null);
}
public float asSimilarity(String text) {
if (this == AUTO) {
final int len = termLen(text);
if (len <= 2) {
return 0.0f;
} else if (len > 5) {
return 0.5f;
} else {
return 0.66f;
}
// return dist == 0 ? dist : Math.min(0.999f, Math.max(0.0f, 1.0f - ((float) dist/ (float) termLen(text))));
}
if (fuzziness instanceof Float) { // it's a similarity
return ((Float) fuzziness).floatValue();
} else if (fuzziness instanceof Integer) { // it's an edit!
int dist = Math.min(((Integer) fuzziness).intValue(),
LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
return Math.min(0.999f, Math.max(0.0f, 1.0f - ((float) dist / (float) termLen(text))));
} else {
final float similarity = Float.parseFloat(fuzziness.toString());
if (similarity >= 0.0f && similarity < 1.0f) {
return similarity;
}
}
throw new IllegalArgumentException("Can't get similarity from fuzziness [" + fuzziness + "]");
}
private int termLen(String text) { private int termLen(String text) {
return text == null ? 5 : text.codePointCount(0, text.length()); // 5 avg term length in english return text == null ? 5 : text.codePointCount(0, text.length()); // 5 avg term length in english
} }

View File

@ -179,7 +179,7 @@ public class QueryStringQueryParser implements QueryParser {
} else if ("phrase_slop".equals(currentFieldName) || "phraseSlop".equals(currentFieldName)) { } else if ("phrase_slop".equals(currentFieldName) || "phraseSlop".equals(currentFieldName)) {
qpSettings.phraseSlop(parser.intValue()); qpSettings.phraseSlop(parser.intValue());
} else if (parseContext.parseFieldMatcher().match(currentFieldName, FUZZINESS)) { } else if (parseContext.parseFieldMatcher().match(currentFieldName, FUZZINESS)) {
qpSettings.fuzzyMinSim(Fuzziness.parse(parser).asSimilarity()); qpSettings.setFuzziness(Fuzziness.parse(parser));
} else if ("boost".equals(currentFieldName)) { } else if ("boost".equals(currentFieldName)) {
qpSettings.boost(parser.floatValue()); qpSettings.boost(parser.floatValue());
} else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) { } else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) {

View File

@ -18,7 +18,6 @@
*/ */
package org.elasticsearch.common.unit; package org.elasticsearch.common.unit;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.common.xcontent.XContent; import org.elasticsearch.common.xcontent.XContent;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.common.xcontent.XContentType;
@ -143,16 +142,6 @@ public class FuzzinessTests extends ElasticsearchTestCase {
public void testAuto() { public void testAuto() {
final int codePoints = randomIntBetween(0, 10); final int codePoints = randomIntBetween(0, 10);
String string = randomRealisticUnicodeOfCodepointLength(codePoints); String string = randomRealisticUnicodeOfCodepointLength(codePoints);
if (codePoints <= 2) {
assertThat(Fuzziness.AUTO.asDistance(string), equalTo(0));
assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(0));
} else if (codePoints > 5) {
assertThat(Fuzziness.AUTO.asDistance(string), equalTo(2));
assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(2));
} else {
assertThat(Fuzziness.AUTO.asDistance(string), equalTo(1));
assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(1));
}
assertThat(Fuzziness.AUTO.asByte(), equalTo((byte) 1)); assertThat(Fuzziness.AUTO.asByte(), equalTo((byte) 1));
assertThat(Fuzziness.AUTO.asInt(), equalTo(1)); assertThat(Fuzziness.AUTO.asInt(), equalTo(1));
assertThat(Fuzziness.AUTO.asFloat(), equalTo(1f)); assertThat(Fuzziness.AUTO.asFloat(), equalTo(1f));
@ -173,28 +162,4 @@ public class FuzzinessTests extends ElasticsearchTestCase {
} }
} }
@Test
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/10638")
public void testSimilarityToDistance() {
assertThat(Fuzziness.fromSimilarity(0.5f).asDistance("ab"), equalTo(1));
assertThat(Fuzziness.fromSimilarity(0.66f).asDistance("abcefg"), equalTo(2));
assertThat(Fuzziness.fromSimilarity(0.8f).asDistance("ab"), equalTo(0));
assertThat(Fuzziness.fromSimilarity(0.8f).asDistance("abcefg"), equalTo(1));
assertThat((double) Fuzziness.ONE.asSimilarity("abcefg"), closeTo(0.8f, 0.05));
assertThat((double) Fuzziness.TWO.asSimilarity("abcefg"), closeTo(0.66f, 0.05));
assertThat((double) Fuzziness.ONE.asSimilarity("ab"), closeTo(0.5f, 0.05));
int iters = randomIntBetween(100, 1000);
for (int i = 0; i < iters; i++) {
Fuzziness fuzziness = Fuzziness.fromEdits(between(1, 2));
String string = rarely() ? randomRealisticUnicodeOfLengthBetween(2, 4) :
randomRealisticUnicodeOfLengthBetween(4, 10);
float similarity = fuzziness.asSimilarity(string);
if (similarity != 0.0f) {
Fuzziness similarityBased = Fuzziness.build(similarity);
assertThat((double) similarityBased.asSimilarity(string), closeTo(similarity, 0.05));
assertThat(similarityBased.asDistance(string), equalTo(Math.min(2, fuzziness.asDistance(string))));
}
}
}
} }

View File

@ -437,7 +437,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
@Test @Test
public void testFuzzyQueryWithFieldsBuilder() throws IOException { public void testFuzzyQueryWithFieldsBuilder() throws IOException {
IndexQueryParserService queryParser = queryParser(); IndexQueryParserService queryParser = queryParser();
Query parsedQuery = queryParser.parse(fuzzyQuery("name.first", "sh").fuzziness(Fuzziness.fromSimilarity(0.1f)).prefixLength(1).boost(2.0f).buildAsBytes()).query(); Query parsedQuery = queryParser.parse(fuzzyQuery("name.first", "sh").fuzziness(Fuzziness.ONE).prefixLength(1).boost(2.0f).buildAsBytes()).query();
assertThat(parsedQuery, instanceOf(FuzzyQuery.class)); assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery; FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh"))); assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
@ -454,7 +454,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
assertThat(parsedQuery, instanceOf(FuzzyQuery.class)); assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery; FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh"))); assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
assertThat(fuzzyQuery.getMaxEdits(), equalTo(FuzzyQuery.floatToEdits(0.1f, "sh".length()))); assertThat(fuzzyQuery.getMaxEdits(), equalTo(Fuzziness.AUTO.asDistance("sh")));
assertThat(fuzzyQuery.getPrefixLength(), equalTo(1)); assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
assertThat(fuzzyQuery.getBoost(), equalTo(2.0f)); assertThat(fuzzyQuery.getBoost(), equalTo(2.0f));
} }

View File

@ -2,7 +2,7 @@
"fuzzy":{ "fuzzy":{
"name.first":{ "name.first":{
"value":"sh", "value":"sh",
"fuzziness":0.1, "fuzziness": "AUTO",
"prefix_length":1, "prefix_length":1,
"boost":2.0 "boost":2.0
} }

View File

@ -331,15 +331,6 @@ generates an edit distance based on the length of the term. For lengths:
`>5`:: two edits allowed `>5`:: two edits allowed
`AUTO` should generally be the preferred value for `fuzziness`. `AUTO` should generally be the preferred value for `fuzziness`.
--
`0.0..1.0`::
converted into an edit distance using the formula: `length(term) * (1.0 -
fuzziness)`, eg a `fuzziness` of `0.6` with a term of length 10 would result
in an edit distance of `4`. Note: in all APIs the maximum allowed edit distance is `2`.
[float] [float]
=== Result Casing === Result Casing