Don't allow fuzziness specified as a and require edits [0,2]
Lucene deprecated this in 4.0 and we only try best effort to support it. Folks should only use edit distance rather than some length based similarity. Yet the formular is simple enough such that users can still do it in the client if they really need to. Closes #10638
This commit is contained in:
parent
738a0bc8a2
commit
09bd19b947
|
@ -128,7 +128,7 @@ public class MapperQueryParser extends QueryParser {
|
||||||
setLowercaseExpandedTerms(settings.lowercaseExpandedTerms());
|
setLowercaseExpandedTerms(settings.lowercaseExpandedTerms());
|
||||||
setPhraseSlop(settings.phraseSlop());
|
setPhraseSlop(settings.phraseSlop());
|
||||||
setDefaultOperator(settings.defaultOperator());
|
setDefaultOperator(settings.defaultOperator());
|
||||||
setFuzzyMinSim(settings.fuzzyMinSim());
|
setFuzzyMinSim(settings.getFuzziness().asFloat());
|
||||||
setFuzzyPrefixLength(settings.fuzzyPrefixLength());
|
setFuzzyPrefixLength(settings.fuzzyPrefixLength());
|
||||||
setLocale(settings.locale());
|
setLocale(settings.locale());
|
||||||
this.analyzeWildcard = settings.analyzeWildcard();
|
this.analyzeWildcard = settings.analyzeWildcard();
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.search.FuzzyQuery;
|
import org.apache.lucene.search.FuzzyQuery;
|
||||||
import org.apache.lucene.search.MultiTermQuery;
|
import org.apache.lucene.search.MultiTermQuery;
|
||||||
import org.apache.lucene.util.automaton.Operations;
|
import org.apache.lucene.util.automaton.Operations;
|
||||||
|
import org.elasticsearch.common.unit.Fuzziness;
|
||||||
import org.joda.time.DateTimeZone;
|
import org.joda.time.DateTimeZone;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
@ -49,7 +50,7 @@ public class QueryParserSettings {
|
||||||
private boolean lowercaseExpandedTerms = true;
|
private boolean lowercaseExpandedTerms = true;
|
||||||
private boolean enablePositionIncrements = true;
|
private boolean enablePositionIncrements = true;
|
||||||
private int phraseSlop = 0;
|
private int phraseSlop = 0;
|
||||||
private float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity;
|
private Fuzziness fuzziness = Fuzziness.AUTO;
|
||||||
private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
|
private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
|
||||||
private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions;
|
private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions;
|
||||||
private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
|
||||||
|
@ -158,14 +159,6 @@ public class QueryParserSettings {
|
||||||
this.phraseSlop = phraseSlop;
|
this.phraseSlop = phraseSlop;
|
||||||
}
|
}
|
||||||
|
|
||||||
public float fuzzyMinSim() {
|
|
||||||
return fuzzyMinSim;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void fuzzyMinSim(float fuzzyMinSim) {
|
|
||||||
this.fuzzyMinSim = fuzzyMinSim;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int fuzzyPrefixLength() {
|
public int fuzzyPrefixLength() {
|
||||||
return fuzzyPrefixLength;
|
return fuzzyPrefixLength;
|
||||||
}
|
}
|
||||||
|
@ -340,7 +333,7 @@ public class QueryParserSettings {
|
||||||
if (enablePositionIncrements != that.enablePositionIncrements) return false;
|
if (enablePositionIncrements != that.enablePositionIncrements) return false;
|
||||||
if (escape != that.escape) return false;
|
if (escape != that.escape) return false;
|
||||||
if (analyzeWildcard != that.analyzeWildcard) return false;
|
if (analyzeWildcard != that.analyzeWildcard) return false;
|
||||||
if (Float.compare(that.fuzzyMinSim, fuzzyMinSim) != 0) return false;
|
if (fuzziness != null ? fuzziness.equals(that.fuzziness) == false : fuzziness != null) return false;
|
||||||
if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false;
|
if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false;
|
||||||
if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false;
|
if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false;
|
||||||
if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null)
|
if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null)
|
||||||
|
@ -395,7 +388,7 @@ public class QueryParserSettings {
|
||||||
result = 31 * result + (lowercaseExpandedTerms ? 1 : 0);
|
result = 31 * result + (lowercaseExpandedTerms ? 1 : 0);
|
||||||
result = 31 * result + (enablePositionIncrements ? 1 : 0);
|
result = 31 * result + (enablePositionIncrements ? 1 : 0);
|
||||||
result = 31 * result + phraseSlop;
|
result = 31 * result + phraseSlop;
|
||||||
result = 31 * result + (fuzzyMinSim != +0.0f ? Float.floatToIntBits(fuzzyMinSim) : 0);
|
result = 31 * result + (fuzziness.hashCode());
|
||||||
result = 31 * result + fuzzyPrefixLength;
|
result = 31 * result + fuzzyPrefixLength;
|
||||||
result = 31 * result + (escape ? 1 : 0);
|
result = 31 * result + (escape ? 1 : 0);
|
||||||
result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0);
|
result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0);
|
||||||
|
@ -413,4 +406,12 @@ public class QueryParserSettings {
|
||||||
result = 31 * result + (timeZone != null ? timeZone.hashCode() : 0);
|
result = 31 * result + (timeZone != null ? timeZone.hashCode() : 0);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setFuzziness(Fuzziness fuzziness) {
|
||||||
|
this.fuzziness = fuzziness;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Fuzziness getFuzziness() {
|
||||||
|
return fuzziness;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,29 +43,17 @@ public final class Fuzziness implements ToXContent {
|
||||||
public static final Fuzziness AUTO = new Fuzziness("AUTO");
|
public static final Fuzziness AUTO = new Fuzziness("AUTO");
|
||||||
public static final ParseField FIELD = new ParseField(X_FIELD_NAME.camelCase().getValue());
|
public static final ParseField FIELD = new ParseField(X_FIELD_NAME.camelCase().getValue());
|
||||||
|
|
||||||
private final Object fuzziness;
|
private final String fuzziness;
|
||||||
|
|
||||||
private Fuzziness(int fuzziness) {
|
private Fuzziness(int fuzziness) {
|
||||||
Preconditions.checkArgument(fuzziness >= 0 && fuzziness <= 2, "Valid edit distances are [0, 1, 2] but was [" + fuzziness + "]");
|
Preconditions.checkArgument(fuzziness >= 0 && fuzziness <= 2, "Valid edit distances are [0, 1, 2] but was [" + fuzziness + "]");
|
||||||
this.fuzziness = fuzziness;
|
this.fuzziness = Integer.toString(fuzziness);
|
||||||
}
|
|
||||||
|
|
||||||
private Fuzziness(float fuzziness) {
|
|
||||||
Preconditions.checkArgument(fuzziness >= 0.0 && fuzziness < 1.0f, "Valid similarities must be in the interval [0..1] but was [" + fuzziness + "]");
|
|
||||||
this.fuzziness = fuzziness;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Fuzziness(String fuzziness) {
|
private Fuzziness(String fuzziness) {
|
||||||
this.fuzziness = fuzziness;
|
this.fuzziness = fuzziness;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates a {@link Fuzziness} instance from a similarity. The value must be in the range <tt>[0..1)</tt>
|
|
||||||
*/
|
|
||||||
public static Fuzziness fromSimilarity(float similarity) {
|
|
||||||
return new Fuzziness(similarity);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a {@link Fuzziness} instance from an edit distance. The value must be one of <tt>[0, 1, 2]</tt>
|
* Creates a {@link Fuzziness} instance from an edit distance. The value must be one of <tt>[0, 1, 2]</tt>
|
||||||
*/
|
*/
|
||||||
|
@ -133,7 +121,6 @@ public final class Fuzziness implements ToXContent {
|
||||||
}
|
}
|
||||||
|
|
||||||
public int asDistance(String text) {
|
public int asDistance(String text) {
|
||||||
if (fuzziness instanceof String) {
|
|
||||||
if (this == AUTO) { //AUTO
|
if (this == AUTO) { //AUTO
|
||||||
final int len = termLen(text);
|
final int len = termLen(text);
|
||||||
if (len <= 2) {
|
if (len <= 2) {
|
||||||
|
@ -144,8 +131,7 @@ public final class Fuzziness implements ToXContent {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return Math.min(2, asInt());
|
||||||
return FuzzyQuery.floatToEdits(asFloat(), termLen(text));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TimeValue asTimeValue() {
|
public TimeValue asTimeValue() {
|
||||||
|
@ -214,37 +200,6 @@ public final class Fuzziness implements ToXContent {
|
||||||
return Float.parseFloat(fuzziness.toString());
|
return Float.parseFloat(fuzziness.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
public float asSimilarity() {
|
|
||||||
return asSimilarity(null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public float asSimilarity(String text) {
|
|
||||||
if (this == AUTO) {
|
|
||||||
final int len = termLen(text);
|
|
||||||
if (len <= 2) {
|
|
||||||
return 0.0f;
|
|
||||||
} else if (len > 5) {
|
|
||||||
return 0.5f;
|
|
||||||
} else {
|
|
||||||
return 0.66f;
|
|
||||||
}
|
|
||||||
// return dist == 0 ? dist : Math.min(0.999f, Math.max(0.0f, 1.0f - ((float) dist/ (float) termLen(text))));
|
|
||||||
}
|
|
||||||
if (fuzziness instanceof Float) { // it's a similarity
|
|
||||||
return ((Float) fuzziness).floatValue();
|
|
||||||
} else if (fuzziness instanceof Integer) { // it's an edit!
|
|
||||||
int dist = Math.min(((Integer) fuzziness).intValue(),
|
|
||||||
LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
|
|
||||||
return Math.min(0.999f, Math.max(0.0f, 1.0f - ((float) dist / (float) termLen(text))));
|
|
||||||
} else {
|
|
||||||
final float similarity = Float.parseFloat(fuzziness.toString());
|
|
||||||
if (similarity >= 0.0f && similarity < 1.0f) {
|
|
||||||
return similarity;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
throw new IllegalArgumentException("Can't get similarity from fuzziness [" + fuzziness + "]");
|
|
||||||
}
|
|
||||||
|
|
||||||
private int termLen(String text) {
|
private int termLen(String text) {
|
||||||
return text == null ? 5 : text.codePointCount(0, text.length()); // 5 avg term length in english
|
return text == null ? 5 : text.codePointCount(0, text.length()); // 5 avg term length in english
|
||||||
}
|
}
|
||||||
|
|
|
@ -179,7 +179,7 @@ public class QueryStringQueryParser implements QueryParser {
|
||||||
} else if ("phrase_slop".equals(currentFieldName) || "phraseSlop".equals(currentFieldName)) {
|
} else if ("phrase_slop".equals(currentFieldName) || "phraseSlop".equals(currentFieldName)) {
|
||||||
qpSettings.phraseSlop(parser.intValue());
|
qpSettings.phraseSlop(parser.intValue());
|
||||||
} else if (parseContext.parseFieldMatcher().match(currentFieldName, FUZZINESS)) {
|
} else if (parseContext.parseFieldMatcher().match(currentFieldName, FUZZINESS)) {
|
||||||
qpSettings.fuzzyMinSim(Fuzziness.parse(parser).asSimilarity());
|
qpSettings.setFuzziness(Fuzziness.parse(parser));
|
||||||
} else if ("boost".equals(currentFieldName)) {
|
} else if ("boost".equals(currentFieldName)) {
|
||||||
qpSettings.boost(parser.floatValue());
|
qpSettings.boost(parser.floatValue());
|
||||||
} else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) {
|
} else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) {
|
||||||
|
|
|
@ -18,7 +18,6 @@
|
||||||
*/
|
*/
|
||||||
package org.elasticsearch.common.unit;
|
package org.elasticsearch.common.unit;
|
||||||
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
import org.elasticsearch.common.xcontent.XContent;
|
import org.elasticsearch.common.xcontent.XContent;
|
||||||
import org.elasticsearch.common.xcontent.XContentParser;
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
import org.elasticsearch.common.xcontent.XContentType;
|
import org.elasticsearch.common.xcontent.XContentType;
|
||||||
|
@ -143,16 +142,6 @@ public class FuzzinessTests extends ElasticsearchTestCase {
|
||||||
public void testAuto() {
|
public void testAuto() {
|
||||||
final int codePoints = randomIntBetween(0, 10);
|
final int codePoints = randomIntBetween(0, 10);
|
||||||
String string = randomRealisticUnicodeOfCodepointLength(codePoints);
|
String string = randomRealisticUnicodeOfCodepointLength(codePoints);
|
||||||
if (codePoints <= 2) {
|
|
||||||
assertThat(Fuzziness.AUTO.asDistance(string), equalTo(0));
|
|
||||||
assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(0));
|
|
||||||
} else if (codePoints > 5) {
|
|
||||||
assertThat(Fuzziness.AUTO.asDistance(string), equalTo(2));
|
|
||||||
assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(2));
|
|
||||||
} else {
|
|
||||||
assertThat(Fuzziness.AUTO.asDistance(string), equalTo(1));
|
|
||||||
assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(1));
|
|
||||||
}
|
|
||||||
assertThat(Fuzziness.AUTO.asByte(), equalTo((byte) 1));
|
assertThat(Fuzziness.AUTO.asByte(), equalTo((byte) 1));
|
||||||
assertThat(Fuzziness.AUTO.asInt(), equalTo(1));
|
assertThat(Fuzziness.AUTO.asInt(), equalTo(1));
|
||||||
assertThat(Fuzziness.AUTO.asFloat(), equalTo(1f));
|
assertThat(Fuzziness.AUTO.asFloat(), equalTo(1f));
|
||||||
|
@ -173,28 +162,4 @@ public class FuzzinessTests extends ElasticsearchTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/10638")
|
|
||||||
public void testSimilarityToDistance() {
|
|
||||||
assertThat(Fuzziness.fromSimilarity(0.5f).asDistance("ab"), equalTo(1));
|
|
||||||
assertThat(Fuzziness.fromSimilarity(0.66f).asDistance("abcefg"), equalTo(2));
|
|
||||||
assertThat(Fuzziness.fromSimilarity(0.8f).asDistance("ab"), equalTo(0));
|
|
||||||
assertThat(Fuzziness.fromSimilarity(0.8f).asDistance("abcefg"), equalTo(1));
|
|
||||||
assertThat((double) Fuzziness.ONE.asSimilarity("abcefg"), closeTo(0.8f, 0.05));
|
|
||||||
assertThat((double) Fuzziness.TWO.asSimilarity("abcefg"), closeTo(0.66f, 0.05));
|
|
||||||
assertThat((double) Fuzziness.ONE.asSimilarity("ab"), closeTo(0.5f, 0.05));
|
|
||||||
|
|
||||||
int iters = randomIntBetween(100, 1000);
|
|
||||||
for (int i = 0; i < iters; i++) {
|
|
||||||
Fuzziness fuzziness = Fuzziness.fromEdits(between(1, 2));
|
|
||||||
String string = rarely() ? randomRealisticUnicodeOfLengthBetween(2, 4) :
|
|
||||||
randomRealisticUnicodeOfLengthBetween(4, 10);
|
|
||||||
float similarity = fuzziness.asSimilarity(string);
|
|
||||||
if (similarity != 0.0f) {
|
|
||||||
Fuzziness similarityBased = Fuzziness.build(similarity);
|
|
||||||
assertThat((double) similarityBased.asSimilarity(string), closeTo(similarity, 0.05));
|
|
||||||
assertThat(similarityBased.asDistance(string), equalTo(Math.min(2, fuzziness.asDistance(string))));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -437,7 +437,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
|
||||||
@Test
|
@Test
|
||||||
public void testFuzzyQueryWithFieldsBuilder() throws IOException {
|
public void testFuzzyQueryWithFieldsBuilder() throws IOException {
|
||||||
IndexQueryParserService queryParser = queryParser();
|
IndexQueryParserService queryParser = queryParser();
|
||||||
Query parsedQuery = queryParser.parse(fuzzyQuery("name.first", "sh").fuzziness(Fuzziness.fromSimilarity(0.1f)).prefixLength(1).boost(2.0f).buildAsBytes()).query();
|
Query parsedQuery = queryParser.parse(fuzzyQuery("name.first", "sh").fuzziness(Fuzziness.ONE).prefixLength(1).boost(2.0f).buildAsBytes()).query();
|
||||||
assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
|
assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
|
||||||
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
|
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
|
||||||
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
|
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
|
||||||
|
@ -454,7 +454,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
|
||||||
assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
|
assertThat(parsedQuery, instanceOf(FuzzyQuery.class));
|
||||||
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
|
FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery;
|
||||||
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
|
assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh")));
|
||||||
assertThat(fuzzyQuery.getMaxEdits(), equalTo(FuzzyQuery.floatToEdits(0.1f, "sh".length())));
|
assertThat(fuzzyQuery.getMaxEdits(), equalTo(Fuzziness.AUTO.asDistance("sh")));
|
||||||
assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
|
assertThat(fuzzyQuery.getPrefixLength(), equalTo(1));
|
||||||
assertThat(fuzzyQuery.getBoost(), equalTo(2.0f));
|
assertThat(fuzzyQuery.getBoost(), equalTo(2.0f));
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
"fuzzy":{
|
"fuzzy":{
|
||||||
"name.first":{
|
"name.first":{
|
||||||
"value":"sh",
|
"value":"sh",
|
||||||
"fuzziness":0.1,
|
"fuzziness": "AUTO",
|
||||||
"prefix_length":1,
|
"prefix_length":1,
|
||||||
"boost":2.0
|
"boost":2.0
|
||||||
}
|
}
|
||||||
|
|
|
@ -331,15 +331,6 @@ generates an edit distance based on the length of the term. For lengths:
|
||||||
`>5`:: two edits allowed
|
`>5`:: two edits allowed
|
||||||
|
|
||||||
`AUTO` should generally be the preferred value for `fuzziness`.
|
`AUTO` should generally be the preferred value for `fuzziness`.
|
||||||
--
|
|
||||||
|
|
||||||
`0.0..1.0`::
|
|
||||||
|
|
||||||
converted into an edit distance using the formula: `length(term) * (1.0 -
|
|
||||||
fuzziness)`, eg a `fuzziness` of `0.6` with a term of length 10 would result
|
|
||||||
in an edit distance of `4`. Note: in all APIs the maximum allowed edit distance is `2`.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
[float]
|
[float]
|
||||||
=== Result Casing
|
=== Result Casing
|
||||||
|
|
Loading…
Reference in New Issue