From 09bd19b947a12f57267dfbc05aa1a5c920cd4964 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Tue, 14 Jul 2015 12:17:46 +0200 Subject: [PATCH] Don't allow fuzziness specified as a and require edits [0,2] Lucene deprecated this in 4.0 and we only try best effort to support it. Folks should only use edit distance rather than some length based similarity. Yet the formular is simple enough such that users can still do it in the client if they really need to. Closes #10638 --- .../classic/MapperQueryParser.java | 2 +- .../classic/QueryParserSettings.java | 23 ++++--- .../elasticsearch/common/unit/Fuzziness.java | 67 +++---------------- .../index/query/QueryStringQueryParser.java | 2 +- .../common/unit/FuzzinessTests.java | 35 ---------- .../query/SimpleIndexQueryParserTests.java | 4 +- .../index/query/fuzzy-with-fields.json | 2 +- docs/reference/api-conventions.asciidoc | 9 --- 8 files changed, 28 insertions(+), 116 deletions(-) diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java index db4164f9f9e..8dc52f5dc3b 100644 --- a/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java +++ b/core/src/main/java/org/apache/lucene/queryparser/classic/MapperQueryParser.java @@ -128,7 +128,7 @@ public class MapperQueryParser extends QueryParser { setLowercaseExpandedTerms(settings.lowercaseExpandedTerms()); setPhraseSlop(settings.phraseSlop()); setDefaultOperator(settings.defaultOperator()); - setFuzzyMinSim(settings.fuzzyMinSim()); + setFuzzyMinSim(settings.getFuzziness().asFloat()); setFuzzyPrefixLength(settings.fuzzyPrefixLength()); setLocale(settings.locale()); this.analyzeWildcard = settings.analyzeWildcard(); diff --git a/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java b/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java index ca364e486e1..e079e00303e 100644 --- a/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java +++ b/core/src/main/java/org/apache/lucene/queryparser/classic/QueryParserSettings.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiTermQuery; import org.apache.lucene.util.automaton.Operations; +import org.elasticsearch.common.unit.Fuzziness; import org.joda.time.DateTimeZone; import java.util.Collection; @@ -49,7 +50,7 @@ public class QueryParserSettings { private boolean lowercaseExpandedTerms = true; private boolean enablePositionIncrements = true; private int phraseSlop = 0; - private float fuzzyMinSim = FuzzyQuery.defaultMinSimilarity; + private Fuzziness fuzziness = Fuzziness.AUTO; private int fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength; private int fuzzyMaxExpansions = FuzzyQuery.defaultMaxExpansions; private int maxDeterminizedStates = Operations.DEFAULT_MAX_DETERMINIZED_STATES; @@ -158,14 +159,6 @@ public class QueryParserSettings { this.phraseSlop = phraseSlop; } - public float fuzzyMinSim() { - return fuzzyMinSim; - } - - public void fuzzyMinSim(float fuzzyMinSim) { - this.fuzzyMinSim = fuzzyMinSim; - } - public int fuzzyPrefixLength() { return fuzzyPrefixLength; } @@ -340,7 +333,7 @@ public class QueryParserSettings { if (enablePositionIncrements != that.enablePositionIncrements) return false; if (escape != that.escape) return false; if (analyzeWildcard != that.analyzeWildcard) return false; - if (Float.compare(that.fuzzyMinSim, fuzzyMinSim) != 0) return false; + if (fuzziness != null ? fuzziness.equals(that.fuzziness) == false : fuzziness != null) return false; if (fuzzyPrefixLength != that.fuzzyPrefixLength) return false; if (fuzzyMaxExpansions != that.fuzzyMaxExpansions) return false; if (fuzzyRewriteMethod != null ? !fuzzyRewriteMethod.equals(that.fuzzyRewriteMethod) : that.fuzzyRewriteMethod != null) @@ -395,7 +388,7 @@ public class QueryParserSettings { result = 31 * result + (lowercaseExpandedTerms ? 1 : 0); result = 31 * result + (enablePositionIncrements ? 1 : 0); result = 31 * result + phraseSlop; - result = 31 * result + (fuzzyMinSim != +0.0f ? Float.floatToIntBits(fuzzyMinSim) : 0); + result = 31 * result + (fuzziness.hashCode()); result = 31 * result + fuzzyPrefixLength; result = 31 * result + (escape ? 1 : 0); result = 31 * result + (defaultAnalyzer != null ? defaultAnalyzer.hashCode() : 0); @@ -413,4 +406,12 @@ public class QueryParserSettings { result = 31 * result + (timeZone != null ? timeZone.hashCode() : 0); return result; } + + public void setFuzziness(Fuzziness fuzziness) { + this.fuzziness = fuzziness; + } + + public Fuzziness getFuzziness() { + return fuzziness; + } } diff --git a/core/src/main/java/org/elasticsearch/common/unit/Fuzziness.java b/core/src/main/java/org/elasticsearch/common/unit/Fuzziness.java index 30b959d25b6..cfcd209b435 100644 --- a/core/src/main/java/org/elasticsearch/common/unit/Fuzziness.java +++ b/core/src/main/java/org/elasticsearch/common/unit/Fuzziness.java @@ -43,29 +43,17 @@ public final class Fuzziness implements ToXContent { public static final Fuzziness AUTO = new Fuzziness("AUTO"); public static final ParseField FIELD = new ParseField(X_FIELD_NAME.camelCase().getValue()); - private final Object fuzziness; + private final String fuzziness; private Fuzziness(int fuzziness) { Preconditions.checkArgument(fuzziness >= 0 && fuzziness <= 2, "Valid edit distances are [0, 1, 2] but was [" + fuzziness + "]"); - this.fuzziness = fuzziness; - } - - private Fuzziness(float fuzziness) { - Preconditions.checkArgument(fuzziness >= 0.0 && fuzziness < 1.0f, "Valid similarities must be in the interval [0..1] but was [" + fuzziness + "]"); - this.fuzziness = fuzziness; + this.fuzziness = Integer.toString(fuzziness); } private Fuzziness(String fuzziness) { this.fuzziness = fuzziness; } - /** - * Creates a {@link Fuzziness} instance from a similarity. The value must be in the range [0..1) - */ - public static Fuzziness fromSimilarity(float similarity) { - return new Fuzziness(similarity); - } - /** * Creates a {@link Fuzziness} instance from an edit distance. The value must be one of [0, 1, 2] */ @@ -133,19 +121,17 @@ public final class Fuzziness implements ToXContent { } public int asDistance(String text) { - if (fuzziness instanceof String) { - if (this == AUTO) { //AUTO - final int len = termLen(text); - if (len <= 2) { - return 0; - } else if (len > 5) { - return 2; - } else { - return 1; - } + if (this == AUTO) { //AUTO + final int len = termLen(text); + if (len <= 2) { + return 0; + } else if (len > 5) { + return 2; + } else { + return 1; } } - return FuzzyQuery.floatToEdits(asFloat(), termLen(text)); + return Math.min(2, asInt()); } public TimeValue asTimeValue() { @@ -214,37 +200,6 @@ public final class Fuzziness implements ToXContent { return Float.parseFloat(fuzziness.toString()); } - public float asSimilarity() { - return asSimilarity(null); - } - - public float asSimilarity(String text) { - if (this == AUTO) { - final int len = termLen(text); - if (len <= 2) { - return 0.0f; - } else if (len > 5) { - return 0.5f; - } else { - return 0.66f; - } -// return dist == 0 ? dist : Math.min(0.999f, Math.max(0.0f, 1.0f - ((float) dist/ (float) termLen(text)))); - } - if (fuzziness instanceof Float) { // it's a similarity - return ((Float) fuzziness).floatValue(); - } else if (fuzziness instanceof Integer) { // it's an edit! - int dist = Math.min(((Integer) fuzziness).intValue(), - LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); - return Math.min(0.999f, Math.max(0.0f, 1.0f - ((float) dist / (float) termLen(text)))); - } else { - final float similarity = Float.parseFloat(fuzziness.toString()); - if (similarity >= 0.0f && similarity < 1.0f) { - return similarity; - } - } - throw new IllegalArgumentException("Can't get similarity from fuzziness [" + fuzziness + "]"); - } - private int termLen(String text) { return text == null ? 5 : text.codePointCount(0, text.length()); // 5 avg term length in english } diff --git a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryParser.java b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryParser.java index 752037b9b21..2a21d3d4595 100644 --- a/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryParser.java +++ b/core/src/main/java/org/elasticsearch/index/query/QueryStringQueryParser.java @@ -179,7 +179,7 @@ public class QueryStringQueryParser implements QueryParser { } else if ("phrase_slop".equals(currentFieldName) || "phraseSlop".equals(currentFieldName)) { qpSettings.phraseSlop(parser.intValue()); } else if (parseContext.parseFieldMatcher().match(currentFieldName, FUZZINESS)) { - qpSettings.fuzzyMinSim(Fuzziness.parse(parser).asSimilarity()); + qpSettings.setFuzziness(Fuzziness.parse(parser)); } else if ("boost".equals(currentFieldName)) { qpSettings.boost(parser.floatValue()); } else if ("tie_breaker".equals(currentFieldName) || "tieBreaker".equals(currentFieldName)) { diff --git a/core/src/test/java/org/elasticsearch/common/unit/FuzzinessTests.java b/core/src/test/java/org/elasticsearch/common/unit/FuzzinessTests.java index 74ed24a5ec4..dc3c66b4e83 100644 --- a/core/src/test/java/org/elasticsearch/common/unit/FuzzinessTests.java +++ b/core/src/test/java/org/elasticsearch/common/unit/FuzzinessTests.java @@ -18,7 +18,6 @@ */ package org.elasticsearch.common.unit; -import org.apache.lucene.util.LuceneTestCase; import org.elasticsearch.common.xcontent.XContent; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentType; @@ -143,16 +142,6 @@ public class FuzzinessTests extends ElasticsearchTestCase { public void testAuto() { final int codePoints = randomIntBetween(0, 10); String string = randomRealisticUnicodeOfCodepointLength(codePoints); - if (codePoints <= 2) { - assertThat(Fuzziness.AUTO.asDistance(string), equalTo(0)); - assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(0)); - } else if (codePoints > 5) { - assertThat(Fuzziness.AUTO.asDistance(string), equalTo(2)); - assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(2)); - } else { - assertThat(Fuzziness.AUTO.asDistance(string), equalTo(1)); - assertThat(Fuzziness.fromSimilarity(Fuzziness.AUTO.asSimilarity(string)).asDistance(string), equalTo(1)); - } assertThat(Fuzziness.AUTO.asByte(), equalTo((byte) 1)); assertThat(Fuzziness.AUTO.asInt(), equalTo(1)); assertThat(Fuzziness.AUTO.asFloat(), equalTo(1f)); @@ -173,28 +162,4 @@ public class FuzzinessTests extends ElasticsearchTestCase { } } - @Test - @LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/10638") - public void testSimilarityToDistance() { - assertThat(Fuzziness.fromSimilarity(0.5f).asDistance("ab"), equalTo(1)); - assertThat(Fuzziness.fromSimilarity(0.66f).asDistance("abcefg"), equalTo(2)); - assertThat(Fuzziness.fromSimilarity(0.8f).asDistance("ab"), equalTo(0)); - assertThat(Fuzziness.fromSimilarity(0.8f).asDistance("abcefg"), equalTo(1)); - assertThat((double) Fuzziness.ONE.asSimilarity("abcefg"), closeTo(0.8f, 0.05)); - assertThat((double) Fuzziness.TWO.asSimilarity("abcefg"), closeTo(0.66f, 0.05)); - assertThat((double) Fuzziness.ONE.asSimilarity("ab"), closeTo(0.5f, 0.05)); - - int iters = randomIntBetween(100, 1000); - for (int i = 0; i < iters; i++) { - Fuzziness fuzziness = Fuzziness.fromEdits(between(1, 2)); - String string = rarely() ? randomRealisticUnicodeOfLengthBetween(2, 4) : - randomRealisticUnicodeOfLengthBetween(4, 10); - float similarity = fuzziness.asSimilarity(string); - if (similarity != 0.0f) { - Fuzziness similarityBased = Fuzziness.build(similarity); - assertThat((double) similarityBased.asSimilarity(string), closeTo(similarity, 0.05)); - assertThat(similarityBased.asDistance(string), equalTo(Math.min(2, fuzziness.asDistance(string)))); - } - } - } } diff --git a/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java index e8c73dfd0a9..b89c330dde8 100644 --- a/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java +++ b/core/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java @@ -437,7 +437,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest { @Test public void testFuzzyQueryWithFieldsBuilder() throws IOException { IndexQueryParserService queryParser = queryParser(); - Query parsedQuery = queryParser.parse(fuzzyQuery("name.first", "sh").fuzziness(Fuzziness.fromSimilarity(0.1f)).prefixLength(1).boost(2.0f).buildAsBytes()).query(); + Query parsedQuery = queryParser.parse(fuzzyQuery("name.first", "sh").fuzziness(Fuzziness.ONE).prefixLength(1).boost(2.0f).buildAsBytes()).query(); assertThat(parsedQuery, instanceOf(FuzzyQuery.class)); FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery; assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh"))); @@ -454,7 +454,7 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest { assertThat(parsedQuery, instanceOf(FuzzyQuery.class)); FuzzyQuery fuzzyQuery = (FuzzyQuery) parsedQuery; assertThat(fuzzyQuery.getTerm(), equalTo(new Term("name.first", "sh"))); - assertThat(fuzzyQuery.getMaxEdits(), equalTo(FuzzyQuery.floatToEdits(0.1f, "sh".length()))); + assertThat(fuzzyQuery.getMaxEdits(), equalTo(Fuzziness.AUTO.asDistance("sh"))); assertThat(fuzzyQuery.getPrefixLength(), equalTo(1)); assertThat(fuzzyQuery.getBoost(), equalTo(2.0f)); } diff --git a/core/src/test/java/org/elasticsearch/index/query/fuzzy-with-fields.json b/core/src/test/java/org/elasticsearch/index/query/fuzzy-with-fields.json index 3e3d30ffdc0..7636496adc4 100644 --- a/core/src/test/java/org/elasticsearch/index/query/fuzzy-with-fields.json +++ b/core/src/test/java/org/elasticsearch/index/query/fuzzy-with-fields.json @@ -2,7 +2,7 @@ "fuzzy":{ "name.first":{ "value":"sh", - "fuzziness":0.1, + "fuzziness": "AUTO", "prefix_length":1, "boost":2.0 } diff --git a/docs/reference/api-conventions.asciidoc b/docs/reference/api-conventions.asciidoc index 7dfb3936e35..d54e93a9dc0 100644 --- a/docs/reference/api-conventions.asciidoc +++ b/docs/reference/api-conventions.asciidoc @@ -331,15 +331,6 @@ generates an edit distance based on the length of the term. For lengths: `>5`:: two edits allowed `AUTO` should generally be the preferred value for `fuzziness`. --- - -`0.0..1.0`:: - -converted into an edit distance using the formula: `length(term) * (1.0 - -fuzziness)`, eg a `fuzziness` of `0.6` with a term of length 10 would result -in an edit distance of `4`. Note: in all APIs the maximum allowed edit distance is `2`. - - [float] === Result Casing