diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java index 92c6348ae4a..3856c7e67f5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -28,7 +28,7 @@ import org.apache.lucene.util.ToStringUtils; import org.apache.lucene.util.automaton.LevenshteinAutomata; /** Implements the fuzzy search query. The similarity measurement - * is based on the Levenshtein (edit distance) algorithm. + * is based on the Damerau-Levenshtein (optimal string alignment) algorithm. * *

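Illustrative sketch, not part of the patch: with transpositions counted as a single primitive edit (the Damerau-Levenshtein / optimal string alignment measure this patch switches to), a term that differs from the query only by one adjacent character swap is within distance 1, whereas classic Levenshtein counts that swap as 2. The sketch below uses the new five-argument constructor introduced later in this file; the field name and terms are invented for illustration.

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;

public class TranspositionSketch {
  public static void main(String[] args) {
    Term term = new Term("body", "coast"); // hypothetical field and term
    // transpositions = true: "cosat" is one edit away from "coast"
    // (a single adjacent swap), so it matches at maxEdits = 1.
    FuzzyQuery osa = new FuzzyQuery(term, 1, 0, FuzzyQuery.defaultMaxExpansions, true);
    // transpositions = false: the same swap costs two substitutions under
    // classic Levenshtein, so "cosat" falls outside maxEdits = 1.
    FuzzyQuery classic = new FuzzyQuery(term, 1, 0, FuzzyQuery.defaultMaxExpansions, false);
    System.out.println(osa + " vs " + classic);
  }
}
```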
This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite} * as default. So terms will be collected and scored according to their @@ -37,94 +37,81 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata; */ public class FuzzyQuery extends MultiTermQuery { - public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; + public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; public final static int defaultPrefixLength = 0; public final static int defaultMaxExpansions = 50; + public final static boolean defaultTranspositions = true; - private float minimumSimilarity; - private int prefixLength; - private boolean termLongEnough = false; - - protected Term term; + private final int maxEdits; + private final int maxExpansions; + private final boolean transpositions; + private final int prefixLength; + private final Term term; /** - * Create a new FuzzyQuery that will match terms with a similarity - * of at least minimumSimilarity to term. + * Create a new FuzzyQuery that will match terms with an edit distance + * of at most maxEdits to term. * If a prefixLength > 0 is specified, a common prefix * of that length is also required. * * @param term the term to search for - * @param minimumSimilarity a value between 0 and 1 to set the required similarity - * between the query term and the matching terms. For example, for a - * minimumSimilarity of 0.5 a term of the same length - * as the query term is considered similar to the query term if the edit distance - * between both terms is less than length(term)*0.5 - *

- * Alternatively, if minimumSimilarity is >= 1f, it is interpreted - * as a pure Levenshtein edit distance. For example, a value of 2f - * will match all terms within an edit distance of 2 from the - * query term. Edit distances specified in this way may not be fractional. - * + * @param maxEdits must be >= 0 and <= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}. * @param prefixLength length of common (non-fuzzy) prefix * @param maxExpansions the maximum number of terms to match. If this number is * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, * then the maxClauseCount will be used instead. - * @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0 - * or if prefixLength < 0 + * @param transpositions true if transpositions should be treated as a primitive + * edit operation. If this is false, comparisons will implement the classic + * Levenshtein algorithm. */ - public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength, - int maxExpansions) { + public FuzzyQuery(Term term, int maxEdits, int prefixLength, int maxExpansions, boolean transpositions) { super(term.field()); - this.term = term; - if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity) - throw new IllegalArgumentException("fractional edit distances are not allowed"); - if (minimumSimilarity < 0.0f) - throw new IllegalArgumentException("minimumSimilarity < 0"); - if (prefixLength < 0) - throw new IllegalArgumentException("prefixLength < 0"); - if (maxExpansions < 0) - throw new IllegalArgumentException("maxExpansions < 0"); - - setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions)); - - String text = term.text(); - int len = text.codePointCount(0, text.length()); - if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) { - this.termLongEnough = true; + if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { + throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + } + if (prefixLength < 0) { + throw new IllegalArgumentException("prefixLength cannot be negative."); + } + if (maxExpansions < 0) { + throw new IllegalArgumentException("maxExpansions cannot be negative."); } - this.minimumSimilarity = minimumSimilarity; + this.term = term; + this.maxEdits = maxEdits; this.prefixLength = prefixLength; + this.transpositions = transpositions; + this.maxExpansions = maxExpansions; + setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions)); } /** - * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}. + * Calls {@link #FuzzyQuery(Term, int, int, int, boolean) + * FuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions, defaultTranspositions)}. */ - public FuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { - this(term, minimumSimilarity, prefixLength, defaultMaxExpansions); + public FuzzyQuery(Term term, int maxEdits, int prefixLength) { + this(term, maxEdits, prefixLength, defaultMaxExpansions, defaultTranspositions); } /** - * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}. + * Calls {@link #FuzzyQuery(Term, int, int) FuzzyQuery(term, maxEdits, defaultPrefixLength)}. 
*/ - public FuzzyQuery(Term term, float minimumSimilarity) { - this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions); + public FuzzyQuery(Term term, int maxEdits) { + this(term, maxEdits, defaultPrefixLength); } /** - * Calls {@link #FuzzyQuery(Term, float) FuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}. + * Calls {@link #FuzzyQuery(Term, int) FuzzyQuery(term, defaultMaxEdits)}. */ public FuzzyQuery(Term term) { - this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions); + this(term, defaultMaxEdits); } /** - * Returns the minimum similarity that is required for this query to match. - * @return float value between 0.0 and 1.0 + * @return the maximum number of edit distances allowed for this query to match. */ - public float getMinSimilarity() { - return minimumSimilarity; + public int getMaxEdits() { + return maxEdits; } /** @@ -138,13 +125,10 @@ public class FuzzyQuery extends MultiTermQuery { @Override protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { - if (!termLongEnough) { // can only match if it's exact + if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact return new SingleTermsEnum(terms.iterator(null), term.bytes()); } - // TODO: should we expose the transpositions option to this query? - // maybe move the old/slowish stuff (lev without transpositions, n > 2, etc) all to contrib, - // deprecate it, and just have a faster/simpler/better one in core? - return new FuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength, false); + return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions); } /** @@ -163,7 +147,7 @@ public class FuzzyQuery extends MultiTermQuery { } buffer.append(term.text()); buffer.append('~'); - buffer.append(Float.toString(minimumSimilarity)); + buffer.append(Integer.toString(maxEdits)); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } @@ -172,8 +156,10 @@ public class FuzzyQuery extends MultiTermQuery { public int hashCode() { final int prime = 31; int result = super.hashCode(); - result = prime * result + Float.floatToIntBits(minimumSimilarity); + result = prime * result + maxEdits; result = prime * result + prefixLength; + result = prime * result + maxExpansions; + result = prime * result + (transpositions ? 0 : 1); result = prime * result + ((term == null) ? 0 : term.hashCode()); return result; } @@ -187,11 +173,14 @@ public class FuzzyQuery extends MultiTermQuery { if (getClass() != obj.getClass()) return false; FuzzyQuery other = (FuzzyQuery) obj; - if (Float.floatToIntBits(minimumSimilarity) != Float - .floatToIntBits(other.minimumSimilarity)) + if (maxEdits != other.maxEdits) return false; if (prefixLength != other.prefixLength) return false; + if (maxExpansions != other.maxExpansions) + return false; + if (transpositions != other.transpositions) + return false; if (term == null) { if (other.term != null) return false; @@ -199,6 +188,31 @@ public class FuzzyQuery extends MultiTermQuery { return false; return true; } + + /** + * @deprecated pass integer edit distances instead. + */ + @Deprecated + public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; - + /** + * Helper function to convert from deprecated "minimumSimilarity" fractions + * to raw edit distances. + * + * @param minimumSimilarity scaled similarity + * @param termLen length (in unicode codepoints) of the term. 
+ * @return equivalent number of maxEdits + * @deprecated pass integer edit distances instead. + */ + @Deprecated + public static int floatToEdits(float minimumSimilarity, int termLen) { + if (minimumSimilarity > 1f) { + return (int) Math.min(minimumSimilarity, LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + } else if (minimumSimilarity == 0.0f) { + return 0; // 0 means exact, not infinite # of edits! + } else { + return Math.min((int) ((1D-minimumSimilarity) * termLen), + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE); + } + } } diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index c899dd977bb..72db835b9c8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -34,8 +34,6 @@ import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.automaton.Automaton; import org.apache.lucene.util.automaton.BasicAutomata; @@ -51,7 +49,7 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata; * {@link #getComparator}. Each term in the enumeration is * greater than all that precede it.

*/ -public final class FuzzyTermsEnum extends TermsEnum { +public class FuzzyTermsEnum extends TermsEnum { private TermsEnum actualEnum; private BoostAttribute actualBoostAtt; @@ -67,18 +65,18 @@ public final class FuzzyTermsEnum extends TermsEnum { // TODO: chicken-and-egg private final Comparator termComparator = BytesRef.getUTF8SortedAsUnicodeComparator(); - private final float minSimilarity; - private final float scale_factor; + protected final float minSimilarity; + protected final float scale_factor; - private final int termLength; + protected final int termLength; - private int maxEdits; - private final boolean raw; + protected int maxEdits; + protected final boolean raw; - private final Terms terms; + protected final Terms terms; private final Term term; - private final int termText[]; - private final int realPrefixLength; + protected final int termText[]; + protected final int realPrefixLength; private final boolean transpositions; @@ -95,7 +93,8 @@ public final class FuzzyTermsEnum extends TermsEnum { * thats contains information about competitive boosts during rewrite. It is also used * to cache DFAs between segment transitions. * @param term Pattern term. - * @param minSimilarity Minimum required similarity for terms from the reader. + * @param minSimilarity Minimum required similarity for terms from the reader. Pass an integer value + * representing edit distance. Passing a fraction is deprecated. * @param prefixLength Length of required common prefix. Default value is 0. * @throws IOException */ @@ -149,7 +148,7 @@ public final class FuzzyTermsEnum extends TermsEnum { * return an automata-based enum for matching up to editDistance from * lastTerm, if possible */ - private TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) + protected TermsEnum getAutomatonEnum(int editDistance, BytesRef lastTerm) throws IOException { final List runAutomata = initAutomata(editDistance); if (editDistance < runAutomata.size()) { @@ -187,7 +186,7 @@ public final class FuzzyTermsEnum extends TermsEnum { } /** swap in a new actual enum to proxy to */ - private void setEnum(TermsEnum actualEnum) { + protected void setEnum(TermsEnum actualEnum) { this.actualEnum = actualEnum; this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class); } @@ -209,14 +208,21 @@ public final class FuzzyTermsEnum extends TermsEnum { maxEdits--; if (oldMaxEdits != maxEdits || init) { // the maximum n has changed - TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm); - if (newEnum != null) { - setEnum(newEnum); - } else if (init) { - setEnum(new LinearFuzzyTermsEnum()); - } + maxEditDistanceChanged(lastTerm, maxEdits, init); } } + + protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init) + throws IOException { + TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm); + // instead of assert, we do a hard check in case someone uses our enum directly + // assert newEnum != null; + if (newEnum == null) { + assert maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; + throw new IllegalArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"); + } + setEnum(newEnum); + } // for some raw min similarity and input term length, the maximum # of edits private int initialMaxDistance(float minimumSimilarity, int termLen) { @@ -383,194 +389,6 @@ public final class FuzzyTermsEnum extends TermsEnum { } } - /** - * Implement fuzzy enumeration with linear brute force. 
- */ - private class LinearFuzzyTermsEnum extends FilteredTermsEnum { - /* Allows us save time required to create a new array - * every time similarity is called. - */ - private int[] d; - private int[] p; - - // this is the text, minus the prefix - private final int[] text; - - private final BoostAttribute boostAtt = - attributes().addAttribute(BoostAttribute.class); - - /** - * Constructor for enumeration of all terms from specified reader which share a prefix of - * length prefixLength with term and which have a fuzzy similarity > - * minSimilarity. - *

- * After calling the constructor the enumeration is already pointing to the first - * valid term if such a term exists. - * - * @throws IOException - */ - public LinearFuzzyTermsEnum() throws IOException { - super(terms.iterator(null)); - - this.text = new int[termLength - realPrefixLength]; - System.arraycopy(termText, realPrefixLength, text, 0, text.length); - final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength); - prefixBytesRef = new BytesRef(prefix); - this.d = new int[this.text.length + 1]; - this.p = new int[this.text.length + 1]; - - setInitialSeekTerm(prefixBytesRef); - } - - private final BytesRef prefixBytesRef; - // used for unicode conversion from BytesRef byte[] to int[] - private final IntsRef utf32 = new IntsRef(20); - - /** - * The termCompare method in FuzzyTermEnum uses Levenshtein distance to - * calculate the distance between the given term and the comparing term. - */ - @Override - protected final AcceptStatus accept(BytesRef term) { - if (StringHelper.startsWith(term, prefixBytesRef)) { - UnicodeUtil.UTF8toUTF32(term, utf32); - final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); - if (similarity > minSimilarity) { - boostAtt.setBoost((similarity - minSimilarity) * scale_factor); - return AcceptStatus.YES; - } else return AcceptStatus.NO; - } else { - return AcceptStatus.END; - } - } - - /****************************** - * Compute Levenshtein distance - ******************************/ - - /** - *

Similarity returns a number that is 1.0f or less (including negative numbers) - * based on how similar the Term is compared to a target term. It returns - * exactly 0.0f when - *

-     *    editDistance > maximumEditDistance
- * Otherwise it returns: - *
-     *    1 - (editDistance / length)
- * where length is the length of the shortest term (text or target) including a - * prefix that are identical and editDistance is the Levenshtein distance for - * the two words.

- * - *

Embedded within this algorithm is a fail-fast Levenshtein distance - * algorithm. The fail-fast algorithm differs from the standard Levenshtein - * distance algorithm in that it is aborted if it is discovered that the - * minimum distance between the words is greater than some threshold. - * - *

To calculate the maximum distance threshold we use the following formula: - *

-     *     (1 - minimumSimilarity) * length
- * where length is the shortest term including any prefix that is not part of the - * similarity comparison. This formula was derived by solving for what maximum value - * of distance returns false for the following statements: - *
-     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
-     *   return (similarity > minimumSimilarity);
- * where distance is the Levenshtein distance for the two words. - *

- *

Levenshtein distance (also known as edit distance) is a measure of similarity - * between two strings where the distance is measured as the number of character - * deletions, insertions or substitutions required to transform one string to - * the other string. - * @param target the target word or phrase - * @return the similarity, 0.0 or less indicates that it matches less than the required - * threshold and 1.0 indicates that the text and target are identical - */ - private final float similarity(final int[] target, int offset, int length) { - final int m = length; - final int n = text.length; - if (n == 0) { - //we don't have anything to compare. That means if we just add - //the letters for m we get the new word - return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength); - } - if (m == 0) { - return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength); - } - - final int maxDistance = calculateMaxDistance(m); - - if (maxDistance < Math.abs(m-n)) { - //just adding the characters of m to n or vice-versa results in - //too many edits - //for example "pre" length is 3 and "prefixes" length is 8. We can see that - //given this optimal circumstance, the edit distance cannot be less than 5. - //which is 8-3 or more precisely Math.abs(3-8). - //if our maximum edit distance is 4, then we can discard this word - //without looking at it. - return Float.NEGATIVE_INFINITY; - } - - // init matrix d - for (int i = 0; i <=n; ++i) { - p[i] = i; - } - - // start computing edit distance - for (int j = 1; j<=m; ++j) { // iterates through target - int bestPossibleEditDistance = m; - final int t_j = target[offset+j-1]; // jth character of t - d[0] = j; - - for (int i=1; i<=n; ++i) { // iterates through text - // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1) - if (t_j != text[i-1]) { - d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1; - } else { - d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]); - } - bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]); - } - - //After calculating row i, the best possible edit distance - //can be found by found by finding the smallest value in a given column. - //If the bestPossibleEditDistance is greater than the max distance, abort. - - if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater - //the closest the target can be to the text is just too far away. - //this target is leaving the party early. - return Float.NEGATIVE_INFINITY; - } - - // copy current distance counts to 'previous row' distance counts: swap p and d - int _d[] = p; - p = d; - d = _d; - } - - // our last action in the above loop was to switch d and p, so p now - // actually has the most recent cost counts - - // this will return less than 0.0 when the edit distance is - // greater than the number of characters in the shorter word. - // but this was the formula that was previously used in FuzzyTermEnum, - // so it has not been changed (even though minimumSimilarity must be - // greater than 0.0) - return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m))); - } - - /** - * The max Distance is the maximum Levenshtein distance for the text - * compared to some other value that results in score that is - * better than the minimum similarity. - * @param m the length of the "other value" - * @return the maximum levenshtein distance that we care about - */ - private int calculateMaxDistance(int m) { - return raw ? 
maxEdits : Math.min(maxEdits, - (int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength))); - } - } - /** @lucene.internal */ public float getMinSimilarity() { return minSimilarity; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 701aa1b7c8f..764d8967796 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -52,32 +52,32 @@ public class TestFuzzyQuery extends LuceneTestCase { IndexSearcher searcher = newSearcher(reader); writer.close(); - FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0); + FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0); ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); // same with prefix - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 3); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(2, hits.length); - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 5); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 6); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); // test scoring - query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals("3 documents should match", 3, hits.length); List order = Arrays.asList("bbbbb","abbbb","aabbb"); @@ -89,7 +89,7 @@ public class TestFuzzyQuery extends LuceneTestCase { // test pq size by supplying maxExpansions=2 // This query would normally return 3 documents, because 3 terms match (see above): - query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2); + query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals("only 2 documents should match", 2, hits.length); order = Arrays.asList("bbbbb","abbbb"); @@ -100,15 +100,15 @@ public class TestFuzzyQuery extends LuceneTestCase { } // not similar enough: - query = new FuzzyQuery(new 
Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); - query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3 + query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3 hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // query identical to a word in the index: - query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); @@ -117,7 +117,7 @@ public class TestFuzzyQuery extends LuceneTestCase { assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); // query similar to a word in the index: - query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); @@ -125,158 +125,69 @@ public class TestFuzzyQuery extends LuceneTestCase { assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); // now with prefix - query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); - query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); - query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3); + query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 3); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(3, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); - query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4); + query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(2, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); - query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5); + query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5); hits = searcher.search(query, null, 1000).scoreDocs; 
assertEquals(0, hits.length); - query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); // now with prefix - query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1); + query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 1); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); - query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2); + query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 2); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); - query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3); + query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 3); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); - query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4); + query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 4); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(1, hits.length); assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); - query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5); + query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); // different field = no match: - query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0); + query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0); hits = searcher.search(query, null, 1000).scoreDocs; assertEquals(0, hits.length); reader.close(); directory.close(); } - - public void testFuzzinessLong() throws Exception { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory); - addDoc("aaaaaaa", writer); - addDoc("segment", writer); - - IndexReader reader = writer.getReader(); - IndexSearcher searcher = newSearcher(reader); - writer.close(); - - FuzzyQuery query; - // not similar enough: - query = new FuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - // edit distance to "aaaaaaa" = 3, this matches because the string is longer than - // in testDefaultFuzziness so a bigger difference is allowed: - query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - - // now with prefix - query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4); - hits = searcher.search(query, null, 1000).scoreDocs; - 
assertEquals(1, hits.length); - assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); - query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - // no match, more than half of the characters is wrong: - query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - // now with prefix - query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - // "student" and "stellent" are indeed similar to "segment" by default: - query = new FuzzyQuery(new Term("field", "student"), 0.5f, 0); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 0); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - - // now with prefix - query = new FuzzyQuery(new Term("field", "student"), 0.5f, 1); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 1); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - query = new FuzzyQuery(new Term("field", "student"), 0.5f, 2); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 2); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - // "student" doesn't match anymore thanks to increased minimum similarity: - query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - try { - query = new FuzzyQuery(new Term("field", "student"), 1.1f); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expecting exception - } - try { - query = new FuzzyQuery(new Term("field", "student"), -0.1f); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - // expecting exception - } - - reader.close(); - directory.close(); - } /** * MultiTermQuery provides (via attribute) information about which values @@ -307,7 +218,7 @@ public class TestFuzzyQuery extends LuceneTestCase { MultiReader mr = new MultiReader(ir1, ir2); IndexSearcher searcher = newSearcher(mr); - FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1f, 0, 2); + FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false); TopDocs docs = searcher.search(fq, 2); assertEquals(5, docs.totalHits); // 5 docs, from the a and b's mr.close(); @@ -319,41 +230,6 @@ public class TestFuzzyQuery extends LuceneTestCase { directory2.close(); } - public void testTokenLengthOpt() throws IOException { - Directory directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random(), directory); - addDoc("12345678911", writer); - addDoc("segment", writer); - - IndexReader reader = writer.getReader(); - IndexSearcher searcher = newSearcher(reader); - writer.close(); - - Query query; - // term not over 10 chars, so optimization shortcuts - query = new FuzzyQuery(new Term("field", "1234569"), 0.9f); - ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - // 10 chars, so no optimization - query = new FuzzyQuery(new Term("field", 
"1234567891"), 0.9f); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - // over 10 chars, so no optimization - query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(1, hits.length); - - // over 10 chars, no match - query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f); - hits = searcher.search(query, null, 1000).scoreDocs; - assertEquals(0, hits.length); - - reader.close(); - directory.close(); - } - /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */ public void testBoostOnlyRewrite() throws Exception { Directory directory = newDirectory(); @@ -404,7 +280,7 @@ public class TestFuzzyQuery extends LuceneTestCase { IndexReader r = w.getReader(); w.close(); - Query q = new FuzzyQuery(new Term("field", "giga"), 0.9f); + Query q = new FuzzyQuery(new Term("field", "giga"), 0); // 3. search IndexSearcher searcher = newSearcher(r); @@ -435,26 +311,17 @@ public class TestFuzzyQuery extends LuceneTestCase { assertEquals(1, hits.length); assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); - q = new FuzzyQuery(new Term("field", "t"), 3); - hits = searcher.search(q, 10).scoreDocs; - assertEquals(1, hits.length); - assertEquals("test", searcher.doc(hits[0].doc).get("field")); - - q = new FuzzyQuery(new Term("field", "a"), 4f, 0, 50); - hits = searcher.search(q, 10).scoreDocs; - assertEquals(1, hits.length); - assertEquals("test", searcher.doc(hits[0].doc).get("field")); - - q = new FuzzyQuery(new Term("field", "a"), 6f, 0, 50); - hits = searcher.search(q, 10).scoreDocs; - assertEquals(2, hits.length); - assertEquals("test", searcher.doc(hits[0].doc).get("field")); - assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); - + try { + q = new FuzzyQuery(new Term("field", "t"), 3); + fail(); + } catch (IllegalArgumentException expected) { + // expected + } + reader.close(); index.close(); } - + private void addDoc(String text, RandomIndexWriter writer) throws IOException { Document doc = new Document(); doc.add(newField("field", text, TextField.TYPE_STORED)); diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java index 3aa9c86420d..3ac1b01dcb0 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java @@ -90,7 +90,7 @@ public class TestSpanMultiTermQueryWrapper extends LuceneTestCase { public void testFuzzy2() throws Exception { // maximum of 1 term expansion - FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1); + FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1, 0, 1, false); SpanQuery sfq = new SpanMultiTermQueryWrapper(fq); // will only match jumps over lazy broun dog SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100); diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java index a41e1c9847c..e4d5978638c 100644 --- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java +++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java @@ -669,12 +669,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte @Override 
public void run() throws Exception { numHighlights = 0; - FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 0.5f); + FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 2); fuzzyQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE); doSearching(fuzzyQuery); doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true); assertTrue("Failed to find correct number of highlights " + numHighlights + " found", - numHighlights == 5); + numHighlights == 4); } }; diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java index 2bb7666ee40..d510fe0daca 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java @@ -774,7 +774,10 @@ public abstract class QueryParserBase { */ protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { // FuzzyQuery doesn't yet allow constant score rewrite - return new FuzzyQuery(term,minimumSimilarity,prefixLength); + String text = term.text(); + int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity, + text.codePointCount(0, text.length())); + return new FuzzyQuery(term,numEdits,prefixLength); } // TODO: Should this be protected instead? diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html index 1cb0e0677d5..de3a4206a80 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html @@ -191,12 +191,13 @@ enabling substantial customization to how a query is created.

Note: You cannot use a * or ? symbol as the first character of a search.

Fuzzy Searches

-Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance algorithm. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search:
+Lucene supports fuzzy searches based on Damerau-Levenshtein Distance. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search:

roam~

This search will find terms like foam and roams.

-Starting with Lucene 1.9 an additional (optional) parameter can specify the required similarity. The value is between 0 and 1, with a value closer to 1 only terms with a higher similarity will be matched. For example:
-
-    roam~0.8
-
-The default that is used if the parameter is not given is 0.5.
+An additional (optional) parameter can specify the maximum number of edits allowed. The value is between 0 and 2. For example:
+
+    roam~1
+
+The default that is used if the parameter is not given is 2 edits.
+
+Previously, a floating point value was allowed here. This syntax is deprecated and will be removed in Lucene 5.0.

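For reference (not part of the patch text), a minimal sketch of how deprecated float similarities map onto the new integer edit distances via the FuzzyQuery.floatToEdits helper added earlier in this patch; the similarity values and term lengths are arbitrary examples.

```java
import org.apache.lucene.search.FuzzyQuery;

public class FloatToEditsSketch {
  public static void main(String[] args) {
    // Legacy similarity 0.7 on the 4-codepoint term "roam":
    // (int)((1 - 0.7) * 4) = 1 edit.
    System.out.println(FuzzyQuery.floatToEdits(0.7f, 4)); // 1
    // Similarity 0.5 on a 7-codepoint term would allow 3 edits, but the result
    // is capped at LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE (2).
    System.out.println(FuzzyQuery.floatToEdits(0.5f, 7)); // 2
    // Values greater than 1 were already raw edit distances and pass through, capped at 2.
    System.out.println(FuzzyQuery.floatToEdits(2f, 7));   // 2
  }
}
```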
Proximity Searches

Lucene supports finding words that are within a specific distance of one another. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example to search for "apache" and "jakarta" within 10 words of each other in a document use the search:

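A hedged sketch (not from the patch) of the resulting classic query parser behaviour, tying together the QueryParserBase.newFuzzyQuery change and the syntax documented above; the Version constant and analyzer are placeholders, and any analyzer that keeps "foobar" as one token would do.

```java
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.Version;

public class FuzzySyntaxSketch {
  public static void main(String[] args) throws Exception {
    QueryParser qp = new QueryParser(Version.LUCENE_40, "field",
        new StandardAnalyzer(Version.LUCENE_40)); // placeholder version/analyzer

    // Values greater than 1 after '~' are taken as raw edit distances.
    FuzzyQuery byEdits = (FuzzyQuery) qp.parse("foobar~2");
    System.out.println(byEdits.getMaxEdits()); // 2

    // The deprecated float syntax still parses; QueryParserBase converts it with
    // FuzzyQuery.floatToEdits, so 0.7 on the 6-codepoint "foobar" becomes 1 edit.
    FuzzyQuery bySimilarity = (FuzzyQuery) qp.parse("foobar~0.7");
    System.out.println(bySimilarity.getMaxEdits()); // 1
  }
}
```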
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java index dfd3b3d9dce..4cc36d82e31 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java @@ -34,9 +34,13 @@ public class FuzzyQueryNodeBuilder implements StandardQueryBuilder { public FuzzyQuery build(QueryNode queryNode) throws QueryNodeException { FuzzyQueryNode fuzzyNode = (FuzzyQueryNode) queryNode; - + String text = fuzzyNode.getTextAsString(); + + int numEdits = FuzzyQuery.floatToEdits(fuzzyNode.getSimilarity(), + text.codePointCount(0, text.length())); + return new FuzzyQuery(new Term(fuzzyNode.getFieldAsString(), fuzzyNode - .getTextAsString()), fuzzyNode.getSimilarity(), fuzzyNode + .getTextAsString()), numEdits, fuzzyNode .getPrefixLength()); } diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java index 0e806211cd6..42eb1f2d9ae 100644 --- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java +++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java @@ -5,7 +5,7 @@ import org.apache.lucene.queryparser.xml.DOMUtils; import org.apache.lucene.queryparser.xml.ParserException; import org.apache.lucene.queryparser.xml.QueryBuilder; import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery; -import org.apache.lucene.search.FuzzyQuery; +import org.apache.lucene.sandbox.queries.SlowFuzzyQuery; import org.apache.lucene.search.Query; import org.w3c.dom.Element; import org.w3c.dom.NodeList; @@ -33,7 +33,7 @@ import org.w3c.dom.NodeList; public class FuzzyLikeThisQueryBuilder implements QueryBuilder { private static final int DEFAULT_MAX_NUM_TERMS = 50; - private static final float DEFAULT_MIN_SIMILARITY = FuzzyQuery.defaultMinSimilarity; + private static final float DEFAULT_MIN_SIMILARITY = SlowFuzzyQuery.defaultMinSimilarity; private static final int DEFAULT_PREFIX_LENGTH = 1; private static final boolean DEFAULT_IGNORE_TF = false; diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java index 46437909f1a..d8edb1887a8 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java @@ -59,8 +59,8 @@ public class TestAnalyzingQueryParser extends LuceneTestCase { fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9", "Mötley Crüe Mötley~0.75 Crüe~0.5", "Renée Zellweger Renée~0.9 Zellweger~" }; - fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9", - "motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~2.0" }; + fuzzyExpected = new String[] { "ubersetzung ubersetzung~1", + "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" }; a = new ASCIIAnalyzer(); } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java 
b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java index 85314d88b0b..ce4b5c9517e 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java @@ -85,10 +85,10 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString()); q = mfqp.parse("one~ two"); - assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString()); + assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString()); q = mfqp.parse("one~0.8 two^2"); - assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString()); + assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString()); q = mfqp.parse("one* two*"); assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString()); @@ -272,7 +272,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase { q = parser.parse("bla*"); assertEquals("f1:bla* f2:bla* f3:bla*", q.toString()); q = parser.parse("bla~"); - assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString()); + assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString()); q = parser.parse("[a TO c]"); assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString()); } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java index 232fb994559..e558ad2f0c4 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java @@ -282,10 +282,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase { public void testWildcard() throws Exception { assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*^2", null, "term*^2.0"); - assertQueryEquals("term~", null, "term~2.0"); - assertQueryEquals("term~0.7", null, "term~0.7"); - assertQueryEquals("term~^3", null, "term~2.0^3.0"); - assertQueryEquals("term^3~", null, "term~2.0^3.0"); + assertQueryEquals("term~", null, "term~2"); + assertQueryEquals("term~0.7", null, "term~1"); + assertQueryEquals("term~^3", null, "term~2^3.0"); + assertQueryEquals("term^3~", null, "term~2^3.0"); assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ^3", null, "term*germ^3.0"); @@ -294,10 +294,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase { assertTrue(getQuery("term~", null) instanceof FuzzyQuery); assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery); FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null); - assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); + assertEquals(1, fq.getMaxEdits()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); fq = (FuzzyQuery) getQuery("term~", null); - assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); + assertEquals(2, fq.getMaxEdits()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); try { getQuery("term~1.1", null); // value > 1, throws exception @@ -336,9 +336,9 @@ public class TestPrecedenceQueryParser extends LuceneTestCase { assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); // Fuzzy queries: - assertWildcardQueryEquals("Term~", "term~2.0"); - 
assertWildcardQueryEquals("Term~", true, "term~2.0"); - assertWildcardQueryEquals("Term~", false, "Term~2.0"); + assertWildcardQueryEquals("Term~", "term~2"); + assertWildcardQueryEquals("Term~", true, "term~2"); + assertWildcardQueryEquals("Term~", false, "Term~2"); // Range queries: assertWildcardQueryEquals("[A TO C]", "[a TO c]"); assertWildcardQueryEquals("[A TO C]", true, "[a TO c]"); @@ -498,10 +498,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase { assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); - assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); - assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); - assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); - assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); + assertQueryEquals("a:b\\-c~", a, "a:b-c~2"); + assertQueryEquals("a:b\\+c~", a, "a:b+c~2"); + assertQueryEquals("a:b\\:c~", a, "a:b:c~2"); + assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java index 42b2bca61a1..bb99a377e2a 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java @@ -100,10 +100,10 @@ public class TestMultiFieldQPHelper extends LuceneTestCase { assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString()); q = mfqp.parse("one~ two", null); - assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString()); + assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString()); q = mfqp.parse("one~0.8 two^2", null); - assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString()); + assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString()); q = mfqp.parse("one* two*", null); assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString()); @@ -311,7 +311,7 @@ public class TestMultiFieldQPHelper extends LuceneTestCase { q = parser.parse("bla*", null); assertEquals("f1:bla* f2:bla* f3:bla*", q.toString()); q = parser.parse("bla~", null); - assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString()); + assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString()); q = parser.parse("[a TO c]", null); assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString()); } diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java index 51dae2356a2..91d25f86c57 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java @@ -514,12 +514,12 @@ public class TestQPHelper extends LuceneTestCase { public void testWildcard() throws Exception { assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*^2", null, "term*^2.0"); - assertQueryEquals("term~", null, "term~2.0"); - assertQueryEquals("term~0.7", null, "term~0.7"); + assertQueryEquals("term~", null, "term~2"); + assertQueryEquals("term~0.7", null, "term~1"); - assertQueryEquals("term~^3", null, "term~2.0^3.0"); + assertQueryEquals("term~^3", null, "term~2^3.0"); - assertQueryEquals("term^3~", null, "term~2.0^3.0"); + 
assertQueryEquals("term^3~", null, "term~2^3.0"); assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ^3", null, "term*germ^3.0"); @@ -528,10 +528,10 @@ public class TestQPHelper extends LuceneTestCase { assertTrue(getQuery("term~", null) instanceof FuzzyQuery); assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery); FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null); - assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); + assertEquals(1, fq.getMaxEdits()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); fq = (FuzzyQuery) getQuery("term~", null); - assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); + assertEquals(2, fq.getMaxEdits()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertQueryNodeException("term~1.1"); // value > 1, throws exception @@ -567,9 +567,9 @@ public class TestQPHelper extends LuceneTestCase { assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); // Fuzzy queries: - assertWildcardQueryEquals("Term~", "term~2.0"); - assertWildcardQueryEquals("Term~", true, "term~2.0"); - assertWildcardQueryEquals("Term~", false, "Term~2.0"); + assertWildcardQueryEquals("Term~", "term~2"); + assertWildcardQueryEquals("Term~", true, "term~2"); + assertWildcardQueryEquals("Term~", false, "Term~2"); // Range queries: // TODO: implement this on QueryParser @@ -805,10 +805,10 @@ public class TestQPHelper extends LuceneTestCase { assertQueryEquals("a:b\\\\?c", a, "a:b\\?c"); - assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); - assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); - assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); - assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); + assertQueryEquals("a:b\\-c~", a, "a:b-c~2"); + assertQueryEquals("a:b\\+c~", a, "a:b+c~2"); + assertQueryEquals("a:b\\:c~", a, "a:b:c~2"); + assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2"); // TODO: implement Range queries on QueryParser assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java index 96af099c1d5..5ba3b3dd496 100644 --- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java +++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java @@ -420,10 +420,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase { public void testWildcard() throws Exception { assertQueryEquals("term*", null, "term*"); assertQueryEquals("term*^2", null, "term*^2.0"); - assertQueryEquals("term~", null, "term~2.0"); - assertQueryEquals("term~0.7", null, "term~0.7"); - assertQueryEquals("term~^3", null, "term~2.0^3.0"); - assertQueryEquals("term^3~", null, "term~2.0^3.0"); + assertQueryEquals("term~", null, "term~2"); + assertQueryEquals("term~0.7", null, "term~1"); + assertQueryEquals("term~^3", null, "term~2^3.0"); + assertQueryEquals("term^3~", null, "term~2^3.0"); assertQueryEquals("term*germ", null, "term*germ"); assertQueryEquals("term*germ^3", null, "term*germ^3.0"); @@ -432,10 +432,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertTrue(getQuery("term~", null) instanceof FuzzyQuery); assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery); FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null); - assertEquals(0.7f, fq.getMinSimilarity(), 0.1f); + assertEquals(1, fq.getMaxEdits()); 
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); fq = (FuzzyQuery)getQuery("term~", null); - assertEquals(2.0f, fq.getMinSimilarity(), 0.1f); + assertEquals(2, fq.getMaxEdits()); assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength()); assertParseException("term~1.1"); // value > 1, throws exception @@ -470,9 +470,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertWildcardQueryEquals("TE?M", false, "TE?M"); assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM"); // Fuzzy queries: - assertWildcardQueryEquals("Term~", "term~2.0"); - assertWildcardQueryEquals("Term~", true, "term~2.0"); - assertWildcardQueryEquals("Term~", false, "Term~2.0"); + assertWildcardQueryEquals("Term~", "term~2"); + assertWildcardQueryEquals("Term~", true, "term~2"); + assertWildcardQueryEquals("Term~", false, "Term~2"); // Range queries: assertWildcardQueryEquals("[A TO C]", "[a TO c]"); assertWildcardQueryEquals("[A TO C]", true, "[a TO c]"); @@ -693,10 +693,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase { assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c"); - assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0"); - assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0"); - assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0"); - assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0"); + assertQueryEquals("a:b\\-c~", a, "a:b-c~2"); + assertQueryEquals("a:b\\+c~", a, "a:b+c~2"); + assertQueryEquals("a:b\\:c~", a, "a:b:c~2"); + assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2"); assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]"); assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]"); @@ -1271,7 +1271,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase { public void testDistanceAsEditsParsing() throws Exception { QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(random())); FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2"); - assertEquals(2f, q.getMinSimilarity(), 0.0001f); + assertEquals(2, q.getMaxEdits()); } public void testPhraseQueryToString() throws ParseException { diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java index fb8a49da985..4b3ea6af2a2 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java @@ -211,7 +211,7 @@ public class FuzzyLikeThisQuery extends Query AttributeSource atts = new AttributeSource(); MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class); - FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength, false); + SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength); //store the df so all variants use same idf int df = reader.docFreq(startTerm); int numVariants=0; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyQuery.java new file mode 100644 index 00000000000..605b7742520 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyQuery.java @@ -0,0 +1,204 @@ +package org.apache.lucene.sandbox.queries; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.SingleTermsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanQuery; // javadocs +import org.apache.lucene.search.FuzzyQuery; // javadocs +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.automaton.LevenshteinAutomata; + +/** Implements the classic fuzzy search query. The similarity measurement + * is based on the Levenshtein (edit distance) algorithm. + *
+ * <p>
+ * Note that, unlike {@link FuzzyQuery}, this query will silently allow + * for a (possibly huge) number of edit distances in comparisons, and may + * be extremely slow (comparing every term in the index). + * + * @deprecated Use {@link FuzzyQuery} instead. + */ +@Deprecated +public class SlowFuzzyQuery extends MultiTermQuery { + + public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; + public final static int defaultPrefixLength = 0; + public final static int defaultMaxExpansions = 50; + + private float minimumSimilarity; + private int prefixLength; + private boolean termLongEnough = false; + + protected Term term; + + /** + * Create a new SlowFuzzyQuery that will match terms with a similarity + * of at least minimumSimilarity to term. + * If a prefixLength > 0 is specified, a common prefix + * of that length is also required. + * + * @param term the term to search for + * @param minimumSimilarity a value between 0 and 1 to set the required similarity + * between the query term and the matching terms. For example, for a + * minimumSimilarity of 0.5 a term of the same length + * as the query term is considered similar to the query term if the edit distance + * between both terms is less than length(term)*0.5 + *
+ * <p>
+ * Alternatively, if minimumSimilarity is >= 1f, it is interpreted + * as a pure Levenshtein edit distance. For example, a value of 2f + * will match all terms within an edit distance of 2 from the + * query term. Edit distances specified in this way may not be fractional. + * + * @param prefixLength length of common (non-fuzzy) prefix + * @param maxExpansions the maximum number of terms to match. If this number is + * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten, + * then the maxClauseCount will be used instead. + * @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0 + * or if prefixLength < 0 + */ + public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength, + int maxExpansions) { + super(term.field()); + this.term = term; + + if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity) + throw new IllegalArgumentException("fractional edit distances are not allowed"); + if (minimumSimilarity < 0.0f) + throw new IllegalArgumentException("minimumSimilarity < 0"); + if (prefixLength < 0) + throw new IllegalArgumentException("prefixLength < 0"); + if (maxExpansions < 0) + throw new IllegalArgumentException("maxExpansions < 0"); + + setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions)); + + String text = term.text(); + int len = text.codePointCount(0, text.length()); + if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) { + this.termLongEnough = true; + } + + this.minimumSimilarity = minimumSimilarity; + this.prefixLength = prefixLength; + } + + /** + * Calls {@link #SlowFuzzyQuery(Term, float) SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}. + */ + public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { + this(term, minimumSimilarity, prefixLength, defaultMaxExpansions); + } + + /** + * Calls {@link #SlowFuzzyQuery(Term, float) SlowFuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}. + */ + public SlowFuzzyQuery(Term term, float minimumSimilarity) { + this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions); + } + + /** + * Calls {@link #SlowFuzzyQuery(Term, float) SlowFuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}. + */ + public SlowFuzzyQuery(Term term) { + this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions); + } + + /** + * Returns the minimum similarity that is required for this query to match. + * @return float value between 0.0 and 1.0 + */ + public float getMinSimilarity() { + return minimumSimilarity; + } + + /** + * Returns the non-fuzzy prefix length. This is the number of characters at the start + * of a term that must be identical (not fuzzy) to the query term if the query + * is to match that term. + */ + public int getPrefixLength() { + return prefixLength; + } + + @Override + protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException { + if (!termLongEnough) { // can only match if it's exact + return new SingleTermsEnum(terms.iterator(null), term.bytes()); + } + return new SlowFuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength); + } + + /** + * Returns the pattern term. 
+ */ + public Term getTerm() { + return term; + } + + @Override + public String toString(String field) { + final StringBuilder buffer = new StringBuilder(); + if (!term.field().equals(field)) { + buffer.append(term.field()); + buffer.append(":"); + } + buffer.append(term.text()); + buffer.append('~'); + buffer.append(Float.toString(minimumSimilarity)); + buffer.append(ToStringUtils.boost(getBoost())); + return buffer.toString(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + Float.floatToIntBits(minimumSimilarity); + result = prime * result + prefixLength; + result = prime * result + ((term == null) ? 0 : term.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (!super.equals(obj)) + return false; + if (getClass() != obj.getClass()) + return false; + SlowFuzzyQuery other = (SlowFuzzyQuery) obj; + if (Float.floatToIntBits(minimumSimilarity) != Float + .floatToIntBits(other.minimumSimilarity)) + return false; + if (prefixLength != other.prefixLength) + return false; + if (term == null) { + if (other.term != null) + return false; + } else if (!term.equals(other.term)) + return false; + return true; + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java new file mode 100644 index 00000000000..f106c917426 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java @@ -0,0 +1,249 @@ +package org.apache.lucene.sandbox.queries; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.FilteredTermsEnum; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.FuzzyTermsEnum; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.UnicodeUtil; + +/** Classic fuzzy TermsEnum for enumerating all terms that are similar + * to the specified filter term. + * + *
+ * <p>Term enumerations are always ordered by
+ * {@link #getComparator}. Each term in the enumeration is
+ * greater than all that precede it.
+ * + * @deprecated Use {@link FuzzyTermsEnum} instead. + */ +@Deprecated +public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum { + + public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, + float minSimilarity, int prefixLength) throws IOException { + super(terms, atts, term, minSimilarity, prefixLength, false); + } + + @Override + protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init) + throws IOException { + TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm); + if (newEnum != null) { + setEnum(newEnum); + } else if (init) { + setEnum(new LinearFuzzyTermsEnum()); + } + } + + /** + * Implement fuzzy enumeration with linear brute force. + */ + private class LinearFuzzyTermsEnum extends FilteredTermsEnum { + /* Allows us save time required to create a new array + * every time similarity is called. + */ + private int[] d; + private int[] p; + + // this is the text, minus the prefix + private final int[] text; + + private final BoostAttribute boostAtt = + attributes().addAttribute(BoostAttribute.class); + + /** + * Constructor for enumeration of all terms from specified reader which share a prefix of + * length prefixLength with term and which have a fuzzy similarity > + * minSimilarity. + *
+ * <p>
+ * After calling the constructor the enumeration is already pointing to the first + * valid term if such a term exists. + * + * @throws IOException + */ + public LinearFuzzyTermsEnum() throws IOException { + super(terms.iterator(null)); + + this.text = new int[termLength - realPrefixLength]; + System.arraycopy(termText, realPrefixLength, text, 0, text.length); + final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength); + prefixBytesRef = new BytesRef(prefix); + this.d = new int[this.text.length + 1]; + this.p = new int[this.text.length + 1]; + + setInitialSeekTerm(prefixBytesRef); + } + + private final BytesRef prefixBytesRef; + // used for unicode conversion from BytesRef byte[] to int[] + private final IntsRef utf32 = new IntsRef(20); + + /** + * The termCompare method in FuzzyTermEnum uses Levenshtein distance to + * calculate the distance between the given term and the comparing term. + */ + @Override + protected final AcceptStatus accept(BytesRef term) { + if (StringHelper.startsWith(term, prefixBytesRef)) { + UnicodeUtil.UTF8toUTF32(term, utf32); + final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength); + if (similarity > minSimilarity) { + boostAtt.setBoost((similarity - minSimilarity) * scale_factor); + return AcceptStatus.YES; + } else return AcceptStatus.NO; + } else { + return AcceptStatus.END; + } + } + + /****************************** + * Compute Levenshtein distance + ******************************/ + + /** + *
+     * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
+     * based on how similar the Term is compared to a target term. It returns
+     * exactly 0.0f when
+     * <pre>
+     *    editDistance > maximumEditDistance</pre>
+     * Otherwise it returns:
+     * <pre>
+     *    1 - (editDistance / length)</pre>
+     * where length is the length of the shortest term (text or target) including a
+     * prefix that are identical and editDistance is the Levenshtein distance for
+     * the two words.
+     *
+     * <p>Embedded within this algorithm is a fail-fast Levenshtein distance
+     * algorithm. The fail-fast algorithm differs from the standard Levenshtein
+     * distance algorithm in that it is aborted if it is discovered that the
+     * minimum distance between the words is greater than some threshold.
+     *
+     * <p>To calculate the maximum distance threshold we use the following formula:
+     * <pre>
+     *     (1 - minimumSimilarity) * length</pre>
+     * where length is the shortest term including any prefix that is not part of the
+     * similarity comparison. This formula was derived by solving for what maximum value
+     * of distance returns false for the following statements:
+     * <pre>
+     *   similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+     *   return (similarity > minimumSimilarity);</pre>
+     * where distance is the Levenshtein distance for the two words.
+     *
+     * <p>
Levenshtein distance (also known as edit distance) is a measure of similarity + * between two strings where the distance is measured as the number of character + * deletions, insertions or substitutions required to transform one string to + * the other string. + * @param target the target word or phrase + * @return the similarity, 0.0 or less indicates that it matches less than the required + * threshold and 1.0 indicates that the text and target are identical + */ + private final float similarity(final int[] target, int offset, int length) { + final int m = length; + final int n = text.length; + if (n == 0) { + //we don't have anything to compare. That means if we just add + //the letters for m we get the new word + return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength); + } + if (m == 0) { + return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength); + } + + final int maxDistance = calculateMaxDistance(m); + + if (maxDistance < Math.abs(m-n)) { + //just adding the characters of m to n or vice-versa results in + //too many edits + //for example "pre" length is 3 and "prefixes" length is 8. We can see that + //given this optimal circumstance, the edit distance cannot be less than 5. + //which is 8-3 or more precisely Math.abs(3-8). + //if our maximum edit distance is 4, then we can discard this word + //without looking at it. + return Float.NEGATIVE_INFINITY; + } + + // init matrix d + for (int i = 0; i <=n; ++i) { + p[i] = i; + } + + // start computing edit distance + for (int j = 1; j<=m; ++j) { // iterates through target + int bestPossibleEditDistance = m; + final int t_j = target[offset+j-1]; // jth character of t + d[0] = j; + + for (int i=1; i<=n; ++i) { // iterates through text + // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1) + if (t_j != text[i-1]) { + d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1; + } else { + d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]); + } + bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]); + } + + //After calculating row i, the best possible edit distance + //can be found by found by finding the smallest value in a given column. + //If the bestPossibleEditDistance is greater than the max distance, abort. + + if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater + //the closest the target can be to the text is just too far away. + //this target is leaving the party early. + return Float.NEGATIVE_INFINITY; + } + + // copy current distance counts to 'previous row' distance counts: swap p and d + int _d[] = p; + p = d; + d = _d; + } + + // our last action in the above loop was to switch d and p, so p now + // actually has the most recent cost counts + + // this will return less than 0.0 when the edit distance is + // greater than the number of characters in the shorter word. + // but this was the formula that was previously used in FuzzyTermEnum, + // so it has not been changed (even though minimumSimilarity must be + // greater than 0.0) + return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m))); + } + + /** + * The max Distance is the maximum Levenshtein distance for the text + * compared to some other value that results in score that is + * better than the minimum similarity. + * @param m the length of the "other value" + * @return the maximum levenshtein distance that we care about + */ + private int calculateMaxDistance(int m) { + return raw ? 
maxEdits : Math.min(maxEdits, + (int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength))); + } + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java new file mode 100644 index 00000000000..8557e3d832a --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java @@ -0,0 +1,468 @@ +package org.apache.lucene.sandbox.queries; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.List; +import java.util.Arrays; +import java.io.IOException; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Tests {@link SlowFuzzyQuery}. 
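The calculateMaxDistance() formula just above is easiest to follow with concrete numbers. The sketch below is our own illustrative helper, not part of the patch; it assumes only the non-raw branch shown above and reproduces the behaviour asserted later in testFuzzinessLong.

// Illustrative helper (not part of the patch): the non-raw branch of calculateMaxDistance().
public final class MaxDistanceSketch {
  static int maxDistance(float minSimilarity, int textLen, int otherLen, int realPrefixLength) {
    // mirrors (int)((1 - minSimilarity) * (Math.min(text.length, m) + realPrefixLength))
    return (int) ((1 - minSimilarity) * (Math.min(textLen, otherLen) + realPrefixLength));
  }

  public static void main(String[] args) {
    // "aaaaccc" vs the indexed "aaaaaaa" at minSimilarity 0.5 with no prefix:
    // the threshold is (1 - 0.5) * 7 = 3, so an edit distance of 3 still matches,
    // while "aaacccc" (edit distance 4) is rejected -- as testFuzzinessLong asserts.
    System.out.println(maxDistance(0.5f, 7, 7, 0)); // 3
  }
}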
+ * + */ +public class TestSlowFuzzyQuery extends LuceneTestCase { + + public void testFuzziness() throws Exception { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("aaaaa", writer); + addDoc("aaaab", writer); + addDoc("aaabb", writer); + addDoc("aabbb", writer); + addDoc("abbbb", writer); + addDoc("bbbbb", writer); + addDoc("ddddd", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + + // same with prefix + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 3); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 4); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(2, hits.length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 6); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + + // test scoring + query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("3 documents should match", 3, hits.length); + List order = Arrays.asList("bbbbb","abbbb","aabbb"); + for (int i = 0; i < hits.length; i++) { + final String term = searcher.doc(hits[i].doc).get("field"); + //System.out.println(hits[i].score); + assertEquals(order.get(i), term); + } + + // test pq size by supplying maxExpansions=2 + // This query would normally return 3 documents, because 3 terms match (see above): + query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0, 2); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals("only 2 documents should match", 2, hits.length); + order = Arrays.asList("bbbbb","abbbb"); + for (int i = 0; i < hits.length; i++) { + final String term = searcher.doc(hits[i].doc).get("field"); + //System.out.println(hits[i].score); + assertEquals(order.get(i), term); + } + + // not similar enough: + query = new SlowFuzzyQuery(new Term("field", "xxxxx"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + query = new SlowFuzzyQuery(new Term("field", "aaccc"), SlowFuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3 + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // query identical to a word in the index: + query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query, null, 
1000).scoreDocs; + assertEquals(3, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + // default allows for up to two edits: + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); + + // query similar to a word in the index: + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 3); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 4); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(2, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa")); + assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab")); + query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 1); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 2); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 3); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 4); + hits = 
searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd")); + query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 5); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + + // different field = no match: + query = new SlowFuzzyQuery(new Term("anotherfield", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + reader.close(); + directory.close(); + } + + public void testFuzzinessLong() throws Exception { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("aaaaaaa", writer); + addDoc("segment", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + SlowFuzzyQuery query; + // not similar enough: + query = new SlowFuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + // edit distance to "aaaaaaa" = 3, this matches because the string is longer than + // in testDefaultFuzziness so a bigger difference is allowed: + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa")); + query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // no match, more than half of the characters is wrong: + query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // "student" and "stellent" are indeed similar to "segment" by default: + query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + + // now with prefix + query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 1); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 1); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 2); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 2); + hits = searcher.search(query, null, 
1000).scoreDocs; + assertEquals(0, hits.length); + + // "student" doesn't match anymore thanks to increased minimum similarity: + query = new SlowFuzzyQuery(new Term("field", "student"), 0.6f, 0); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + try { + query = new SlowFuzzyQuery(new Term("field", "student"), 1.1f); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expecting exception + } + try { + query = new SlowFuzzyQuery(new Term("field", "student"), -0.1f); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { + // expecting exception + } + + reader.close(); + directory.close(); + } + + /** + * MultiTermQuery provides (via attribute) information about which values + * must be competitive to enter the priority queue. + * + * SlowFuzzyQuery optimizes itself around this information, if the attribute + * is not implemented correctly, there will be problems! + */ + public void testTieBreaker() throws Exception { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("a123456", writer); + addDoc("c123456", writer); + addDoc("d123456", writer); + addDoc("e123456", writer); + + Directory directory2 = newDirectory(); + RandomIndexWriter writer2 = new RandomIndexWriter(random(), directory2); + addDoc("a123456", writer2); + addDoc("b123456", writer2); + addDoc("b123456", writer2); + addDoc("b123456", writer2); + addDoc("c123456", writer2); + addDoc("f123456", writer2); + + IndexReader ir1 = writer.getReader(); + IndexReader ir2 = writer2.getReader(); + + MultiReader mr = new MultiReader(ir1, ir2); + IndexSearcher searcher = newSearcher(mr); + SlowFuzzyQuery fq = new SlowFuzzyQuery(new Term("field", "z123456"), 1f, 0, 2); + TopDocs docs = searcher.search(fq, 2); + assertEquals(5, docs.totalHits); // 5 docs, from the a and b's + mr.close(); + ir1.close(); + ir2.close(); + writer.close(); + writer2.close(); + directory.close(); + directory2.close(); + } + + public void testTokenLengthOpt() throws IOException { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("12345678911", writer); + addDoc("segment", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + Query query; + // term not over 10 chars, so optimization shortcuts + query = new SlowFuzzyQuery(new Term("field", "1234569"), 0.9f); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // 10 chars, so no optimization + query = new SlowFuzzyQuery(new Term("field", "1234567891"), 0.9f); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + // over 10 chars, so no optimization + query = new SlowFuzzyQuery(new Term("field", "12345678911"), 0.9f); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(1, hits.length); + + // over 10 chars, no match + query = new SlowFuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f); + hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(0, hits.length); + + reader.close(); + directory.close(); + } + + /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. 
*/ + public void testBoostOnlyRewrite() throws Exception { + Directory directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), directory); + addDoc("Lucene", writer); + addDoc("Lucene", writer); + addDoc("Lucenne", writer); + + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + writer.close(); + + SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "lucene")); + query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50)); + ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs; + assertEquals(3, hits.length); + // normally, 'Lucenne' would be the first result as IDF will skew the score. + assertEquals("Lucene", reader.document(hits[0].doc).get("field")); + assertEquals("Lucene", reader.document(hits[1].doc).get("field")); + assertEquals("Lucenne", reader.document(hits[2].doc).get("field")); + reader.close(); + directory.close(); + } + + public void testGiga() throws Exception { + + MockAnalyzer analyzer = new MockAnalyzer(random()); + Directory index = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), index); + + addDoc("Lucene in Action", w); + addDoc("Lucene for Dummies", w); + + //addDoc("Giga", w); + addDoc("Giga byte", w); + + addDoc("ManagingGigabytesManagingGigabyte", w); + addDoc("ManagingGigabytesManagingGigabytes", w); + + addDoc("The Art of Computer Science", w); + addDoc("J. K. Rowling", w); + addDoc("JK Rowling", w); + addDoc("Joanne K Roling", w); + addDoc("Bruce Willis", w); + addDoc("Willis bruce", w); + addDoc("Brute willis", w); + addDoc("B. willis", w); + IndexReader r = w.getReader(); + w.close(); + + Query q = new SlowFuzzyQuery(new Term("field", "giga"), 0.9f); + + // 3. search + IndexSearcher searcher = newSearcher(r); + ScoreDoc[] hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field")); + r.close(); + index.close(); + } + + public void testDistanceAsEditsSearching() throws Exception { + Directory index = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), index); + addDoc("foobar", w); + addDoc("test", w); + addDoc("working", w); + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + w.close(); + + SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", "fouba"), 2); + ScoreDoc[] hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); + + q = new SlowFuzzyQuery(new Term("field", "foubara"), 2); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("foobar", searcher.doc(hits[0].doc).get("field")); + + q = new SlowFuzzyQuery(new Term("field", "t"), 3); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + + q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(1, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + + q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50); + hits = searcher.search(q, 10).scoreDocs; + assertEquals(2, hits.length); + assertEquals("test", searcher.doc(hits[0].doc).get("field")); + assertEquals("foobar", searcher.doc(hits[1].doc).get("field")); + + reader.close(); + index.close(); + } + + private void addDoc(String text, RandomIndexWriter writer) throws IOException { + 
Document doc = new Document(); + doc.add(newField("field", text, TextField.TYPE_STORED)); + writer.addDocument(doc); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery2.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery2.java similarity index 95% rename from lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery2.java rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery2.java index cda3d482bf5..ce1a9641cc5 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery2.java +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery2.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search; +package org.apache.lucene.sandbox.queries; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -55,7 +58,7 @@ import org.apache.lucene.util.LuceneTestCase; * * results line: comma-separated docID, score pair **/ -public class TestFuzzyQuery2 extends LuceneTestCase { +public class TestSlowFuzzyQuery2 extends LuceneTestCase { /** epsilon for score comparisons */ static final float epsilon = 0.00001f; @@ -115,7 +118,7 @@ public class TestFuzzyQuery2 extends LuceneTestCase { int prefix = Integer.parseInt(params[1]); int pqSize = Integer.parseInt(params[2]); float minScore = Float.parseFloat(params[3]); - FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix); + SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", query), minScore, prefix); q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize)); int expectedResults = Integer.parseInt(reader.readLine()); TopDocs docs = searcher.search(q, expectedResults); diff --git a/lucene/core/src/test/org/apache/lucene/search/fuzzyTestData.txt b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/fuzzyTestData.txt similarity index 100% rename from lucene/core/src/test/org/apache/lucene/search/fuzzyTestData.txt rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/fuzzyTestData.txt
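To close out, a minimal usage sketch of the deprecated sandbox class as the tests above exercise it. The class name and the field/term values are illustrative only; the constructor signatures come from SlowFuzzyQuery as added in this patch, and the comments restate the termLongEnough check and the testTokenLengthOpt and testDistanceAsEditsSearching expectations.

import org.apache.lucene.index.Term;
import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;

// Minimal usage sketch (illustrative values only).
public final class SlowFuzzyQueryUsage {
  public static void main(String[] args) {
    // Legacy fractional similarity: for a 7-char term at 0.5f, terms within
    // (1 - 0.5) * 7 = 3 edits can still match.
    SlowFuzzyQuery bySimilarity = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);

    // A similarity >= 1 is treated as a raw edit distance, including distances
    // larger than the automaton-backed FuzzyQuery supports.
    SlowFuzzyQuery byDistance = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);

    // At 0.9f a query term must be longer than 1 / (1 - 0.9) = 10 code points,
    // otherwise only exact matches are possible (see testTokenLengthOpt).
    SlowFuzzyQuery lengthGated = new SlowFuzzyQuery(new Term("field", "1234569"), 0.9f);

    System.out.println(bySimilarity + " " + byDistance + " " + lengthGated);
  }
}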