runAutomata = initAutomata(editDistance);
if (editDistance < runAutomata.size()) {
@@ -187,7 +186,7 @@ public final class FuzzyTermsEnum extends TermsEnum {
}
/** swap in a new actual enum to proxy to */
- private void setEnum(TermsEnum actualEnum) {
+ protected void setEnum(TermsEnum actualEnum) {
this.actualEnum = actualEnum;
this.actualBoostAtt = actualEnum.attributes().addAttribute(BoostAttribute.class);
}
@@ -209,14 +208,21 @@ public final class FuzzyTermsEnum extends TermsEnum {
maxEdits--;
if (oldMaxEdits != maxEdits || init) { // the maximum n has changed
- TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
- if (newEnum != null) {
- setEnum(newEnum);
- } else if (init) {
- setEnum(new LinearFuzzyTermsEnum());
- }
+ maxEditDistanceChanged(lastTerm, maxEdits, init);
}
}
+
+ protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
+ throws IOException {
+ TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
+ // instead of assert, we do a hard check in case someone uses our enum directly
+ // assert newEnum != null;
+ if (newEnum == null) {
+ assert maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+ throw new IllegalArgumentException("maxEdits cannot be > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE");
+ }
+ setEnum(newEnum);
+ }
// for some raw min similarity and input term length, the maximum # of edits
private int initialMaxDistance(float minimumSimilarity, int termLen) {
@@ -383,194 +389,6 @@ public final class FuzzyTermsEnum extends TermsEnum {
}
}
- /**
- * Implement fuzzy enumeration with linear brute force.
- */
- private class LinearFuzzyTermsEnum extends FilteredTermsEnum {
- /* Allows us save time required to create a new array
- * every time similarity is called.
- */
- private int[] d;
- private int[] p;
-
- // this is the text, minus the prefix
- private final int[] text;
-
- private final BoostAttribute boostAtt =
- attributes().addAttribute(BoostAttribute.class);
-
- /**
- * Constructor for enumeration of all terms from specified reader which share a prefix of
- * length prefixLength with term and which have a fuzzy similarity > minSimilarity.
- *
- * After calling the constructor the enumeration is already pointing to the first
- * valid term if such a term exists.
- *
- * @throws IOException
- */
- public LinearFuzzyTermsEnum() throws IOException {
- super(terms.iterator(null));
-
- this.text = new int[termLength - realPrefixLength];
- System.arraycopy(termText, realPrefixLength, text, 0, text.length);
- final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
- prefixBytesRef = new BytesRef(prefix);
- this.d = new int[this.text.length + 1];
- this.p = new int[this.text.length + 1];
-
- setInitialSeekTerm(prefixBytesRef);
- }
-
- private final BytesRef prefixBytesRef;
- // used for unicode conversion from BytesRef byte[] to int[]
- private final IntsRef utf32 = new IntsRef(20);
-
- /**
- * The termCompare method in FuzzyTermEnum uses Levenshtein distance to
- * calculate the distance between the given term and the comparing term.
- */
- @Override
- protected final AcceptStatus accept(BytesRef term) {
- if (StringHelper.startsWith(term, prefixBytesRef)) {
- UnicodeUtil.UTF8toUTF32(term, utf32);
- final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
- if (similarity > minSimilarity) {
- boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
- return AcceptStatus.YES;
- } else return AcceptStatus.NO;
- } else {
- return AcceptStatus.END;
- }
- }
-
- /******************************
- * Compute Levenshtein distance
- ******************************/
-
- /**
- * Similarity returns a number that is 1.0f or less (including negative numbers)
- * based on how similar the Term is compared to a target term. It returns
- * exactly 0.0f when
- *
- * editDistance > maximumEditDistance
- * Otherwise it returns:
- *
- * 1 - (editDistance / length)
- * where length is the length of the shortest term (text or target) including a
- * prefix that are identical and editDistance is the Levenshtein distance for
- * the two words.
- *
- * Embedded within this algorithm is a fail-fast Levenshtein distance
- * algorithm. The fail-fast algorithm differs from the standard Levenshtein
- * distance algorithm in that it is aborted if it is discovered that the
- * minimum distance between the words is greater than some threshold.
- *
- * To calculate the maximum distance threshold we use the following formula:
- *
- * (1 - minimumSimilarity) * length
- * where length is the shortest term including any prefix that is not part of the
- * similarity comparison. This formula was derived by solving for what maximum value
- * of distance returns false for the following statements:
- *
- * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
- * return (similarity > minimumSimilarity);
- * where distance is the Levenshtein distance for the two words.
- *
- * Levenshtein distance (also known as edit distance) is a measure of similarity
- * between two strings where the distance is measured as the number of character
- * deletions, insertions or substitutions required to transform one string to
- * the other string.
- * @param target the target word or phrase
- * @return the similarity, 0.0 or less indicates that it matches less than the required
- * threshold and 1.0 indicates that the text and target are identical
- */
- private final float similarity(final int[] target, int offset, int length) {
- final int m = length;
- final int n = text.length;
- if (n == 0) {
- //we don't have anything to compare. That means if we just add
- //the letters for m we get the new word
- return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
- }
- if (m == 0) {
- return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
- }
-
- final int maxDistance = calculateMaxDistance(m);
-
- if (maxDistance < Math.abs(m-n)) {
- //just adding the characters of m to n or vice-versa results in
- //too many edits
- //for example "pre" length is 3 and "prefixes" length is 8. We can see that
- //given this optimal circumstance, the edit distance cannot be less than 5.
- //which is 8-3 or more precisely Math.abs(3-8).
- //if our maximum edit distance is 4, then we can discard this word
- //without looking at it.
- return Float.NEGATIVE_INFINITY;
- }
-
- // init matrix d
- for (int i = 0; i <=n; ++i) {
- p[i] = i;
- }
-
- // start computing edit distance
- for (int j = 1; j<=m; ++j) { // iterates through target
- int bestPossibleEditDistance = m;
- final int t_j = target[offset+j-1]; // jth character of t
- d[0] = j;
-
- for (int i=1; i<=n; ++i) { // iterates through text
- // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
- if (t_j != text[i-1]) {
- d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1;
- } else {
- d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]);
- }
- bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
- }
-
- //After calculating row i, the best possible edit distance
- //can be found by finding the smallest value in a given column.
- //If the bestPossibleEditDistance is greater than the max distance, abort.
-
- if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
- //the closest the target can be to the text is just too far away.
- //this target is leaving the party early.
- return Float.NEGATIVE_INFINITY;
- }
-
- // copy current distance counts to 'previous row' distance counts: swap p and d
- int _d[] = p;
- p = d;
- d = _d;
- }
-
- // our last action in the above loop was to switch d and p, so p now
- // actually has the most recent cost counts
-
- // this will return less than 0.0 when the edit distance is
- // greater than the number of characters in the shorter word.
- // but this was the formula that was previously used in FuzzyTermEnum,
- // so it has not been changed (even though minimumSimilarity must be
- // greater than 0.0)
- return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
- }
-
- /**
- * The max Distance is the maximum Levenshtein distance for the text
- * compared to some other value that results in score that is
- * better than the minimum similarity.
- * @param m the length of the "other value"
- * @return the maximum levenshtein distance that we care about
- */
- private int calculateMaxDistance(int m) {
- return raw ? maxEdits : Math.min(maxEdits,
- (int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)));
- }
- }
-
/** @lucene.internal */
public float getMinSimilarity() {
return minSimilarity;
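With setEnum and the new maxEditDistanceChanged hook protected, a subclass can supply its own fallback when no Levenshtein automaton is available for the requested edit distance, instead of the enum silently switching to a brute-force scan. A minimal sketch of such an override (hypothetical class name; it mirrors the SlowFuzzyTermsEnum added later in this patch):

import java.io.IOException;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

public class FallbackFuzzyTermsEnum extends FuzzyTermsEnum {
  public FallbackFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
      float minSimilarity, int prefixLength) throws IOException {
    super(terms, atts, term, minSimilarity, prefixLength, false);
  }

  @Override
  protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
      throws IOException {
    // getAutomatonEnum returns null when maxEdits exceeds what the compiled
    // automata support; a subclass may install its own enum in that case.
    TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
    if (newEnum != null) {
      setEnum(newEnum);
    }
  }
}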
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index 701aa1b7c8f..764d8967796 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -52,32 +52,32 @@ public class TestFuzzyQuery extends LuceneTestCase {
IndexSearcher searcher = newSearcher(reader);
writer.close();
- FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
+ FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
// same with prefix
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 1);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 2);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 3);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 4);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(2, hits.length);
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 5);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 6);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 6);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
// test scoring
- query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0);
+ query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("3 documents should match", 3, hits.length);
List order = Arrays.asList("bbbbb","abbbb","aabbb");
@@ -89,7 +89,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
// test pq size by supplying maxExpansions=2
// This query would normally return 3 documents, because 3 terms match (see above):
- query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMinSimilarity, 0, 2);
+ query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals("only 2 documents should match", 2, hits.length);
order = Arrays.asList("bbbbb","abbbb");
@@ -100,15 +100,15 @@ public class TestFuzzyQuery extends LuceneTestCase {
}
// not similar enough:
- query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMinSimilarity, 0);
+ query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length);
- query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
+ query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length);
// query identical to a word in the index:
- query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMinSimilarity, 0);
+ query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
@@ -117,7 +117,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
// query similar to a word in the index:
- query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 0);
+ query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
@@ -125,158 +125,69 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
// now with prefix
- query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 1);
+ query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
- query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 2);
+ query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
- query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 3);
+ query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(3, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
- query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 4);
+ query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(2, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
- query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMinSimilarity, 5);
+ query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length);
- query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
+ query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
// now with prefix
- query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 1);
+ query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 1);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
- query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 2);
+ query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 2);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
- query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 3);
+ query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 3);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
- query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 4);
+ query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 4);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(1, hits.length);
assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
- query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMinSimilarity, 5);
+ query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length);
// different field = no match:
- query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMinSimilarity, 0);
+ query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
hits = searcher.search(query, null, 1000).scoreDocs;
assertEquals(0, hits.length);
reader.close();
directory.close();
}
-
- public void testFuzzinessLong() throws Exception {
- Directory directory = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
- addDoc("aaaaaaa", writer);
- addDoc("segment", writer);
-
- IndexReader reader = writer.getReader();
- IndexSearcher searcher = newSearcher(reader);
- writer.close();
-
- FuzzyQuery query;
- // not similar enough:
- query = new FuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0);
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
- // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
- // in testDefaultFuzziness so a bigger difference is allowed:
- query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
-
- // now with prefix
- query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
- query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
- query = new FuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- // no match, more than half of the characters is wrong:
- query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- // now with prefix
- query = new FuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- // "student" and "stellent" are indeed similar to "segment" by default:
- query = new FuzzyQuery(new Term("field", "student"), 0.5f, 0);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 0);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
-
- // now with prefix
- query = new FuzzyQuery(new Term("field", "student"), 0.5f, 1);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 1);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
- query = new FuzzyQuery(new Term("field", "student"), 0.5f, 2);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
- query = new FuzzyQuery(new Term("field", "stellent"), 0.5f, 2);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- // "student" doesn't match anymore thanks to increased minimum similarity:
- query = new FuzzyQuery(new Term("field", "student"), 0.6f, 0);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- try {
- query = new FuzzyQuery(new Term("field", "student"), 1.1f);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expecting exception
- }
- try {
- query = new FuzzyQuery(new Term("field", "student"), -0.1f);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expecting exception
- }
-
- reader.close();
- directory.close();
- }
/**
* MultiTermQuery provides (via attribute) information about which values
@@ -307,7 +218,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
MultiReader mr = new MultiReader(ir1, ir2);
IndexSearcher searcher = newSearcher(mr);
- FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1f, 0, 2);
+ FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false);
TopDocs docs = searcher.search(fq, 2);
assertEquals(5, docs.totalHits); // 5 docs, from the a and b's
mr.close();
@@ -319,41 +230,6 @@ public class TestFuzzyQuery extends LuceneTestCase {
directory2.close();
}
- public void testTokenLengthOpt() throws IOException {
- Directory directory = newDirectory();
- RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
- addDoc("12345678911", writer);
- addDoc("segment", writer);
-
- IndexReader reader = writer.getReader();
- IndexSearcher searcher = newSearcher(reader);
- writer.close();
-
- Query query;
- // term not over 10 chars, so optimization shortcuts
- query = new FuzzyQuery(new Term("field", "1234569"), 0.9f);
- ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- // 10 chars, so no optimization
- query = new FuzzyQuery(new Term("field", "1234567891"), 0.9f);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- // over 10 chars, so no optimization
- query = new FuzzyQuery(new Term("field", "12345678911"), 0.9f);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(1, hits.length);
-
- // over 10 chars, no match
- query = new FuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
- hits = searcher.search(query, null, 1000).scoreDocs;
- assertEquals(0, hits.length);
-
- reader.close();
- directory.close();
- }
-
/** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
public void testBoostOnlyRewrite() throws Exception {
Directory directory = newDirectory();
@@ -404,7 +280,7 @@ public class TestFuzzyQuery extends LuceneTestCase {
IndexReader r = w.getReader();
w.close();
- Query q = new FuzzyQuery(new Term("field", "giga"), 0.9f);
+ Query q = new FuzzyQuery(new Term("field", "giga"), 0);
// 3. search
IndexSearcher searcher = newSearcher(r);
@@ -435,26 +311,17 @@ public class TestFuzzyQuery extends LuceneTestCase {
assertEquals(1, hits.length);
assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
- q = new FuzzyQuery(new Term("field", "t"), 3);
- hits = searcher.search(q, 10).scoreDocs;
- assertEquals(1, hits.length);
- assertEquals("test", searcher.doc(hits[0].doc).get("field"));
-
- q = new FuzzyQuery(new Term("field", "a"), 4f, 0, 50);
- hits = searcher.search(q, 10).scoreDocs;
- assertEquals(1, hits.length);
- assertEquals("test", searcher.doc(hits[0].doc).get("field"));
-
- q = new FuzzyQuery(new Term("field", "a"), 6f, 0, 50);
- hits = searcher.search(q, 10).scoreDocs;
- assertEquals(2, hits.length);
- assertEquals("test", searcher.doc(hits[0].doc).get("field"));
- assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
-
+ try {
+ q = new FuzzyQuery(new Term("field", "t"), 3);
+ fail();
+ } catch (IllegalArgumentException expected) {
+ // expected
+ }
+
reader.close();
index.close();
}
-
+
private void addDoc(String text, RandomIndexWriter writer) throws IOException {
Document doc = new Document();
doc.add(newField("field", text, TextField.TYPE_STORED));
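For readers tracking the constructor changes in this test: the float similarity argument is replaced by an integer edit distance, and the expanded form takes a trailing boolean alongside maxExpansions (it controls transposition handling in the automaton-based implementation; treat that description as an assumption here). A small sketch of the two shapes used above, with example field and term values:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;

class FuzzyQueryShapes {
  static void examples() {
    Term t = new Term("field", "aaaaa");
    // term, maxEdits, prefixLength
    FuzzyQuery basic = new FuzzyQuery(t, FuzzyQuery.defaultMaxEdits, 0);
    // term, maxEdits, prefixLength, maxExpansions, transpositions
    FuzzyQuery bounded = new FuzzyQuery(t, 1, 0, 2, false);
  }
}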
diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java
index 3aa9c86420d..3ac1b01dcb0 100644
--- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java
+++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpanMultiTermQueryWrapper.java
@@ -90,7 +90,7 @@ public class TestSpanMultiTermQueryWrapper extends LuceneTestCase {
public void testFuzzy2() throws Exception {
// maximum of 1 term expansion
- FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1f, 0, 1);
+ FuzzyQuery fq = new FuzzyQuery(new Term("field", "broan"), 1, 0, 1, false);
SpanQuery sfq = new SpanMultiTermQueryWrapper(fq);
// will only match jumps over lazy broun dog
SpanPositionRangeQuery sprq = new SpanPositionRangeQuery(sfq, 0, 100);
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
index a41e1c9847c..e4d5978638c 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
@@ -669,12 +669,12 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
@Override
public void run() throws Exception {
numHighlights = 0;
- FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 0.5f);
+ FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term(FIELD_NAME, "kinnedy"), 2);
fuzzyQuery.setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
doSearching(fuzzyQuery);
doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true);
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 5);
+ numHighlights == 4);
}
};
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
index 2bb7666ee40..d510fe0daca 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
@@ -774,7 +774,10 @@ public abstract class QueryParserBase {
*/
protected Query newFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
// FuzzyQuery doesn't yet allow constant score rewrite
- return new FuzzyQuery(term,minimumSimilarity,prefixLength);
+ String text = term.text();
+ int numEdits = FuzzyQuery.floatToEdits(minimumSimilarity,
+ text.codePointCount(0, text.length()));
+ return new FuzzyQuery(term,numEdits,prefixLength);
}
// TODO: Should this be protected instead?
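The parser still accepts the old float syntax and maps it onto edits via FuzzyQuery.floatToEdits; the mapping is consistent with the parser test expectations later in this patch (term~0.7 on a four-letter term becomes one edit, one~0.8 becomes zero). A hypothetical re-implementation, for illustration only:

final class FloatToEditsSketch {
  // Illustrative only; the authoritative logic is FuzzyQuery.floatToEdits.
  // The constant 2 stands in for LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE.
  static int floatToEdits(float minimumSimilarity, int termLen) {
    if (minimumSimilarity >= 1f) {
      return (int) Math.min(minimumSimilarity, 2);
    } else if (minimumSimilarity == 0f) {
      return 0;
    }
    return Math.min((int) ((1f - minimumSimilarity) * termLen), 2);
  }
  // floatToEdits(0.7f, 4) == 1 and floatToEdits(0.8f, 3) == 0, matching the
  // query parser tests updated in this patch.
}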
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html
index 1cb0e0677d5..de3a4206a80 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/package.html
@@ -191,12 +191,13 @@ enabling substantial customization to how a query is created.
Note: You cannot use a * or ? symbol as the first character of a search.
Fuzzy Searches
-Lucene supports fuzzy searches based on the Levenshtein Distance, or Edit Distance algorithm. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search:
+Lucene supports fuzzy searches based on Damerau-Levenshtein Distance. To do a fuzzy search use the tilde, "~", symbol at the end of a Single word Term. For example to search for a term similar in spelling to "roam" use the fuzzy search:
roam~
This search will find terms like foam and roams.
-Starting with Lucene 1.9 an additional (optional) parameter can specify the required similarity. The value is between 0 and 1, with a value closer to 1 only terms with a higher similarity will be matched. For example:
-roam~0.8
-The default that is used if the parameter is not given is 0.5.
+An additional (optional) parameter can specify the maximum number of edits allowed. The value is between 0 and 2. For example:
+roam~1
+The default that is used if the parameter is not given is 2 edit distances.
+Previously, a floating point value was allowed here. This syntax is considered deprecated and will be removed in Lucene 5.0
Proximity Searches
Lucene supports finding words are a within a specific distance away. To do a proximity search use the tilde, "~", symbol at the end of a Phrase. For example to search for a "apache" and "jakarta" within 10 words of each other in a document use the search:
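A quick sketch of the fuzzy syntax described above driven through the classic QueryParser (the Version constant and analyzer are placeholders for whatever the application already uses):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

class FuzzySyntaxDemo {
  static void demo() throws Exception {
    QueryParser parser =
        new QueryParser(Version.LUCENE_40, "field", new StandardAnalyzer(Version.LUCENE_40));
    Query oneEdit  = parser.parse("roam~1"); // at most one edit
    Query twoEdits = parser.parse("roam~");  // default: two edits
  }
}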
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java
index dfd3b3d9dce..4cc36d82e31 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/flexible/standard/builders/FuzzyQueryNodeBuilder.java
@@ -34,9 +34,13 @@ public class FuzzyQueryNodeBuilder implements StandardQueryBuilder {
public FuzzyQuery build(QueryNode queryNode) throws QueryNodeException {
FuzzyQueryNode fuzzyNode = (FuzzyQueryNode) queryNode;
-
+ String text = fuzzyNode.getTextAsString();
+
+ int numEdits = FuzzyQuery.floatToEdits(fuzzyNode.getSimilarity(),
+ text.codePointCount(0, text.length()));
+
return new FuzzyQuery(new Term(fuzzyNode.getFieldAsString(), fuzzyNode
- .getTextAsString()), fuzzyNode.getSimilarity(), fuzzyNode
+ .getTextAsString()), numEdits, fuzzyNode
.getPrefixLength());
}
diff --git a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java
index 0e806211cd6..42eb1f2d9ae 100644
--- a/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java
+++ b/lucene/queryparser/src/java/org/apache/lucene/queryparser/xml/builders/FuzzyLikeThisQueryBuilder.java
@@ -5,7 +5,7 @@ import org.apache.lucene.queryparser.xml.DOMUtils;
import org.apache.lucene.queryparser.xml.ParserException;
import org.apache.lucene.queryparser.xml.QueryBuilder;
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
-import org.apache.lucene.search.FuzzyQuery;
+import org.apache.lucene.sandbox.queries.SlowFuzzyQuery;
import org.apache.lucene.search.Query;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
@@ -33,7 +33,7 @@ import org.w3c.dom.NodeList;
public class FuzzyLikeThisQueryBuilder implements QueryBuilder {
private static final int DEFAULT_MAX_NUM_TERMS = 50;
- private static final float DEFAULT_MIN_SIMILARITY = FuzzyQuery.defaultMinSimilarity;
+ private static final float DEFAULT_MIN_SIMILARITY = SlowFuzzyQuery.defaultMinSimilarity;
private static final int DEFAULT_PREFIX_LENGTH = 1;
private static final boolean DEFAULT_IGNORE_TF = false;
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
index 46437909f1a..d8edb1887a8 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/analyzing/TestAnalyzingQueryParser.java
@@ -59,8 +59,8 @@ public class TestAnalyzingQueryParser extends LuceneTestCase {
fuzzyInput = new String[] { "Übersetzung Übersetzung~0.9",
"Mötley Crüe Mötley~0.75 Crüe~0.5",
"Renée Zellweger Renée~0.9 Zellweger~" };
- fuzzyExpected = new String[] { "ubersetzung ubersetzung~0.9",
- "motley crue motley~0.75 crue~0.5", "renee zellweger renee~0.9 zellweger~2.0" };
+ fuzzyExpected = new String[] { "ubersetzung ubersetzung~1",
+ "motley crue motley~1 crue~2", "renee zellweger renee~0 zellweger~2" };
a = new ASCIIAnalyzer();
}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java
index 85314d88b0b..ce4b5c9517e 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestMultiFieldQueryParser.java
@@ -85,10 +85,10 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
q = mfqp.parse("one~ two");
- assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString());
+ assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString());
q = mfqp.parse("one~0.8 two^2");
- assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString());
+ assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString());
q = mfqp.parse("one* two*");
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
@@ -272,7 +272,7 @@ public class TestMultiFieldQueryParser extends LuceneTestCase {
q = parser.parse("bla*");
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
q = parser.parse("bla~");
- assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString());
+ assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString());
q = parser.parse("[a TO c]");
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
index 232fb994559..e558ad2f0c4 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/precedence/TestPrecedenceQueryParser.java
@@ -282,10 +282,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0");
- assertQueryEquals("term~", null, "term~2.0");
- assertQueryEquals("term~0.7", null, "term~0.7");
- assertQueryEquals("term~^3", null, "term~2.0^3.0");
- assertQueryEquals("term^3~", null, "term~2.0^3.0");
+ assertQueryEquals("term~", null, "term~2");
+ assertQueryEquals("term~0.7", null, "term~1");
+ assertQueryEquals("term~^3", null, "term~2^3.0");
+ assertQueryEquals("term^3~", null, "term~2^3.0");
assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
@@ -294,10 +294,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
- assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(1, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery) getQuery("term~", null);
- assertEquals(2.0f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(2, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
try {
getQuery("term~1.1", null); // value > 1, throws exception
@@ -336,9 +336,9 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
assertWildcardQueryEquals("TE?M", false, "TE?M");
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
// Fuzzy queries:
- assertWildcardQueryEquals("Term~", "term~2.0");
- assertWildcardQueryEquals("Term~", true, "term~2.0");
- assertWildcardQueryEquals("Term~", false, "Term~2.0");
+ assertWildcardQueryEquals("Term~", "term~2");
+ assertWildcardQueryEquals("Term~", true, "term~2");
+ assertWildcardQueryEquals("Term~", false, "Term~2");
// Range queries:
assertWildcardQueryEquals("[A TO C]", "[a TO c]");
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
@@ -498,10 +498,10 @@ public class TestPrecedenceQueryParser extends LuceneTestCase {
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
- assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
- assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
- assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0");
- assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0");
+ assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
+ assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
+ assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
+ assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java
index 42b2bca61a1..bb99a377e2a 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestMultiFieldQPHelper.java
@@ -100,10 +100,10 @@ public class TestMultiFieldQPHelper extends LuceneTestCase {
assertEquals("((b:one t:one)^2.0) (b:two t:two)", q.toString());
q = mfqp.parse("one~ two", null);
- assertEquals("(b:one~2.0 t:one~2.0) (b:two t:two)", q.toString());
+ assertEquals("(b:one~2 t:one~2) (b:two t:two)", q.toString());
q = mfqp.parse("one~0.8 two^2", null);
- assertEquals("(b:one~0.8 t:one~0.8) ((b:two t:two)^2.0)", q.toString());
+ assertEquals("(b:one~0 t:one~0) ((b:two t:two)^2.0)", q.toString());
q = mfqp.parse("one* two*", null);
assertEquals("(b:one* t:one*) (b:two* t:two*)", q.toString());
@@ -311,7 +311,7 @@ public class TestMultiFieldQPHelper extends LuceneTestCase {
q = parser.parse("bla*", null);
assertEquals("f1:bla* f2:bla* f3:bla*", q.toString());
q = parser.parse("bla~", null);
- assertEquals("f1:bla~2.0 f2:bla~2.0 f3:bla~2.0", q.toString());
+ assertEquals("f1:bla~2 f2:bla~2 f3:bla~2", q.toString());
q = parser.parse("[a TO c]", null);
assertEquals("f1:[a TO c] f2:[a TO c] f3:[a TO c]", q.toString());
}
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
index 51dae2356a2..91d25f86c57 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/flexible/standard/TestQPHelper.java
@@ -514,12 +514,12 @@ public class TestQPHelper extends LuceneTestCase {
public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0");
- assertQueryEquals("term~", null, "term~2.0");
- assertQueryEquals("term~0.7", null, "term~0.7");
+ assertQueryEquals("term~", null, "term~2");
+ assertQueryEquals("term~0.7", null, "term~1");
- assertQueryEquals("term~^3", null, "term~2.0^3.0");
+ assertQueryEquals("term~^3", null, "term~2^3.0");
- assertQueryEquals("term^3~", null, "term~2.0^3.0");
+ assertQueryEquals("term^3~", null, "term~2^3.0");
assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
@@ -528,10 +528,10 @@ public class TestQPHelper extends LuceneTestCase {
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery) getQuery("term~0.7", null);
- assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(1, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery) getQuery("term~", null);
- assertEquals(2.0f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(2, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
assertQueryNodeException("term~1.1"); // value > 1, throws exception
@@ -567,9 +567,9 @@ public class TestQPHelper extends LuceneTestCase {
assertWildcardQueryEquals("TE?M", false, "TE?M");
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
// Fuzzy queries:
- assertWildcardQueryEquals("Term~", "term~2.0");
- assertWildcardQueryEquals("Term~", true, "term~2.0");
- assertWildcardQueryEquals("Term~", false, "Term~2.0");
+ assertWildcardQueryEquals("Term~", "term~2");
+ assertWildcardQueryEquals("Term~", true, "term~2");
+ assertWildcardQueryEquals("Term~", false, "Term~2");
// Range queries:
// TODO: implement this on QueryParser
@@ -805,10 +805,10 @@ public class TestQPHelper extends LuceneTestCase {
assertQueryEquals("a:b\\\\?c", a, "a:b\\?c");
- assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
- assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
- assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0");
- assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0");
+ assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
+ assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
+ assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
+ assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
// TODO: implement Range queries on QueryParser
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
diff --git a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
index 96af099c1d5..5ba3b3dd496 100644
--- a/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
+++ b/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
@@ -420,10 +420,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
public void testWildcard() throws Exception {
assertQueryEquals("term*", null, "term*");
assertQueryEquals("term*^2", null, "term*^2.0");
- assertQueryEquals("term~", null, "term~2.0");
- assertQueryEquals("term~0.7", null, "term~0.7");
- assertQueryEquals("term~^3", null, "term~2.0^3.0");
- assertQueryEquals("term^3~", null, "term~2.0^3.0");
+ assertQueryEquals("term~", null, "term~2");
+ assertQueryEquals("term~0.7", null, "term~1");
+ assertQueryEquals("term~^3", null, "term~2^3.0");
+ assertQueryEquals("term^3~", null, "term~2^3.0");
assertQueryEquals("term*germ", null, "term*germ");
assertQueryEquals("term*germ^3", null, "term*germ^3.0");
@@ -432,10 +432,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertTrue(getQuery("term~", null) instanceof FuzzyQuery);
assertTrue(getQuery("term~0.7", null) instanceof FuzzyQuery);
FuzzyQuery fq = (FuzzyQuery)getQuery("term~0.7", null);
- assertEquals(0.7f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(1, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
fq = (FuzzyQuery)getQuery("term~", null);
- assertEquals(2.0f, fq.getMinSimilarity(), 0.1f);
+ assertEquals(2, fq.getMaxEdits());
assertEquals(FuzzyQuery.defaultPrefixLength, fq.getPrefixLength());
assertParseException("term~1.1"); // value > 1, throws exception
@@ -470,9 +470,9 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertWildcardQueryEquals("TE?M", false, "TE?M");
assertWildcardQueryEquals("Te?m*gerM", false, "Te?m*gerM");
// Fuzzy queries:
- assertWildcardQueryEquals("Term~", "term~2.0");
- assertWildcardQueryEquals("Term~", true, "term~2.0");
- assertWildcardQueryEquals("Term~", false, "Term~2.0");
+ assertWildcardQueryEquals("Term~", "term~2");
+ assertWildcardQueryEquals("Term~", true, "term~2");
+ assertWildcardQueryEquals("Term~", false, "Term~2");
// Range queries:
assertWildcardQueryEquals("[A TO C]", "[a TO c]");
assertWildcardQueryEquals("[A TO C]", true, "[a TO c]");
@@ -693,10 +693,10 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
assertQueryEquals("a:b\\\\?c", a, "a:b\\\\?c");
- assertQueryEquals("a:b\\-c~", a, "a:b-c~2.0");
- assertQueryEquals("a:b\\+c~", a, "a:b+c~2.0");
- assertQueryEquals("a:b\\:c~", a, "a:b:c~2.0");
- assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2.0");
+ assertQueryEquals("a:b\\-c~", a, "a:b-c~2");
+ assertQueryEquals("a:b\\+c~", a, "a:b+c~2");
+ assertQueryEquals("a:b\\:c~", a, "a:b:c~2");
+ assertQueryEquals("a:b\\\\c~", a, "a:b\\c~2");
assertQueryEquals("[ a\\- TO a\\+ ]", null, "[a- TO a+]");
assertQueryEquals("[ a\\: TO a\\~ ]", null, "[a: TO a~]");
@@ -1271,7 +1271,7 @@ public abstract class QueryParserTestBase extends LuceneTestCase {
public void testDistanceAsEditsParsing() throws Exception {
QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockAnalyzer(random()));
FuzzyQuery q = (FuzzyQuery) qp.parse("foobar~2");
- assertEquals(2f, q.getMinSimilarity(), 0.0001f);
+ assertEquals(2, q.getMaxEdits());
}
public void testPhraseQueryToString() throws ParseException {
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java
index fb8a49da985..4b3ea6af2a2 100644
--- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java
@@ -211,7 +211,7 @@ public class FuzzyLikeThisQuery extends Query
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
- FuzzyTermsEnum fe = new FuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength, false);
+ SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(MultiFields.getTerms(reader, startTerm.field()), atts, startTerm, f.minSimilarity, f.prefixLength);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants=0;
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyQuery.java
new file mode 100644
index 00000000000..605b7742520
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyQuery.java
@@ -0,0 +1,204 @@
+package org.apache.lucene.sandbox.queries;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SingleTermsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanQuery; // javadocs
+import org.apache.lucene.search.FuzzyQuery; // javadocs
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.ToStringUtils;
+import org.apache.lucene.util.automaton.LevenshteinAutomata;
+
+/** Implements the classic fuzzy search query. The similarity measurement
+ * is based on the Levenshtein (edit distance) algorithm.
+ *
+ * Note that, unlike {@link FuzzyQuery}, this query will silently allow
+ * for a (possibly huge) number of edit distances in comparisons, and may
+ * be extremely slow (comparing every term in the index).
+ *
+ * @deprecated Use {@link FuzzyQuery} instead.
+ */
+@Deprecated
+public class SlowFuzzyQuery extends MultiTermQuery {
+
+ public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+ public final static int defaultPrefixLength = 0;
+ public final static int defaultMaxExpansions = 50;
+
+ private float minimumSimilarity;
+ private int prefixLength;
+ private boolean termLongEnough = false;
+
+ protected Term term;
+
+ /**
+ * Create a new SlowFuzzyQuery that will match terms with a similarity
+ * of at least minimumSimilarity to term.
+ * If a prefixLength > 0 is specified, a common prefix
+ * of that length is also required.
+ *
+ * @param term the term to search for
+ * @param minimumSimilarity a value between 0 and 1 to set the required similarity
+ * between the query term and the matching terms. For example, for a
+ * minimumSimilarity of 0.5 a term of the same length
+ * as the query term is considered similar to the query term if the edit distance
+ * between both terms is less than length(term)*0.5
+ *
+ * Alternatively, if minimumSimilarity is >= 1f, it is interpreted
+ * as a pure Levenshtein edit distance. For example, a value of 2f
+ * will match all terms within an edit distance of 2 from the
+ * query term. Edit distances specified in this way may not be fractional.
+ *
+ * @param prefixLength length of common (non-fuzzy) prefix
+ * @param maxExpansions the maximum number of terms to match. If this number is
+ * greater than {@link BooleanQuery#getMaxClauseCount} when the query is rewritten,
+ * then the maxClauseCount will be used instead.
+ * @throws IllegalArgumentException if minimumSimilarity is >= 1 or < 0
+ * or if prefixLength < 0
+ */
+ public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
+ int maxExpansions) {
+ super(term.field());
+ this.term = term;
+
+ if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
+ throw new IllegalArgumentException("fractional edit distances are not allowed");
+ if (minimumSimilarity < 0.0f)
+ throw new IllegalArgumentException("minimumSimilarity < 0");
+ if (prefixLength < 0)
+ throw new IllegalArgumentException("prefixLength < 0");
+ if (maxExpansions < 0)
+ throw new IllegalArgumentException("maxExpansions < 0");
+
+ setRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
+
+ String text = term.text();
+ int len = text.codePointCount(0, text.length());
+ if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity))) {
+ this.termLongEnough = true;
+ }
+
+ this.minimumSimilarity = minimumSimilarity;
+ this.prefixLength = prefixLength;
+ }
+
+ /**
+ * Calls {@link #SlowFuzzyQuery(Term, float) SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)}.
+ */
+ public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) {
+ this(term, minimumSimilarity, prefixLength, defaultMaxExpansions);
+ }
+
+ /**
+ * Calls {@link #SlowFuzzyQuery(Term, float) SlowFuzzyQuery(term, minimumSimilarity, 0, defaultMaxExpansions)}.
+ */
+ public SlowFuzzyQuery(Term term, float minimumSimilarity) {
+ this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions);
+ }
+
+ /**
+ * Calls {@link #SlowFuzzyQuery(Term, float) SlowFuzzyQuery(term, defaultMinSimilarity, 0, defaultMaxExpansions)}.
+ */
+ public SlowFuzzyQuery(Term term) {
+ this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions);
+ }
+
+ /**
+ * Returns the minimum similarity that is required for this query to match.
+ * @return float value between 0.0 and 1.0
+ */
+ public float getMinSimilarity() {
+ return minimumSimilarity;
+ }
+
+ /**
+ * Returns the non-fuzzy prefix length. This is the number of characters at the start
+ * of a term that must be identical (not fuzzy) to the query term if the query
+ * is to match that term.
+ */
+ public int getPrefixLength() {
+ return prefixLength;
+ }
+
+ @Override
+ protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
+ if (!termLongEnough) { // can only match if it's exact
+ return new SingleTermsEnum(terms.iterator(null), term.bytes());
+ }
+ return new SlowFuzzyTermsEnum(terms, atts, getTerm(), minimumSimilarity, prefixLength);
+ }
+
+ /**
+ * Returns the pattern term.
+ */
+ public Term getTerm() {
+ return term;
+ }
+
+ @Override
+ public String toString(String field) {
+ final StringBuilder buffer = new StringBuilder();
+ if (!term.field().equals(field)) {
+ buffer.append(term.field());
+ buffer.append(":");
+ }
+ buffer.append(term.text());
+ buffer.append('~');
+ buffer.append(Float.toString(minimumSimilarity));
+ buffer.append(ToStringUtils.boost(getBoost()));
+ return buffer.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = super.hashCode();
+ result = prime * result + Float.floatToIntBits(minimumSimilarity);
+ result = prime * result + prefixLength;
+ result = prime * result + ((term == null) ? 0 : term.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (!super.equals(obj))
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ SlowFuzzyQuery other = (SlowFuzzyQuery) obj;
+ if (Float.floatToIntBits(minimumSimilarity) != Float
+ .floatToIntBits(other.minimumSimilarity))
+ return false;
+ if (prefixLength != other.prefixLength)
+ return false;
+ if (term == null) {
+ if (other.term != null)
+ return false;
+ } else if (!term.equals(other.term))
+ return false;
+ return true;
+ }
+}
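A minimal usage sketch for the deprecated query, reusing the data from the removed testFuzzinessLong above: with a minimum similarity of 0.5, a seven-character term tolerates up to three edits, so "aaaaccc" still reaches "aaaaaaa".

import org.apache.lucene.index.Term;

class SlowFuzzyQueryUsage {
  static SlowFuzzyQuery example() {
    // similarity 0.5 on a 7-character term allows an edit distance of 3,
    // so a document containing "aaaaaaa" would still match
    return new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
  }
}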
diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java
new file mode 100644
index 00000000000..f106c917426
--- /dev/null
+++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/SlowFuzzyTermsEnum.java
@@ -0,0 +1,249 @@
+package org.apache.lucene.sandbox.queries;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.FilteredTermsEnum;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.FuzzyTermsEnum;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.StringHelper;
+import org.apache.lucene.util.UnicodeUtil;
+
+/** Classic fuzzy TermsEnum for enumerating all terms that are similar
+ * to the specified filter term.
+ *
+ * Term enumerations are always ordered by
+ * {@link #getComparator}. Each term in the enumeration is
+ * greater than all that precede it.
+ *
+ * @deprecated Use {@link FuzzyTermsEnum} instead.
+ */
+@Deprecated
+public final class SlowFuzzyTermsEnum extends FuzzyTermsEnum {
+
+ public SlowFuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
+ float minSimilarity, int prefixLength) throws IOException {
+ super(terms, atts, term, minSimilarity, prefixLength, false);
+ }
+
+ @Override
+ protected void maxEditDistanceChanged(BytesRef lastTerm, int maxEdits, boolean init)
+ throws IOException {
+ TermsEnum newEnum = getAutomatonEnum(maxEdits, lastTerm);
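+ // getAutomatonEnum returns null when maxEdits exceeds
+ // LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; in that case fall back to
+ // brute-force linear enumeration on the initial call, rather than throwing as
+ // FuzzyTermsEnum does.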
+ if (newEnum != null) {
+ setEnum(newEnum);
+ } else if (init) {
+ setEnum(new LinearFuzzyTermsEnum());
+ }
+ }
+
+ /**
+ * Implement fuzzy enumeration with linear brute force.
+ */
+ private class LinearFuzzyTermsEnum extends FilteredTermsEnum {
+ /* Allows us to save the time required to create a new array
+ * every time similarity is called.
+ */
+ private int[] d;
+ private int[] p;
+
+ // this is the text, minus the prefix
+ private final int[] text;
+
+ private final BoostAttribute boostAtt =
+ attributes().addAttribute(BoostAttribute.class);
+
+ /**
+ * Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
+ * length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
+ * <code>minSimilarity</code>.
+ *
+ * After calling the constructor the enumeration is already pointing to the first
+ * valid term if such a term exists.
+ *
+ * @throws IOException
+ */
+ public LinearFuzzyTermsEnum() throws IOException {
+ super(terms.iterator(null));
+
+ this.text = new int[termLength - realPrefixLength];
+ System.arraycopy(termText, realPrefixLength, text, 0, text.length);
+ final String prefix = UnicodeUtil.newString(termText, 0, realPrefixLength);
+ prefixBytesRef = new BytesRef(prefix);
+ this.d = new int[this.text.length + 1];
+ this.p = new int[this.text.length + 1];
+
+ setInitialSeekTerm(prefixBytesRef);
+ }
+
+ private final BytesRef prefixBytesRef;
+ // used for unicode conversion from BytesRef byte[] to int[]
+ private final IntsRef utf32 = new IntsRef(20);
+
+ /**
+ * Uses Levenshtein distance to calculate a similarity between the given term
+ * and the query term, accepting the term only when it is similar enough.
+ */
+ @Override
+ protected final AcceptStatus accept(BytesRef term) {
+ if (StringHelper.startsWith(term, prefixBytesRef)) {
+ UnicodeUtil.UTF8toUTF32(term, utf32);
+ final float similarity = similarity(utf32.ints, realPrefixLength, utf32.length - realPrefixLength);
+ if (similarity > minSimilarity) {
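+ // the boost grows with how far the similarity exceeds the required minimum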
+ boostAtt.setBoost((similarity - minSimilarity) * scale_factor);
+ return AcceptStatus.YES;
+ } else return AcceptStatus.NO;
+ } else {
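+ // terms are enumerated in sorted order, so once the shared prefix no longer
+ // matches, no later term can match either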
+ return AcceptStatus.END;
+ }
+ }
+
+ /******************************
+ * Compute Levenshtein distance
+ ******************************/
+
+ /**
+ * <p>Similarity returns a number that is 1.0f or less (including negative numbers)
+ * based on how similar the Term is compared to a target term. It returns
+ * exactly 0.0f when
+ *
+ * editDistance > maximumEditDistance
+ * Otherwise it returns:
+ *
+ * 1 - (editDistance / length)
+ * where length is the length of the shortest term (text or target) including a
+ * prefix that are identical and editDistance is the Levenshtein distance for
+ * the two words.
+ *
+ * Embedded within this algorithm is a fail-fast Levenshtein distance
+ * algorithm. The fail-fast algorithm differs from the standard Levenshtein
+ * distance algorithm in that it is aborted if it is discovered that the
+ * minimum distance between the words is greater than some threshold.
+ *
+ * <p>To calculate the maximum distance threshold we use the following formula:
+ *
+ * (1 - minimumSimilarity) * length
+ * where length is the shortest term including any prefix that is not part of the
+ * similarity comparison. This formula was derived by solving for what maximum value
+ * of distance returns false for the following statements:
+ *
+ * similarity = 1 - ((float)distance / (float) (prefixLength + Math.min(textlen, targetlen)));
+ * return (similarity > minimumSimilarity);
+ * where distance is the Levenshtein distance for the two words.
+ *
+ * Levenshtein distance (also known as edit distance) is a measure of similarity
+ * between two strings where the distance is measured as the number of character
+ * deletions, insertions or substitutions required to transform one string to
+ * the other string.
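+ * <p>For example, with no common prefix, comparing "aaaaa" to "aaabb" gives an
+ * edit distance of 2, so this method returns 1 - 2/5 = 0.6.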
+ * @param target the target word or phrase
+ * @return the similarity, 0.0 or less indicates that it matches less than the required
+ * threshold and 1.0 indicates that the text and target are identical
+ */
+ private final float similarity(final int[] target, int offset, int length) {
+ final int m = length;
+ final int n = text.length;
+ if (n == 0) {
+ //we don't have anything to compare. That means if we just add
+ //the letters for m we get the new word
+ return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) m / realPrefixLength);
+ }
+ if (m == 0) {
+ return realPrefixLength == 0 ? 0.0f : 1.0f - ((float) n / realPrefixLength);
+ }
+
+ final int maxDistance = calculateMaxDistance(m);
+
+ if (maxDistance < Math.abs(m-n)) {
+ //just adding the characters of m to n or vice-versa results in
+ //too many edits
+ //for example "pre" length is 3 and "prefixes" length is 8. We can see that
+ //given this optimal circumstance, the edit distance cannot be less than 5.
+ //which is 8-3 or more precisely Math.abs(3-8).
+ //if our maximum edit distance is 4, then we can discard this word
+ //without looking at it.
+ return Float.NEGATIVE_INFINITY;
+ }
+
+ // init p, the 'previous row' of the distance matrix
+ for (int i = 0; i <=n; ++i) {
+ p[i] = i;
+ }
+
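+ // classic two-row dynamic programming: p holds the previous row of the distance
+ // matrix and d the row being computed for the current target character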
+ // start computing edit distance
+ for (int j = 1; j<=m; ++j) { // iterates through target
+ int bestPossibleEditDistance = m;
+ final int t_j = target[offset+j-1]; // jth character of t
+ d[0] = j;
+
+ for (int i=1; i<=n; ++i) { // iterates through text
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
+ if (t_j != text[i-1]) {
+ d[i] = Math.min(Math.min(d[i-1], p[i]), p[i-1]) + 1;
+ } else {
+ d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]);
+ }
+ bestPossibleEditDistance = Math.min(bestPossibleEditDistance, d[i]);
+ }
+
+ //After calculating row i, the best possible edit distance
+ //can be found by finding the smallest value in a given column.
+ //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+ if (j > maxDistance && bestPossibleEditDistance > maxDistance) { //equal is okay, but not greater
+ //the closest the target can be to the text is just too far away.
+ //this target is leaving the party early.
+ return Float.NEGATIVE_INFINITY;
+ }
+
+ // copy current distance counts to 'previous row' distance counts: swap p and d
+ int _d[] = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+
+ // this will return less than 0.0 when the edit distance is
+ // greater than the number of characters in the shorter word.
+ // but this was the formula that was previously used in FuzzyTermEnum,
+ // so it has not been changed (even though minimumSimilarity must be
+ // greater than 0.0)
+ return 1.0f - ((float)p[n] / (float) (realPrefixLength + Math.min(n, m)));
+ }
+
+ /**
+ * The max Distance is the maximum Levenshtein distance for the text
+ * compared to some other value that results in score that is
+ * better than the minimum similarity.
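+ * For example, with minSimilarity = 0.5, no prefix, and a seven-character text and
+ * target, any distance above (1 - 0.5) * 7 = 3 falls below the minimum similarity,
+ * so 3 is the cutoff (ignoring the maxEdits cap).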
+ * @param m the length of the "other value"
+ * @return the maximum levenshtein distance that we care about
+ */
+ private int calculateMaxDistance(int m) {
+ return raw ? maxEdits : Math.min(maxEdits,
+ (int)((1-minSimilarity) * (Math.min(text.length, m) + realPrefixLength)));
+ }
+ }
+}
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java
new file mode 100644
index 00000000000..8557e3d832a
--- /dev/null
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery.java
@@ -0,0 +1,468 @@
+package org.apache.lucene.sandbox.queries;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.List;
+import java.util.Arrays;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+/**
+ * Tests {@link SlowFuzzyQuery}.
+ *
+ */
+public class TestSlowFuzzyQuery extends LuceneTestCase {
+
+ public void testFuzziness() throws Exception {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("aaaaa", writer);
+ addDoc("aaaab", writer);
+ addDoc("aaabb", writer);
+ addDoc("aabbb", writer);
+ addDoc("abbbb", writer);
+ addDoc("bbbbb", writer);
+ addDoc("ddddd", writer);
+
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ writer.close();
+
+ SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+
+ // same with prefix
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 1);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 3);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 4);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(2, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 5);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 6);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+
+ // test scoring
+ query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals("3 documents should match", 3, hits.length);
+ List<String> order = Arrays.asList("bbbbb","abbbb","aabbb");
+ for (int i = 0; i < hits.length; i++) {
+ final String term = searcher.doc(hits[i].doc).get("field");
+ //System.out.println(hits[i].score);
+ assertEquals(order.get(i), term);
+ }
+
+ // test pq size by supplying maxExpansions=2
+ // This query would normally return 3 documents, because 3 terms match (see above):
+ query = new SlowFuzzyQuery(new Term("field", "bbbbb"), SlowFuzzyQuery.defaultMinSimilarity, 0, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals("only 2 documents should match", 2, hits.length);
+ order = Arrays.asList("bbbbb","abbbb");
+ for (int i = 0; i < hits.length; i++) {
+ final String term = searcher.doc(hits[i].doc).get("field");
+ //System.out.println(hits[i].score);
+ assertEquals(order.get(i), term);
+ }
+
+ // not similar enough:
+ query = new SlowFuzzyQuery(new Term("field", "xxxxx"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "aaccc"), SlowFuzzyQuery.defaultMinSimilarity, 0); // edit distance to "aaaaa" = 3
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // query identical to a word in the index:
+ query = new SlowFuzzyQuery(new Term("field", "aaaaa"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
+ // default allows for up to two edits:
+ assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
+ assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
+
+ // query similar to a word in the index:
+ query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
+ assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
+ assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
+
+ // now with prefix
+ query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 1);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
+ assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
+ assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
+ query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
+ assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
+ assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
+ query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 3);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
+ assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
+ assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
+ query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 4);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(2, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
+ assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
+ query = new SlowFuzzyQuery(new Term("field", "aaaac"), SlowFuzzyQuery.defaultMinSimilarity, 5);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+
+ query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
+
+ // now with prefix
+ query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 1);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
+ query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
+ query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 3);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
+ query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 4);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
+ query = new SlowFuzzyQuery(new Term("field", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 5);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+
+ // different field = no match:
+ query = new SlowFuzzyQuery(new Term("anotherfield", "ddddX"), SlowFuzzyQuery.defaultMinSimilarity, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ reader.close();
+ directory.close();
+ }
+
+ public void testFuzzinessLong() throws Exception {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("aaaaaaa", writer);
+ addDoc("segment", writer);
+
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ writer.close();
+
+ SlowFuzzyQuery query;
+ // not similar enough:
+ query = new SlowFuzzyQuery(new Term("field", "xxxxx"), 0.5f, 0);
+ ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+ // edit distance to "aaaaaaa" = 3, this matches because the string is longer than
+ // in testDefaultFuzziness so a bigger difference is allowed:
+ query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
+
+ // now with prefix
+ query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 1);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
+ query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 4);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaaaa"));
+ query = new SlowFuzzyQuery(new Term("field", "aaaaccc"), 0.5f, 5);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // no match, more than half of the characters is wrong:
+ query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // now with prefix
+ query = new SlowFuzzyQuery(new Term("field", "aaacccc"), 0.5f, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // "student" and "stellent" are indeed similar to "segment" by default:
+ query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+
+ // now with prefix
+ query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 1);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 1);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "student"), 0.5f, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+ query = new SlowFuzzyQuery(new Term("field", "stellent"), 0.5f, 2);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // "student" doesn't match anymore thanks to increased minimum similarity:
+ query = new SlowFuzzyQuery(new Term("field", "student"), 0.6f, 0);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ try {
+ query = new SlowFuzzyQuery(new Term("field", "student"), 1.1f);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expecting exception
+ }
+ try {
+ query = new SlowFuzzyQuery(new Term("field", "student"), -0.1f);
+ fail("Expected IllegalArgumentException");
+ } catch (IllegalArgumentException e) {
+ // expecting exception
+ }
+
+ reader.close();
+ directory.close();
+ }
+
+ /**
+ * MultiTermQuery provides (via attribute) information about which values
+ * must be competitive to enter the priority queue.
+ *
+ * SlowFuzzyQuery optimizes itself around this information; if the attribute
+ * is not implemented correctly, there will be problems!
+ */
+ public void testTieBreaker() throws Exception {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("a123456", writer);
+ addDoc("c123456", writer);
+ addDoc("d123456", writer);
+ addDoc("e123456", writer);
+
+ Directory directory2 = newDirectory();
+ RandomIndexWriter writer2 = new RandomIndexWriter(random(), directory2);
+ addDoc("a123456", writer2);
+ addDoc("b123456", writer2);
+ addDoc("b123456", writer2);
+ addDoc("b123456", writer2);
+ addDoc("c123456", writer2);
+ addDoc("f123456", writer2);
+
+ IndexReader ir1 = writer.getReader();
+ IndexReader ir2 = writer2.getReader();
+
+ MultiReader mr = new MultiReader(ir1, ir2);
+ IndexSearcher searcher = newSearcher(mr);
+ SlowFuzzyQuery fq = new SlowFuzzyQuery(new Term("field", "z123456"), 1f, 0, 2);
+ TopDocs docs = searcher.search(fq, 2);
+ assertEquals(5, docs.totalHits); // 5 docs, from the a and b's
+ mr.close();
+ ir1.close();
+ ir2.close();
+ writer.close();
+ writer2.close();
+ directory.close();
+ directory2.close();
+ }
+
+ public void testTokenLengthOpt() throws IOException {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("12345678911", writer);
+ addDoc("segment", writer);
+
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ writer.close();
+
+ Query query;
+ // term not over 10 chars, so optimization shortcuts
+ query = new SlowFuzzyQuery(new Term("field", "1234569"), 0.9f);
+ ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // 10 chars, so no optimization
+ query = new SlowFuzzyQuery(new Term("field", "1234567891"), 0.9f);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ // over 10 chars, so no optimization
+ query = new SlowFuzzyQuery(new Term("field", "12345678911"), 0.9f);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(1, hits.length);
+
+ // over 10 chars, no match
+ query = new SlowFuzzyQuery(new Term("field", "sdfsdfsdfsdf"), 0.9f);
+ hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(0, hits.length);
+
+ reader.close();
+ directory.close();
+ }
+
+ /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method. */
+ public void testBoostOnlyRewrite() throws Exception {
+ Directory directory = newDirectory();
+ RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
+ addDoc("Lucene", writer);
+ addDoc("Lucene", writer);
+ addDoc("Lucenne", writer);
+
+ IndexReader reader = writer.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ writer.close();
+
+ SlowFuzzyQuery query = new SlowFuzzyQuery(new Term("field", "lucene"));
+ query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
+ ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
+ assertEquals(3, hits.length);
+ // normally, 'Lucenne' would be the first result as IDF will skew the score.
+ assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
+ assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
+ assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
+ reader.close();
+ directory.close();
+ }
+
+ public void testGiga() throws Exception {
+
+ MockAnalyzer analyzer = new MockAnalyzer(random());
+ Directory index = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), index);
+
+ addDoc("Lucene in Action", w);
+ addDoc("Lucene for Dummies", w);
+
+ //addDoc("Giga", w);
+ addDoc("Giga byte", w);
+
+ addDoc("ManagingGigabytesManagingGigabyte", w);
+ addDoc("ManagingGigabytesManagingGigabytes", w);
+
+ addDoc("The Art of Computer Science", w);
+ addDoc("J. K. Rowling", w);
+ addDoc("JK Rowling", w);
+ addDoc("Joanne K Roling", w);
+ addDoc("Bruce Willis", w);
+ addDoc("Willis bruce", w);
+ addDoc("Brute willis", w);
+ addDoc("B. willis", w);
+ IndexReader r = w.getReader();
+ w.close();
+
+ Query q = new SlowFuzzyQuery(new Term("field", "giga"), 0.9f);
+
+ // 3. search
+ IndexSearcher searcher = newSearcher(r);
+ ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
+ r.close();
+ index.close();
+ }
+
+ public void testDistanceAsEditsSearching() throws Exception {
+ Directory index = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), index);
+ addDoc("foobar", w);
+ addDoc("test", w);
+ addDoc("working", w);
+ IndexReader reader = w.getReader();
+ IndexSearcher searcher = newSearcher(reader);
+ w.close();
+
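+ // in these queries, whole-number values such as 2 and 3 act as raw edit-distance
+ // budgets rather than similarity ratios (the 'raw' mode consulted by calculateMaxDistance)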
+ SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", "fouba"), 2);
+ ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
+
+ q = new SlowFuzzyQuery(new Term("field", "foubara"), 2);
+ hits = searcher.search(q, 10).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));
+
+ q = new SlowFuzzyQuery(new Term("field", "t"), 3);
+ hits = searcher.search(q, 10).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals("test", searcher.doc(hits[0].doc).get("field"));
+
+ q = new SlowFuzzyQuery(new Term("field", "a"), 4f, 0, 50);
+ hits = searcher.search(q, 10).scoreDocs;
+ assertEquals(1, hits.length);
+ assertEquals("test", searcher.doc(hits[0].doc).get("field"));
+
+ q = new SlowFuzzyQuery(new Term("field", "a"), 6f, 0, 50);
+ hits = searcher.search(q, 10).scoreDocs;
+ assertEquals(2, hits.length);
+ assertEquals("test", searcher.doc(hits[0].doc).get("field"));
+ assertEquals("foobar", searcher.doc(hits[1].doc).get("field"));
+
+ reader.close();
+ index.close();
+ }
+
+ private void addDoc(String text, RandomIndexWriter writer) throws IOException {
+ Document doc = new Document();
+ doc.add(newField("field", text, TextField.TYPE_STORED));
+ writer.addDocument(doc);
+ }
+}
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery2.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery2.java
similarity index 95%
rename from lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery2.java
rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery2.java
index cda3d482bf5..ce1a9641cc5 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery2.java
+++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/TestSlowFuzzyQuery2.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.search;
+package org.apache.lucene.sandbox.queries;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@@ -55,7 +58,7 @@ import org.apache.lucene.util.LuceneTestCase;
*
* results line: comma-separated docID, score pair
**/
-public class TestFuzzyQuery2 extends LuceneTestCase {
+public class TestSlowFuzzyQuery2 extends LuceneTestCase {
/** epsilon for score comparisons */
static final float epsilon = 0.00001f;
@@ -115,7 +118,7 @@ public class TestFuzzyQuery2 extends LuceneTestCase {
int prefix = Integer.parseInt(params[1]);
int pqSize = Integer.parseInt(params[2]);
float minScore = Float.parseFloat(params[3]);
- FuzzyQuery q = new FuzzyQuery(new Term("field", query), minScore, prefix);
+ SlowFuzzyQuery q = new SlowFuzzyQuery(new Term("field", query), minScore, prefix);
q.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(pqSize));
int expectedResults = Integer.parseInt(reader.readLine());
TopDocs docs = searcher.search(q, expectedResults);
diff --git a/lucene/core/src/test/org/apache/lucene/search/fuzzyTestData.txt b/lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/fuzzyTestData.txt
similarity index 100%
rename from lucene/core/src/test/org/apache/lucene/search/fuzzyTestData.txt
rename to lucene/sandbox/src/test/org/apache/lucene/sandbox/queries/fuzzyTestData.txt