diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
index 8e0cfff612a..3c1eacd80ee 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
@@ -31,7 +31,7 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
* though you can explicitly choose classic Levenshtein by passing false
 * to the <code>transpositions</code>
 * parameter.
*
- * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
+ * <p>This query uses {@link MultiTermQuery.TopTermsBlendedFreqScoringRewrite}
* as default. So terms will be collected and scored according to their
* edit distance. Only the top terms are used for building the {@link BooleanQuery}.
* It is not recommended to change the rewrite mode for fuzzy queries.
diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
index e79dbf2c4c1..66a64e14527 100644
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@@ -329,7 +329,7 @@ public class FuzzyTermsEnum extends TermsEnum {
//System.out.println("AFTE.accept term=" + term);
int ed = matchers.length - 1;
- // we are wrapping either an intersect() TermsEnum or an AutomatonTermsENum,
+ // we are wrapping either an intersect() TermsEnum or an AutomatonTermsEnum,
// so we know the outer DFA always matches.
// now compute exact edit distance
while (ed > 0) {
diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
index 79dc157d883..1e90525d891 100644
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@@ -18,13 +18,19 @@ package org.apache.lucene.search;
import java.io.IOException;
+import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
@@ -32,7 +38,10 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
@@ -489,4 +498,210 @@ public class TestFuzzyQuery extends LuceneTestCase {
doc.add(newTextField("field", text, Field.Store.YES));
writer.addDocument(doc);
}
+
+ private String randomSimpleString(int digits) {
+ int termLength = TestUtil.nextInt(random(), 1, 8);
+ char[] chars = new char[termLength];
+    for(int i=0;i<termLength;i++) {
+      chars[i] = (char) ('a' + random().nextInt(digits));
+    }
+    return new String(chars);
+  }
+
+  public void testRandom() throws Exception {
+    int digits = TestUtil.nextInt(random(), 2, 3);
+    int numTerms = atLeast(100);
+    Set<String> terms = new HashSet<>();
+ while (terms.size() < numTerms) {
+ terms.add(randomSimpleString(digits));
+ }
+
+ Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+ for(String term : terms) {
+ Document doc = new Document();
+ doc.add(new StringField("field", term, Field.Store.YES));
+ w.addDocument(doc);
+ }
+ DirectoryReader r = w.getReader();
+ //System.out.println("TEST: reader=" + r);
+ IndexSearcher s = newSearcher(r);
+ int iters = atLeast(1000);
+    for(int iter=0;iter<iters;iter++) {
+      String queryTerm = randomSimpleString(digits);
+      int prefixLength = random().nextInt(queryTerm.length());
+      String queryPrefix = queryTerm.substring(0, prefixLength);
+      @SuppressWarnings({"rawtypes","unchecked"}) List<TermAndScore>[] expected = new List[3];
+ for(int ed=0;ed<3;ed++) {
+        expected[ed] = new ArrayList<TermAndScore>();
+ }
+ for(String term : terms) {
+ if (term.startsWith(queryPrefix) == false) {
+ continue;
+ }
+ int ed = getDistance(term, queryTerm);
+ if (Math.min(queryTerm.length(), term.length()) > ed) {
+ float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
+ while (ed < 3) {
+ expected[ed].add(new TermAndScore(term, score));
+ ed++;
+ }
+ }
+ }
+
+ for(int ed=0;ed<3;ed++) {
+ Collections.sort(expected[ed]);
+ int queueSize = TestUtil.nextInt(random(), 1, terms.size());
+ /*
+ System.out.println("\nTEST: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength);
+ for(TermAndScore ent : expected[ed]) {
+ System.out.println(" " + ent);
+ }
+ */
+ FuzzyQuery query = new FuzzyQuery(new Term("field", queryTerm), ed, prefixLength, queueSize, true);
+ TopDocs hits = s.search(query, terms.size());
+        Set<String> actual = new HashSet<>();
+ for(ScoreDoc hit : hits.scoreDocs) {
+ Document doc = s.doc(hit.doc);
+ actual.add(doc.get("field"));
+ //System.out.println(" actual: " + doc.get("field") + " score=" + hit.score);
+ }
+        Set<String> expectedTop = new HashSet<>();
+ int limit = Math.min(queueSize, expected[ed].size());
+        for(int i=0;i<limit;i++) {
+          expectedTop.add(expected[ed].get(i).term);
+        }
+        assertEquals(expectedTop, actual);
+      }
+    }
+
+    IOUtils.close(r, w, dir);
+  }
+
+  private static class TermAndScore implements Comparable<TermAndScore> {
+ final String term;
+ final float score;
+
+ public TermAndScore(String term, float score) {
+ this.term = term;
+ this.score = score;
+ }
+
+ @Override
+ public int compareTo(TermAndScore other) {
+ // higher score sorts first, and if scores are tied, lower term sorts first
+ if (score > other.score) {
+ return -1;
+ } else if (score < other.score) {
+ return 1;
+ } else {
+ return term.compareTo(other.term);
+ }
+ }
+
+ @Override
+ public String toString() {
+ return term + " score=" + score;
+ }
+ }
+
+ // Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions (treats them as ed=1, not ed=2)
+ private static int getDistance(String target, String other) {
+ IntsRef targetPoints;
+ IntsRef otherPoints;
+ int n;
+ int d[][]; // cost array
+
+ // NOTE: if we cared, we could 3*m space instead of m*n space, similar to
+ // what LevenshteinDistance does, except cycling thru a ring of three
+ // horizontal cost arrays... but this comparator is never actually used by
+ // DirectSpellChecker, it's only used for merging results from multiple shards
+ // in "distributed spellcheck", and it's inefficient in other ways too...
+
+ // cheaper to do this up front once
+ targetPoints = toIntsRef(target);
+ otherPoints = toIntsRef(other);
+ n = targetPoints.length;
+ final int m = otherPoints.length;
+ d = new int[n+1][m+1];
+
+ if (n == 0 || m == 0) {
+ if (n == m) {
+ return 0;
+ }
+ else {
+ return Math.max(n, m);
+ }
+ }
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ int t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i<=n; i++) {
+ d[i][0] = i;
+ }
+
+ for (j = 0; j<=m; j++) {
+ d[0][j] = j;
+ }
+
+ for (j = 1; j<=m; j++) {
+ t_j = otherPoints.ints[j-1];
+
+ for (i=1; i<=n; i++) {
+ cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
+ // transposition
+ if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
+ d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
+ }
+ }
+ }
+
+ return d[n][m];
+ }
+
+ private static IntsRef toIntsRef(String s) {
+ IntsRef ref = new IntsRef(s.length()); // worst case
+ int utf16Len = s.length();
+ for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
+ cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
+ }
+ return ref;
+ }
}