diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java index 8e0cfff612a..3c1eacd80ee 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java @@ -31,7 +31,7 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata; * though you can explicitly choose classic Levenshtein by passing false * to the transpositions parameter. * - *

This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite} + *

This query uses {@link MultiTermQuery.TopTermsBlendedFreqScoringRewrite} * as default. So terms will be collected and scored according to their * edit distance. Only the top terms are used for building the {@link BooleanQuery}. * It is not recommended to change the rewrite mode for fuzzy queries. diff --git a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java index e79dbf2c4c1..66a64e14527 100644 --- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java @@ -329,7 +329,7 @@ public class FuzzyTermsEnum extends TermsEnum { //System.out.println("AFTE.accept term=" + term); int ed = matchers.length - 1; - // we are wrapping either an intersect() TermsEnum or an AutomatonTermsENum, + // we are wrapping either an intersect() TermsEnum or an AutomatonTermsEnum, // so we know the outer DFA always matches. // now compute exact edit distance while (ed > 0) { diff --git a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java index 79dc157d883..1e90525d891 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java @@ -18,13 +18,19 @@ package org.apache.lucene.search; import java.io.IOException; +import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.MultiReader; import 
org.apache.lucene.index.RandomIndexWriter; @@ -32,7 +38,10 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.automaton.LevenshteinAutomata; /** @@ -489,4 +498,210 @@ public class TestFuzzyQuery extends LuceneTestCase { doc.add(newTextField("field", text, Field.Store.YES)); writer.addDocument(doc); } + + private String randomSimpleString(int digits) { + int termLength = TestUtil.nextInt(random(), 1, 8); + char[] chars = new char[termLength]; + for(int i=0;i terms = new HashSet<>(); + while (terms.size() < numTerms) { + terms.add(randomSimpleString(digits)); + } + + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir); + for(String term : terms) { + Document doc = new Document(); + doc.add(new StringField("field", term, Field.Store.YES)); + w.addDocument(doc); + } + DirectoryReader r = w.getReader(); + //System.out.println("TEST: reader=" + r); + IndexSearcher s = newSearcher(r); + int iters = atLeast(1000); + for(int iter=0;iter[] expected = new List[3]; + for(int ed=0;ed<3;ed++) { + expected[ed] = new ArrayList(); + } + for(String term : terms) { + if (term.startsWith(queryPrefix) == false) { + continue; + } + int ed = getDistance(term, queryTerm); + if (Math.min(queryTerm.length(), term.length()) > ed) { + float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length()); + while (ed < 3) { + expected[ed].add(new TermAndScore(term, score)); + ed++; + } + } + } + + for(int ed=0;ed<3;ed++) { + Collections.sort(expected[ed]); + int queueSize = TestUtil.nextInt(random(), 1, terms.size()); + /* + System.out.println("\nTEST: query=" + queryTerm + " ed=" + ed + " queueSize=" 
+ queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength); + for(TermAndScore ent : expected[ed]) { + System.out.println(" " + ent); + } + */ + FuzzyQuery query = new FuzzyQuery(new Term("field", queryTerm), ed, prefixLength, queueSize, true); + TopDocs hits = s.search(query, terms.size()); + Set actual = new HashSet<>(); + for(ScoreDoc hit : hits.scoreDocs) { + Document doc = s.doc(hit.doc); + actual.add(doc.get("field")); + //System.out.println(" actual: " + doc.get("field") + " score=" + hit.score); + } + Set expectedTop = new HashSet<>(); + int limit = Math.min(queueSize, expected[ed].size()); + for(int i=0;i { + final String term; + final float score; + + public TermAndScore(String term, float score) { + this.term = term; + this.score = score; + } + + @Override + public int compareTo(TermAndScore other) { + // higher score sorts first, and if scores are tied, lower term sorts first + if (score > other.score) { + return -1; + } else if (score < other.score) { + return 1; + } else { + return term.compareTo(other.term); + } + } + + @Override + public String toString() { + return term + " score=" + score; + } + } + + // Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions (treats them as ed=1, not ed=2) + private static int getDistance(String target, String other) { + IntsRef targetPoints; + IntsRef otherPoints; + int n; + int d[][]; // cost array + + // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to + // what LevenshteinDistance does, except cycling through a ring of three + // horizontal cost arrays... but this comparator is never actually used by + // DirectSpellChecker, it's only used for merging results from multiple shards + // in "distributed spellcheck", and it's inefficient in other ways too... 
+ + // cheaper to do this up front once + targetPoints = toIntsRef(target); + otherPoints = toIntsRef(other); + n = targetPoints.length; + final int m = otherPoints.length; + d = new int[n+1][m+1]; + + if (n == 0 || m == 0) { + if (n == m) { + return 0; + } + else { + return Math.max(n, m); + } + } + + // indexes into the code points of target and other + int i; // iterates through target + int j; // iterates through other + + int t_j; // jth code point of other + + int cost; // substitution cost: 0 if the code points match, else 1 + + for (i = 0; i<=n; i++) { + d[i][0] = i; + } + + for (j = 0; j<=m; j++) { + d[0][j] = j; + } + + for (j = 1; j<=m; j++) { + t_j = otherPoints.ints[j-1]; + + for (i=1; i<=n; i++) { + cost = targetPoints.ints[i-1]==t_j ? 0 : 1; + // minimum of cell to the left+1, to the top+1, diagonally left and up +cost + d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost); + // transposition + if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) { + d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost); + } + } + } + + return d[n][m]; + } + + private static IntsRef toIntsRef(String s) { + IntsRef ref = new IntsRef(s.length()); // worst case + int utf16Len = s.length(); + for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) { + cp = ref.ints[ref.length++] = Character.codePointAt(s, i); + } + return ref; + } }