LUCENE-7439: back port test case to 6.x

2025-02-13 13:35:37 +00:00 · 2016-09-13 05:22:07 -04:00 · 2016-09-13 05:22:07 -04:00 · ba085d4c84
commit ba085d4c84
parent 08453fb7f0
3 changed files with 217 additions and 2 deletions
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyQuery.java
@ -31,7 +31,7 @@ import org.apache.lucene.util.automaton.LevenshteinAutomata;
 * though you can explicitly choose classic Levenshtein by passing <code>false</code>
 * to the <code>transpositions</code> parameter.
 * 
- * <p>This query uses {@link MultiTermQuery.TopTermsScoringBooleanQueryRewrite}
+ * <p>This query uses {@link MultiTermQuery.TopTermsBlendedFreqScoringRewrite}
 * as default. So terms will be collected and scored according to their
 * edit distance. Only the top terms are used for building the {@link BooleanQuery}.
 * It is not recommended to change the rewrite mode for fuzzy queries.
--- a/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FuzzyTermsEnum.java
@ -329,7 +329,7 @@ public class FuzzyTermsEnum extends TermsEnum {
      //System.out.println("AFTE.accept term=" + term);
      int ed = matchers.length - 1;
      
-      // we are wrapping either an intersect() TermsEnum or an AutomatonTermsENum,
+      // we are wrapping either an intersect() TermsEnum or an AutomatonTermsEnum,
      // so we know the outer DFA always matches.
      // now compute exact edit distance
      while (ed > 0) {
--- a/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestFuzzyQuery.java
@ -18,13 +18,19 @@ package org.apache.lucene.search;


 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;

 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.RandomIndexWriter;
@ -32,7 +38,10 @@ import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.similarities.ClassicSimilarity;
 import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;

 /**
@ -489,4 +498,210 @@ public class TestFuzzyQuery extends LuceneTestCase {
    doc.add(newTextField("field", text, Field.Store.YES));
    writer.addDocument(doc);
  }
+
+  private String randomSimpleString(int digits) {
+    int termLength = TestUtil.nextInt(random(), 1, 8);
+    char[] chars = new char[termLength];
+    for(int i=0;i<termLength;i++) {
+      chars[i] = (char) ('a' + random().nextInt(digits));
+    }
+    return new String(chars);
+  }
+
+  @SuppressWarnings({"unchecked","rawtypes"})
+  public void testRandom() throws Exception {
+    int numTerms = atLeast(100);
+    int digits = TestUtil.nextInt(random(), 2, 3);
+    Set<String> terms = new HashSet<>();
+    while (terms.size() < numTerms) {
+      terms.add(randomSimpleString(digits));
+    }
+
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
+    for(String term : terms) {
+      Document doc = new Document();
+      doc.add(new StringField("field", term, Field.Store.YES));
+      w.addDocument(doc);
+    }
+    DirectoryReader r = w.getReader();
+    //System.out.println("TEST: reader=" + r);
+    IndexSearcher s = newSearcher(r);
+    int iters = atLeast(1000);
+    for(int iter=0;iter<iters;iter++) {
+      String queryTerm = randomSimpleString(digits);
+      int prefixLength = random().nextInt(queryTerm.length());
+      String queryPrefix = queryTerm.substring(0, prefixLength);
+
+      // we don't look at scores here:
+      List<TermAndScore>[] expected = new List[3];
+      for(int ed=0;ed<3;ed++) {
+        expected[ed] = new ArrayList<TermAndScore>();
+      }
+      for(String term : terms) {
+        if (term.startsWith(queryPrefix) == false) {
+          continue;
+        }
+        int ed = getDistance(term, queryTerm);
+        if (Math.min(queryTerm.length(), term.length()) > ed) {        
+          float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
+          while (ed < 3) {
+            expected[ed].add(new TermAndScore(term, score));
+            ed++;
+          }
+        }
+      }
+
+      for(int ed=0;ed<3;ed++) {
+        Collections.sort(expected[ed]);
+        int queueSize = TestUtil.nextInt(random(), 1, terms.size());
+        /*
+        System.out.println("\nTEST: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength);
+        for(TermAndScore ent : expected[ed]) {
+          System.out.println("  " + ent);
+        }
+        */
+        FuzzyQuery query = new FuzzyQuery(new Term("field", queryTerm), ed, prefixLength, queueSize, true);
+        TopDocs hits = s.search(query, terms.size());
+        Set<String> actual = new HashSet<>();
+        for(ScoreDoc hit : hits.scoreDocs) {
+          Document doc = s.doc(hit.doc);
+          actual.add(doc.get("field"));
+          //System.out.println("   actual: " + doc.get("field") + " score=" + hit.score);
+        }
+        Set<String> expectedTop = new HashSet<>();
+        int limit = Math.min(queueSize, expected[ed].size());
+        for(int i=0;i<limit;i++) {
+          expectedTop.add(expected[ed].get(i).term);
+        }
+        
+        if (actual.equals(expectedTop) == false) {
+          StringBuilder sb = new StringBuilder();
+          sb.append("FAILED: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength + "\n");
+
+          boolean first = true;
+          for(String term : actual) {
+            if (expectedTop.contains(term) == false) {
+              if (first) {
+                sb.append("  these matched but shouldn't:\n");
+                first = false;
+              }
+              sb.append("    " + term + "\n");
+            }
+          }
+          first = true;
+          for(String term : expectedTop) {
+            if (actual.contains(term) == false) {
+              if (first) {
+                sb.append("  these did not match but should:\n");
+                first = false;
+              }
+              sb.append("    " + term + "\n");
+            }
+          }
+          throw new AssertionError(sb.toString());
+        }
+      }
+    }
+    
+    IOUtils.close(r, w, dir);
+  }
+
+  private static class TermAndScore implements Comparable<TermAndScore> {
+    final String term;
+    final float score;
+    
+    public TermAndScore(String term, float score) {
+      this.term = term;
+      this.score = score;
+    }
+
+    @Override
+    public int compareTo(TermAndScore other) {
+      // higher score sorts first, and if scores are tied, lower term sorts first
+      if (score > other.score) {
+        return -1;
+      } else if (score < other.score) {
+        return 1;
+      } else {
+        return term.compareTo(other.term);
+      }
+    }
+
+    @Override
+    public String toString() {
+      return term + " score=" + score;
+    }
+  }
+
+  // Poached from LuceneLevenshteinDistance.java (from suggest module): it supports transpositions (treats them as ed=1, not ed=2)
+  private static int getDistance(String target, String other) {
+    IntsRef targetPoints;
+    IntsRef otherPoints;
+    int n;
+    int d[][]; // cost array
+    
+    // NOTE: if we cared, we could 3*m space instead of m*n space, similar to 
+    // what LevenshteinDistance does, except cycling thru a ring of three 
+    // horizontal cost arrays... but this comparator is never actually used by 
+    // DirectSpellChecker, it's only used for merging results from multiple shards 
+    // in "distributed spellcheck", and it's inefficient in other ways too...
+
+    // cheaper to do this up front once
+    targetPoints = toIntsRef(target);
+    otherPoints = toIntsRef(other);
+    n = targetPoints.length;
+    final int m = otherPoints.length;
+    d = new int[n+1][m+1];
+    
+    if (n == 0 || m == 0) {
+      if (n == m) {
+        return 0;
+      }
+      else {
+        return Math.max(n, m);
+      }
+    } 
+
+    // indexes into strings s and t
+    int i; // iterates through s
+    int j; // iterates through t
+
+    int t_j; // jth character of t
+
+    int cost; // cost
+
+    for (i = 0; i<=n; i++) {
+      d[i][0] = i;
+    }
+    
+    for (j = 0; j<=m; j++) {
+      d[0][j] = j;
+    }
+
+    for (j = 1; j<=m; j++) {
+      t_j = otherPoints.ints[j-1];
+
+      for (i=1; i<=n; i++) {
+        cost = targetPoints.ints[i-1]==t_j ? 0 : 1;
+        // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+        d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
+        // transposition
+        if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
+          d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
+        }
+      }
+    }
+    
+    return d[n][m];
+  }
+  
+  private static IntsRef toIntsRef(String s) {
+    IntsRef ref = new IntsRef(s.length()); // worst case
+    int utf16Len = s.length();
+    for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
+      cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
+    }
+    return ref;
+  }
 }