Hunspell suggestions: speed up for some non-Latin scripts (#19)

2021-03-15 10:02:45 +01:00 · 2021-03-15 10:02:45 +01:00 · cdff0accaa
parent 8913a98379
commit cdff0accaa
3 changed files with 29 additions and 7 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@@ -94,10 +94,12 @@ class GeneratingSuggester {
            return;
          }

-          int sc =
-              automaton.ngramScore(rootChars)
-                  - longerWorsePenalty(word.length(), rootChars.length)
-                  + commonPrefix(word, rootChars);
+          int sc = automaton.ngramScore(rootChars);
+          if (sc == 0) {
+            return; // no common characters at all, don't suggest this root
+          }
+
+          sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);

          if (roots.size() == MAX_ROOTS && sc < roots.peek().score) {
            return;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TrigramAutomaton.java
@@ -33,6 +33,7 @@ class TrigramAutomaton {
  private final CharacterRunAutomaton automaton;
  private final int[] state2Score;
  private final FixedBitSet countedSubstrings;
+  private final char minChar;

  TrigramAutomaton(String s1) {
    Map<String, Integer> substringCounts = new HashMap<>();
@@ -40,6 +41,8 @@ class TrigramAutomaton {
    Automaton.Builder builder = new Automaton.Builder(s1.length() * N, s1.length() * N);
    int initialState = builder.createState();

+    minChar = (char) s1.chars().min().orElseThrow();
+
    for (int start = 0; start < s1.length(); start++) {
      int limit = Math.min(s1.length(), start + N);
      for (int end = start + 1; end <= limit; end++) {
@@ -49,7 +52,7 @@ class TrigramAutomaton {
      int state = initialState;
      for (int i = start; i < limit; i++) {
        int next = builder.createState();
-        builder.addTransition(state, next, s1.charAt(i));
+        builder.addTransition(state, next, s1.charAt(i) - minChar);
        state = next;
      }
    }
@@ -70,7 +73,7 @@ class TrigramAutomaton {
  private int runAutomatonOnStringChars(String s) {
    int state = 0;
    for (int i = 0; i < s.length(); i++) {
-      state = automaton.step(state, s.charAt(i));
+      state = automaton.step(state, s.charAt(i) - minChar);
    }
    return state;
  }
@@ -86,6 +89,11 @@ class TrigramAutomaton {
    int limit = s2.length + s2.offset;
    for (int i = s2.offset; i < limit; i++) {
      char c = transformChar(s2.chars[i]);
+      if (c < minChar) {
+        state1 = state2 = -1;
+        continue;
+      }
+      c -= minChar;

      int state3 = state2 <= 0 ? 0 : automaton.step(state2, c);
      if (state3 > 0) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@@ -65,6 +65,16 @@ public class TestPerformance extends LuceneTestCase {
    checkSuggestionPerformance("en", 3_000);
  }

+  @Test
+  public void ru() throws Exception {
+    checkAnalysisPerformance("ru", 400_000);
+  }
+
+  @Test
+  public void ru_suggest() throws Exception {
+    checkSuggestionPerformance("ru", 1000);
+  }
+
  @Test
  public void de() throws Exception {
    checkAnalysisPerformance("de", 300_000);
@@ -121,6 +131,7 @@ public class TestPerformance extends LuceneTestCase {
    Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
    List<String> words =
        loadWords(code, wordCount, dictionary).stream()
+            .distinct()
            .filter(w -> hasQuickSuggestions(speller, w))
            .collect(Collectors.toList());
    System.out.println("Checking " + words.size() + " misspelled words");
@@ -181,7 +192,8 @@ public class TestPerformance extends LuceneTestCase {
        String line = reader.readLine();
        if (line == null) break;

-        for (String token : line.split("[^a-zA-Z" + Pattern.quote(dictionary.wordChars) + "]+")) {
+        for (String token :
+            line.split("[^\\p{IsLetter}" + Pattern.quote(dictionary.wordChars) + "]+")) {
          String word = stripPunctuation(token);
          if (word != null) {
            words.add(word);