LUCENE-9801: Hunspell suggestions: speed up expandWord by enumerating only applicable affixes (#2416)

2025-02-28 21:39:25 +00:00 · 2021-02-23 05:25:21 +01:00 · 2021-02-23 05:25:21 +01:00 · 34993c22dd
commit 34993c22dd
parent af49df4851
4 changed files with 89 additions and 84 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -297,29 +297,29 @@ public class Dictionary {
    final FST.BytesReader bytesReader = fst.getBytesReader();
    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
    // Accumulate output as we go
-    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-    IntsRef output = NO_OUTPUT;
+    IntsRef output = fst.outputs.getNoOutput();

    int l = offset + length;
-    try {
-      for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
-        cp = Character.codePointAt(word, i, l);
-        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
-          return null;
-        } else if (arc.output() != NO_OUTPUT) {
-          output = fst.outputs.add(output, arc.output());
-        }
-      }
-      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+    for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
+      cp = Character.codePointAt(word, i, l);
+      output = nextArc(fst, arc, bytesReader, output, cp);
+      if (output == null) {
+        return null;
+      }
+    }
+    return nextArc(fst, arc, bytesReader, output, FST.END_LABEL);
+  }
+
+  static IntsRef nextArc(
+      FST<IntsRef> fst, FST.Arc<IntsRef> arc, FST.BytesReader reader, IntsRef output, int ch) {
+    try {
+      if (fst.findTargetArc(ch, arc, arc, reader) == null) {
        return null;
-      } else if (arc.output() != NO_OUTPUT) {
-        return fst.outputs.add(output, arc.output());
-      } else {
-        return output;
      }
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
+    return fst.outputs.add(output, arc.output());
  }

  /**
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -175,67 +175,85 @@ class GeneratingSuggester {
    }

    // suffixes
-    processFST(
-        dictionary.suffixes,
-        (key, ids) -> {
-          String suffix = new StringBuilder(toString(key)).reverse().toString();
-          if (misspelled.length() <= suffix.length() || !misspelled.endsWith(suffix)) return;
+    processAffixes(
+        false,
+        misspelled,
+        (suffixLength, suffixId) -> {
+          if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
+            return;
+          }

-          for (int i = 0; i < ids.length; i++) {
-            int suffixId = ids.ints[ids.offset + i];
-            if (!hasCompatibleFlags(root, suffixId) || !checkAffixCondition(suffixId, root.word)) {
-              continue;
-            }
-
-            String withSuffix =
-                root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
-            result.add(withSuffix);
-            if (dictionary.isCrossProduct(suffixId)) {
-              crossProducts.add(withSuffix);
-            }
+          String suffix = misspelled.substring(misspelled.length() - suffixLength);
+          String withSuffix =
+              root.word.substring(0, root.word.length() - affixStripLength(suffixId)) + suffix;
+          result.add(withSuffix);
+          if (dictionary.isCrossProduct(suffixId)) {
+            crossProducts.add(withSuffix);
          }
        });

    // cross-product prefixes
-    processFST(
-        dictionary.prefixes,
-        (key, ids) -> {
-          String prefix = toString(key);
-          if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
+    processAffixes(
+        true,
+        misspelled,
+        (prefixLength, prefixId) -> {
+          if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
+              || !dictionary.isCrossProduct(prefixId)) {
+            return;
+          }

-          for (int i = 0; i < ids.length; i++) {
-            int prefixId = ids.ints[ids.offset + i];
-            if (!dictionary.hasFlag(root.entryId, dictionary.affixData(prefixId, AFFIX_FLAG))
-                || !dictionary.isCrossProduct(prefixId)) {
-              continue;
-            }
-
-            for (String suffixed : crossProducts) {
-              if (checkAffixCondition(prefixId, suffixed)) {
-                result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
-              }
+          String prefix = misspelled.substring(0, prefixLength);
+          for (String suffixed : crossProducts) {
+            if (checkAffixCondition(prefixId, suffixed)) {
+              result.add(prefix + suffixed.substring(affixStripLength(prefixId)));
            }
          }
        });

    // pure prefixes
-    processFST(
-        dictionary.prefixes,
-        (key, ids) -> {
-          String prefix = toString(key);
-          if (misspelled.length() <= prefix.length() || !misspelled.startsWith(prefix)) return;
-
-          for (int i = 0; i < ids.length; i++) {
-            int prefixId = ids.ints[ids.offset + i];
-            if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
-              result.add(prefix + root.word.substring(affixStripLength(prefixId)));
-            }
+    processAffixes(
+        true,
+        misspelled,
+        (prefixLength, prefixId) -> {
+          if (hasCompatibleFlags(root, prefixId) && checkAffixCondition(prefixId, root.word)) {
+            String prefix = misspelled.substring(0, prefixLength);
+            result.add(prefix + root.word.substring(affixStripLength(prefixId)));
          }
        });

    return result.stream().limit(MAX_WORDS).collect(Collectors.toList());
  }

+  private void processAffixes(boolean prefixes, String word, AffixProcessor processor) {
+    FST<IntsRef> fst = prefixes ? dictionary.prefixes : dictionary.suffixes;
+    if (fst == null) return;
+
+    FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
+    FST.BytesReader reader = fst.getBytesReader();
+
+    IntsRef output = fst.outputs.getNoOutput();
+    int length = word.length();
+    int step = prefixes ? 1 : -1;
+    int limit = prefixes ? length : -1;
+    for (int i = prefixes ? 0 : length - 1; i != limit; i += step) {
+      output = Dictionary.nextArc(fst, arc, reader, output, word.charAt(i));
+      if (output == null) {
+        break;
+      }
+
+      if (arc.isFinal()) {
+        IntsRef affixIds = fst.outputs.add(output, arc.nextFinalOutput());
+        for (int j = 0; j < affixIds.length; j++) {
+          processor.processAffix(prefixes ? i + 1 : length - i, affixIds.ints[affixIds.offset + j]);
+        }
+      }
+    }
+  }
+
+  private interface AffixProcessor {
+    void processAffix(int affixLength, int affixId);
+  }
+
  private boolean hasCompatibleFlags(Root<?> root, int affixId) {
    if (!dictionary.hasFlag(root.entryId, dictionary.affixData(affixId, AFFIX_FLAG))) {
      return false;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -16,7 +16,6 @@
 */
 package org.apache.lucene.analysis.hunspell;

-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.stream.Collectors;
@ -259,12 +258,8 @@ final class Stemmer {
        }
      }
    }
-    try {
-      return stem(
-          word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
-    } catch (IOException bogus) {
-      throw new RuntimeException(bogus);
-    }
+    return stem(
+        word, offset, length, context, -1, Dictionary.FLAG_UNSET, -1, 0, true, false, processor);
  }

  /**
@ -373,22 +368,18 @@ final class Stemmer {
      int recursionDepth,
      boolean doPrefix,
      boolean previousWasPrefix,
-      RootProcessor processor)
-      throws IOException {
+      RootProcessor processor) {
    if (doPrefix && dictionary.prefixes != null) {
      FST<IntsRef> fst = dictionary.prefixes;
      FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
      fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-      IntsRef output = NO_OUTPUT;
+      IntsRef output = fst.outputs.getNoOutput();
      int limit = dictionary.fullStrip ? length + 1 : length;
      for (int i = 0; i < limit; i++) {
        if (i > 0) {
-          char ch = word[offset + i - 1];
-          if (fst.findTargetArc(ch, arc, arc, prefixReader) == null) {
+          output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
+          if (output == null) {
            break;
-          } else if (arc.output() != NO_OUTPUT) {
-            output = fst.outputs.add(output, arc.output());
          }
        }
        if (!arc.isFinal()) {
@ -431,16 +422,13 @@ final class Stemmer {
      FST<IntsRef> fst = dictionary.suffixes;
      FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
      fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
-      IntsRef output = NO_OUTPUT;
+      IntsRef output = fst.outputs.getNoOutput();
      int limit = dictionary.fullStrip ? 0 : 1;
      for (int i = length; i >= limit; i--) {
        if (i < length) {
-          char ch = word[offset + i];
-          if (fst.findTargetArc(ch, arc, arc, suffixReader) == null) {
+          output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
+          if (output == null) {
            break;
-          } else if (arc.output() != NO_OUTPUT) {
-            output = fst.outputs.add(output, arc.output());
          }
        }
        if (!arc.isFinal()) {
@ -610,8 +598,7 @@ final class Stemmer {
      int prefixId,
      int recursionDepth,
      boolean prefix,
-      RootProcessor processor)
-      throws IOException {
+      RootProcessor processor) {
    char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);

    boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix, prefixId);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@ -82,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {

  @Test
  public void fr_suggest() throws Exception {
-    checkSuggestionPerformance("fr", 100);
+    checkSuggestionPerformance("fr", 120);
  }

  private Dictionary loadDictionary(String code) throws IOException, ParseException {