mirror of https://github.com/apache/lucene.git
hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in) (#11893)
parent c66a559050
commit f7417d5961
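For context, the opt-in API this commit introduces can be exercised roughly as follows. This is a sketch, not code from the commit: the demo class and the en.aff/en.dic paths are placeholders, and error handling is minimal.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Suggester;
import org.apache.lucene.store.ByteBuffersDirectory;

public class SuggesterCacheDemo {
  public static void main(String[] args) throws Exception {
    // "en.aff"/"en.dic" are placeholder paths for a Hunspell dictionary pair.
    try (InputStream aff = Files.newInputStream(Path.of("en.aff"));
        InputStream dic = Files.newInputStream(Path.of("en.dic"))) {
      Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "dictionary", aff, dic);

      // Default behavior: the "enumeration" phase iterates the compressed word storage.
      Suggester plain = new Suggester(dictionary);

      // Opt-in: cache suggestible entries in flat arrays, trading memory for speed.
      Suggester cached = new Suggester(dictionary).withSuggestibleEntryCache();

      System.out.println(plain.suggestNoTimeout("appl", () -> {}));
      System.out.println(cached.suggestNoTimeout("appl", () -> {}));
    }
  }
}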
@@ -63,7 +63,7 @@ Improvements

Optimizations
---------------------

* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
* GITHUB#11857, GITHUB#11859, GITHUB#11893: Hunspell: improved suggestion performance

Bug Fixes
---------------------
@@ -28,7 +28,11 @@ import java.util.Objects;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.function.BiConsumer;
import java.util.function.IntPredicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
@@ -43,10 +47,12 @@ class GeneratingSuggester {
private static final int MAX_ROOT_LENGTH_DIFF = 4;
private final Dictionary dictionary;
private final Hunspell speller;
private final SuggestibleEntryCache entryCache;

GeneratingSuggester(Hunspell speller) {
GeneratingSuggester(Hunspell speller, SuggestibleEntryCache entryCache) {
this.dictionary = speller.dictionary;
this.speller = speller;
this.entryCache = entryCache;
}

List<String> suggest(String word, WordCase originalCase, Set<Suggestion> prevSuggestions) {
@@ -60,7 +66,11 @@ class GeneratingSuggester {
String word, WordCase originalCase) {
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
EntryFilter filter = new EntryFilter(dictionary);

char[] excludeFlags = dictionary.allNonSuggestibleFlags();
FlagEnumerator.Lookup flagLookup = dictionary.flagLookup;
IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);

boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
TrigramAutomaton automaton =
new TrigramAutomaton(word) {
@@ -70,10 +80,10 @@ class GeneratingSuggester {
}
};

dictionary.words.processSuggestibleWords(
processSuggestibleWords(
Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
word.length() + MAX_ROOT_LENGTH_DIFF,
(rootChars, forms) -> {
(rootChars, formSupplier) -> {
if (ignoreTitleCaseRoots
&& Character.isUpperCase(rootChars.charAt(0))
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
@@ -87,44 +97,34 @@ class GeneratingSuggester {

sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);

if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
boolean overflow = roots.size() == MAX_ROOTS;
if (overflow && sc <= roots.peek().score) {
return;
}

speller.checkCanceled.run();

String root = rootChars.toString();
int suitable = filter.findSuitableFormIndex(forms, 0);
do {
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
} while (suitable > 0);
while (roots.size() > MAX_ROOTS) {
IntsRef forms = formSupplier.get();
for (int i = 0; i < forms.length; i++) {
if (isSuggestible.test(forms.ints[forms.offset + i])) {
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
if (overflow) {
roots.poll();
}
}
}
});

return roots.stream().sorted().collect(Collectors.toList());
}

private static class EntryFilter {
private final int formStep;
private final FlagEnumerator.Lookup flagLookup;
private final char[] excludeFlags;

EntryFilter(Dictionary dic) {
formStep = dic.formStep();
flagLookup = dic.flagLookup;
excludeFlags = dic.allNonSuggestibleFlags();
}

int findSuitableFormIndex(IntsRef forms, int start) {
for (int i = start; i < forms.length; i += formStep) {
if (!flagLookup.hasAnyFlag(forms.ints[forms.offset + i], excludeFlags)) {
return i;
}
}
return -1;
private void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
if (entryCache != null) {
entryCache.processSuggestibleWords(minLength, maxLength, processor);
} else {
dictionary.words.processSuggestibleWords(minLength, maxLength, processor);
}
}
@@ -26,16 +26,8 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
@@ -56,7 +48,6 @@ import org.apache.lucene.util.IntsRef;
*/
public class Hunspell {
static final long SUGGEST_TIME_LIMIT = 250;

final Dictionary dictionary;
final Stemmer stemmer;
private final TimeoutPolicy policy;
@@ -75,7 +66,7 @@ public class Hunspell {
this.dictionary = dictionary;
this.policy = policy;
this.checkCanceled = checkCanceled;
stemmer = new Stemmer(dictionary);
this.stemmer = new Stemmer(dictionary);
}

/**
@@ -568,6 +559,7 @@ public class Hunspell {
* @return suggestions for the given misspelled word
* @throws SuggestionTimeoutException if the computation takes too long and {@link
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
* @see Suggester for finer-grained APIs and performance optimizations
*/
public List<String> suggest(String word) throws SuggestionTimeoutException {
return suggest(word, SUGGEST_TIME_LIMIT);
@@ -579,140 +571,19 @@
* TimeoutPolicy}'s effects (exception or partial result) may kick in
* @throws SuggestionTimeoutException if the computation takes too long and {@link
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
* @see Suggester for finer-grained APIs and performance optimizations
*/
public List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList();
Suggester suggester = new Suggester(dictionary);
if (policy == NO_TIMEOUT) return suggester.suggestNoTimeout(word, checkCanceled);

if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}

WordCase wordCase = WordCase.caseOf(word);
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
String title = dictionary.toTitleCase(word);
if (spell(title)) {
return Collections.singletonList(title);
}
}

LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
Runnable checkCanceled =
policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, suggestions, timeLimitMs);
try {
doSuggest(word, wordCase, suggestions, checkCanceled);
return suggester.suggestWithTimeout(word, timeLimitMs, checkCanceled);
} catch (SuggestionTimeoutException e) {
if (policy != RETURN_PARTIAL_RESULT) {
if (policy == RETURN_PARTIAL_RESULT) {
return e.getPartialResult();
}
throw e;
}
}

return postprocess(suggestions);
}

private void doSuggest(
String word,
WordCase wordCase,
LinkedHashSet<Suggestion> suggestions,
Runnable checkCanceled) {
Hunspell suggestionSpeller =
new Hunspell(dictionary, policy, checkCanceled) {
// Cache for expensive "findStem" requests issued when trying to split a compound word.
// The suggestion algorithm issues many of them, often with the same text.
// The cache can be large, but will be GC-ed after the "suggest" call.
final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();

@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}

@Override
Root<CharsRef> findStem(
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
if (context == COMPOUND_BEGIN && originalCase == null) {
return compoundCache
.computeIfAbsent(
new String(chars, offset, length),
__ ->
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
.orElse(null);
}
return super.findStem(chars, offset, length, originalCase, context);
}
};
boolean hasGoodSuggestions =
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();

if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
List<String> generated =
new GeneratingSuggester(suggestionSpeller)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
for (String raw : generated) {
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
}
}

if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
for (String raw : modifyChunksBetweenDashes(word)) {
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
}
}
}

private Runnable checkTimeLimit(String word, Set<Suggestion> suggestions, long timeLimitMs) {
return new Runnable() {
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
int invocationCounter = 100;

@Override
public void run() {
checkCanceled.run();
if (--invocationCounter <= 0) {
if (System.nanoTime() - deadline > 0) {
stop();
}
invocationCounter = 100;
}
}

private void stop() {
List<String> partialResult =
policy == RETURN_PARTIAL_RESULT ? null : postprocess(suggestions);
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
throw new SuggestionTimeoutException(message, partialResult);
}
};
}

private List<String> postprocess(Collection<Suggestion> suggestions) {
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
}

private List<String> modifyChunksBetweenDashes(String word) {
List<String> result = new ArrayList<>();
int chunkStart = 0;
while (chunkStart < word.length()) {
int chunkEnd = word.indexOf('-', chunkStart);
if (chunkEnd < 0) {
chunkEnd = word.length();
}

if (chunkEnd > chunkStart) {
String chunk = word.substring(chunkStart, chunkEnd);
if (!spell(chunk)) {
for (String chunkSug : suggest(chunk)) {
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
if (spell(replaced)) {
result.add(replaced);
}
}
}
}

chunkStart = chunkEnd + 1;
}
return result;
}
}
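The Hunspell entry points above remain the convenience layer over the new Suggester; a sketch of how they might be used with a timeout policy (the helper class, the sample word, and the 250 ms budget are illustrative, not part of the commit):

import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.analysis.hunspell.TimeoutPolicy;

class HunspellFacadeDemo {
  static void checkAndSuggest(Dictionary dictionary, String word) {
    // The timeout policy and the cancellation callback are fixed at construction time.
    Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.RETURN_PARTIAL_RESULT, () -> {});
    if (!speller.spell(word)) {
      // With RETURN_PARTIAL_RESULT, hitting the time limit yields whatever
      // suggestions were computed so far instead of an exception.
      List<String> suggestions = speller.suggest(word, 250);
      System.out.println(word + " -> " + suggestions);
    }
  }
}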
@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;

import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.CharsRef;

/**
* A generator for misspelled word corrections based on Hunspell flags. The suggestions are searched
* for in two main ways:
*
* <ol>
* <li>Modification: trying to insert/remove/delete/swap parts of the word to get something
* acceptable. The performance of this part depends heavily on the contents of TRY, MAP, REP,
* KEY directives in the .aff file.
* <li>Enumeration: if the modification hasn't produced "good enough" suggestions, the whole
* dictionary is scanned and simple affixes are added onto the entries to check if that
* produces anything similar to the given misspelled word. This depends on the dictionary size
* and the affix count, and it can take a noticeable amount of time. To speed this up, {@link
* #withSuggestibleEntryCache()} can be used.
* </ol>
*/
public class Suggester {
private final Dictionary dictionary;
private final SuggestibleEntryCache suggestibleCache;

public Suggester(Dictionary dictionary) {
this(dictionary, null);
}

private Suggester(Dictionary dictionary, SuggestibleEntryCache suggestibleCache) {
this.dictionary = dictionary;
this.suggestibleCache = suggestibleCache;
}

/**
* Returns a copy of this suggester instance with better "Enumeration" phase performance (see
* {@link Suggester} documentation), but using more memory. With this option, the dictionary
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
*/
public Suggester withSuggestibleEntryCache() {
return new Suggester(dictionary, SuggestibleEntryCache.buildCache(dictionary.words));
}

/**
* Compute suggestions for the given misspelled word
*
* @param word the misspelled word to calculate suggestions for
* @param checkCanceled an object that's periodically called, allowing the suggestion generation
* to be interrupted by throwing an exception
*/
public List<String> suggestNoTimeout(String word, Runnable checkCanceled) {
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
return suggest(word, suggestions, handleCustomTimeoutException(checkCanceled, suggestions));
}

private Runnable handleCustomTimeoutException(
Runnable checkCanceled, LinkedHashSet<Suggestion> suggestions) {
return () -> {
try {
checkCanceled.run();
} catch (SuggestionTimeoutException e) {
if (e.getPartialResult() != null) {
throw e;
}

throw new SuggestionTimeoutException(e.getMessage(), postprocess(suggestions));
}
};
}

/**
* @param word the misspelled word to calculate suggestions for
* @param timeLimitMs the duration limit in milliseconds after which the computation is
* interrupted by an exception
* @param checkCanceled an object that's periodically called, allowing the suggestion generation
* to be interrupted by throwing an exception
* @throws SuggestionTimeoutException if the computation takes too long. Use {@link
* SuggestionTimeoutException#getPartialResult()} to get the suggestions computed up to that
* point
*/
public List<String> suggestWithTimeout(String word, long timeLimitMs, Runnable checkCanceled)
throws SuggestionTimeoutException {
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
Runnable checkTime = checkTimeLimit(word, suggestions, timeLimitMs, checkCanceled);
return suggest(word, suggestions, handleCustomTimeoutException(checkTime, suggestions));
}

private List<String> suggest(
String word, LinkedHashSet<Suggestion> suggestions, Runnable checkCanceled)
throws SuggestionTimeoutException {
checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList();

if (dictionary.needsInputCleaning(word)) {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}

Hunspell suggestionSpeller =
new Hunspell(dictionary, NO_TIMEOUT, checkCanceled) {
// Cache for expensive "findStem" requests issued when trying to split a compound word.
// The suggestion algorithm issues many of them, often with the same text.
// The cache can be large, but will be GC-ed after the "suggest" call.
final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();

@Override
boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard);
}

@Override
Root<CharsRef> findStem(
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
if (context == COMPOUND_BEGIN && originalCase == null) {
return compoundCache
.computeIfAbsent(
new String(chars, offset, length),
__ ->
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
.orElse(null);
}
return super.findStem(chars, offset, length, originalCase, context);
}
};

WordCase wordCase = WordCase.caseOf(word);
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
String title = dictionary.toTitleCase(word);
if (suggestionSpeller.spell(title)) {
return Collections.singletonList(title);
}
}

boolean hasGoodSuggestions =
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();

if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
List<String> generated =
new GeneratingSuggester(suggestionSpeller, suggestibleCache)
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
for (String raw : generated) {
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
}
}

if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
for (String raw : modifyChunksBetweenDashes(word, suggestionSpeller, checkCanceled)) {
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
}
}
return postprocess(suggestions);
}

private Runnable checkTimeLimit(
String word, Set<Suggestion> suggestions, long timeLimitMs, Runnable checkCanceled) {
return new Runnable() {
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
int invocationCounter = 100;

@Override
public void run() {
checkCanceled.run();
if (--invocationCounter <= 0) {
if (System.nanoTime() - deadline > 0) {
stop();
}
invocationCounter = 100;
}
}

private void stop() {
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
throw new SuggestionTimeoutException(message, postprocess(suggestions));
}
};
}

private List<String> postprocess(Collection<Suggestion> suggestions) {
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
}

private List<String> modifyChunksBetweenDashes(
String word, Hunspell speller, Runnable checkCanceled) {
List<String> result = new ArrayList<>();
int chunkStart = 0;
while (chunkStart < word.length()) {
int chunkEnd = word.indexOf('-', chunkStart);
if (chunkEnd < 0) {
chunkEnd = word.length();
}

if (chunkEnd > chunkStart) {
String chunk = word.substring(chunkStart, chunkEnd);
if (!speller.spell(chunk)) {
for (String chunkSug : suggestNoTimeout(chunk, checkCanceled)) {
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
if (speller.spell(replaced)) {
result.add(replaced);
}
}
}
}

chunkStart = chunkEnd + 1;
}
return result;
}
}
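A sketch of how the partial-result contract documented above might be consumed; the helper class, the 250 ms budget (mirroring Hunspell.SUGGEST_TIME_LIMIT), and the no-op cancellation callback are illustrative:

import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Suggester;
import org.apache.lucene.analysis.hunspell.SuggestionTimeoutException;

class SuggesterTimeoutDemo {
  static List<String> bestEffort(Dictionary dictionary, String word) {
    Suggester suggester = new Suggester(dictionary).withSuggestibleEntryCache();
    try {
      return suggester.suggestWithTimeout(word, 250, () -> {});
    } catch (SuggestionTimeoutException e) {
      // The exception carries whatever suggestions were computed before the deadline.
      return e.getPartialResult();
    }
  }
}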
@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;

import java.util.function.BiConsumer;
import java.util.function.Supplier;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

/**
* A cache allowing for CPU-cache-friendlier iteration over {@link WordStorage} entries that can be
* used for suggestions. The words and the form data are stored in plain contiguous arrays with no
* compression.
*/
class SuggestibleEntryCache {
private final short[] lengths;
private final char[] roots;
private final int[] formData;

private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
this.lengths = lengths;
this.roots = roots;
this.formData = formData;
}

static SuggestibleEntryCache buildCache(WordStorage storage) {
var consumer =
new BiConsumer<CharsRef, Supplier<IntsRef>>() {
short[] lengths = new short[10];
final StringBuilder roots = new StringBuilder();
int[] formData = new int[10];
int lenOffset = 0;
int formDataOffset = 0;

@Override
public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
if (root.length > Short.MAX_VALUE) {
throw new UnsupportedOperationException(
"Too long dictionary entry, please report this to dev@lucene.apache.org");
}

IntsRef forms = formSupplier.get();

lengths = ArrayUtil.grow(lengths, lenOffset + 2);
lengths[lenOffset] = (short) root.length;
lengths[lenOffset + 1] = (short) forms.length;
lenOffset += 2;

roots.append(root.chars, root.offset, root.length);

formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
formDataOffset += forms.length;
}
};

storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);

return new SuggestibleEntryCache(
ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
consumer.roots.toString().toCharArray(),
ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
}

void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
CharsRef chars = new CharsRef(roots, 0, 0);
IntsRef forms = new IntsRef(formData, 0, 0);
Supplier<IntsRef> formSupplier = () -> forms;
int rootOffset = 0;
int formDataOffset = 0;
for (int i = 0; i < lengths.length; i += 2) {
int rootLength = lengths[i];
short formDataLength = lengths[i + 1];
if (rootLength >= minLength && rootLength <= maxLength) {
chars.offset = rootOffset;
chars.length = rootLength;
forms.offset = formDataOffset;
forms.length = formDataLength;
processor.accept(chars, formSupplier);
}
rootOffset += rootLength;
formDataOffset += formDataLength;
}
}
}
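To make the flat layout concrete: a hypothetical cache built from just two entries, "cat" with two form ids and "dogs" with one, would hold the following (f1..f3 stand in for real form ids):

lengths  = [3, 2, 4, 1]    // interleaved pairs of (root length, form-data length)
roots    = "catdogs"       // all roots concatenated, no separators
formData = [f1, f2, f3]    // all form ids concatenated

During iteration the running offsets into roots and formData are advanced by the stored lengths, so a length-filtered scan walks three contiguous arrays in order instead of chasing pointers through a compressed trie.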
@@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataOutput;

@@ -54,7 +55,8 @@ class WordStorage {
private static final int COLLISION_MASK = 0x40;
private static final int SUGGESTIBLE_MASK = 0x20;
private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;

private final int maxEntryLength;
private final boolean hasCustomMorphData;
/**
* A map from word's hash (modulo array's length) into an int containing:
*

@@ -89,7 +91,10 @@ class WordStorage {
*/
private final byte[] wordData;

private WordStorage(int[] hashTable, byte[] wordData) {
private WordStorage(
int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
this.maxEntryLength = maxEntryLength;
this.hasCustomMorphData = hasCustomMorphData;
this.hashTable = hashTable;
this.wordData = wordData;
}

@@ -153,11 +158,13 @@ class WordStorage {
* can be modified in any way, but may not be saved for later by the processor
*/
void processSuggestibleWords(
int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
assert minLength <= maxLength;
maxLength = Math.min(maxEntryLength, maxLength);

CharsRef chars = new CharsRef(maxLength);
IntsRef forms = new IntsRef();
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
var formSupplier = new LazyFormReader(in);
for (int entryCode : hashTable) {
int pos = entryCode & OFFSET_MASK;
int mask = entryCode >>> OFFSET_BITS;

@@ -179,11 +186,7 @@ class WordStorage {
}

if (mightMatch) {
int dataLength = in.readVInt();
if (forms.ints.length < dataLength) {
forms.ints = new int[dataLength];
}
readForms(forms, in, dataLength);
formSupplier.dataPos = in.getPosition();
while (prevPos != 0 && wordStart > 0) {
in.setPosition(prevPos);
chars.chars[--wordStart] = (char) in.readVInt();

@@ -193,7 +196,7 @@ class WordStorage {
if (prevPos == 0) {
chars.offset = wordStart;
chars.length = maxLength - wordStart;
processor.accept(chars, forms);
processor.accept(chars, formSupplier);
}
}

@@ -257,6 +260,7 @@ class WordStorage {
private final ByteArrayDataOutput dataWriter;
private int commonPrefixLength, commonPrefixPos;
private int actualWords;
private int maxEntryLength;

/**
* @param wordCount an approximate number of the words in the resulting dictionary, used to

@@ -297,6 +301,8 @@ class WordStorage {
* {@link String#compareTo} rules.
*/
void add(String entry, char[] flags, int morphDataID) throws IOException {
maxEntryLength = Math.max(maxEntryLength, entry.length());

if (!entry.equals(currentEntry)) {
if (currentEntry != null) {
if (entry.compareTo(currentEntry) < 0) {

@@ -411,8 +417,36 @@ class WordStorage {
WordStorage build() throws IOException {
assert !group.isEmpty() : "build() should be only called once";
flushGroup();
return new WordStorage(
hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));
byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
return new WordStorage(maxEntryLength, hasCustomMorphData, hashTable, trimmedData);
}
}

private class LazyFormReader implements Supplier<IntsRef> {
int dataPos;
private final ByteArrayDataInput in;
private final IntsRef forms;

LazyFormReader(ByteArrayDataInput in) {
this.in = in;
forms = new IntsRef();
}

@Override
public IntsRef get() {
in.setPosition(dataPos);
int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1);
if (forms.ints.length < entryCount) {
forms.ints = new int[entryCount];
}
for (int i = 0; i < entryCount; i++) {
forms.ints[i] = in.readVInt();
if (hasCustomMorphData) {
in.readVInt();
}
}
forms.length = entryCount;
return forms;
}
}
}
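The Supplier indirection introduced above lets a consumer inspect the (cheap) word before paying for form decoding. A hypothetical consumer, assuming it sits in the org.apache.lucene.analysis.hunspell package where the package-private WordStorage is visible:

// A sketch, not part of the commit.
package org.apache.lucene.analysis.hunspell;

import org.apache.lucene.util.IntsRef;

class LongWordDump {
  static void dump(WordStorage storage) {
    storage.processSuggestibleWords(
        5,
        Integer.MAX_VALUE,
        (word, formSupplier) -> {
          // 'word' is cheap to inspect; the supplier decodes form ids from the
          // compressed storage only when get() is actually called.
          if (word.length >= 10) {
            IntsRef forms = formSupplier.get();
            System.out.println(word + ": " + forms.length + " form id(s)");
          }
          // Note: both refs are reused across invocations and must not be kept.
        });
  }
}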
@@ -149,6 +149,7 @@ public class TestAllDictionaries extends LuceneTestCase {
}

public void testDictionariesLoadSuccessfully() throws Exception {
AtomicLong memoryWithCache = new AtomicLong();
AtomicLong totalMemory = new AtomicLong();
AtomicLong totalWords = new AtomicLong();
int threads = Runtime.getRuntime().availableProcessors();

@@ -159,7 +160,16 @@ public class TestAllDictionaries extends LuceneTestCase {
(Path aff) -> {
try {
Dictionary dic = loadDictionary(aff);
new Hunspell(dic).spell("aaaa");
Suggester suggester = new Suggester(dic).withSuggestibleEntryCache();
try {
suggester.suggestWithTimeout("aaaaaaaaaa", Hunspell.SUGGEST_TIME_LIMIT, () -> {});
} catch (
@SuppressWarnings("unused")
SuggestionTimeoutException e) {
}
totalMemory.addAndGet(RamUsageTester.ramUsed(dic));
memoryWithCache.addAndGet(RamUsageTester.ramUsed(suggester));
totalWords.addAndGet(RamUsageTester.ramUsed(dic.words));
System.out.println(aff + "\t" + memoryUsageSummary(dic));
} catch (Throwable e) {

@@ -195,6 +205,9 @@ public class TestAllDictionaries extends LuceneTestCase {
System.out.println("Total memory: " + RamUsageEstimator.humanReadableUnits(totalMemory.get()));
System.out.println(
"Total memory for word storage: " + RamUsageEstimator.humanReadableUnits(totalWords.get()));
System.out.println(
"Additional memory if withSuggestibleEntryCache is enabled: "
+ RamUsageEstimator.humanReadableUnits(memoryWithCache.get() - totalMemory.get()));
}

private static String memoryUsageSummary(Dictionary dic) {
@@ -110,6 +110,12 @@ public class TestDictionary extends LuceneTestCase {
Set<String> processed = new HashSet<>();
dictionary.words.processSuggestibleWords(
minLength, maxLength, (word, __) -> processed.add(word.toString()));

Set<String> cached = new HashSet<>();
SuggestibleEntryCache.buildCache(dictionary.words)
.processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
assertEquals(processed, cached);

return processed;
}
@@ -29,6 +29,7 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.tests.util.LuceneTestCase;

@@ -54,6 +55,33 @@ public class TestHunspell extends LuceneTestCase {
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
}

public void testCustomCheckCanceledGivesPartialResult() throws Exception {
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");

List<String> expected = List.of("apach");
assertEquals(expected, new Hunspell(dictionary, NO_TIMEOUT, () -> {}).suggest("apac"));

AtomicInteger counter = new AtomicInteger();
String msg = "msg";
Runnable checkCanceled =
() -> {
if (counter.incrementAndGet() > 400) {
throw new SuggestionTimeoutException(msg, null);
}
};

Hunspell hunspell = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, checkCanceled);
assertEquals(expected, hunspell.suggest("apac"));

counter.set(0);
var e =
assertThrows(
SuggestionTimeoutException.class,
() -> new Suggester(dictionary).suggestNoTimeout("apac", checkCanceled));
assertEquals(expected, e.getPartialResult());
assertEquals("msg", e.getMessage());
}

public void testSuggestionTimeLimit() throws IOException, ParseException {
int timeLimitMs = 10;
@@ -86,7 +86,7 @@ public class TestPerformance extends LuceneTestCase {

@Test
public void de_suggest() throws Exception {
checkSuggestionPerformance("de", 100);
checkSuggestionPerformance("de", 150);
}

@Test

@@ -163,33 +163,33 @@ public class TestPerformance extends LuceneTestCase {

private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
Dictionary dictionary = loadDictionary(code);
Suggester suggester = new Suggester(dictionary).withSuggestibleEntryCache();
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
List<String> words =
loadWords(code, wordCount, dictionary).stream()
.distinct()
.filter(w -> hasQuickSuggestions(speller, w))
.filter(w -> hasQuickSuggestions(speller, suggester, w))
.collect(Collectors.toList());
System.out.println("Checking " + words.size() + " misspelled words");

Hunspell fullSpeller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
measure(
"Suggestions for " + code,
words.size(),
blackHole -> {
for (String word : words) {
blackHole.accept(fullSpeller.suggest(word));
blackHole.accept(suggester.suggestNoTimeout(word, () -> {}));
}
});
System.out.println();
}

private boolean hasQuickSuggestions(Hunspell speller, String word) {
private boolean hasQuickSuggestions(Hunspell speller, Suggester suggester, String word) {
if (speller.spell(word)) {
return false;
}

try {
speller.suggest(word);
suggester.suggestWithTimeout(word, Hunspell.SUGGEST_TIME_LIMIT, () -> {});
} catch (
@SuppressWarnings("unused")
SuggestionTimeoutException e) {
@@ -249,10 +249,14 @@ public class TestSpellChecking extends LuceneTestCase {
InputStream dictStream = Files.newInputStream(dicFile);

Hunspell speller;
Suggester defaultSuggester;
Suggester cachingSuggester;
try {
Dictionary dictionary =
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
defaultSuggester = new Suggester(dictionary);
cachingSuggester = new Suggester(dictionary).withSuggestibleEntryCache();
} finally {
IOUtils.closeWhileHandlingException(affixStream);
IOUtils.closeWhileHandlingException(dictStream);

@@ -273,12 +277,8 @@ public class TestSpellChecking extends LuceneTestCase {
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word.trim()));
}
if (Files.exists(sug)) {
String suggestions =
wrongWords.stream()
.map(s -> String.join(", ", speller.suggest(s)))
.filter(s -> !s.isEmpty())
.collect(Collectors.joining("\n"));
assertEquals(Files.readString(sug).trim(), suggestions);
assertEquals(Files.readString(sug).trim(), suggest(defaultSuggester, wrongWords));
assertEquals(Files.readString(sug).trim(), suggest(cachingSuggester, wrongWords));
}
} else {
assertFalse(".sug file without .wrong file!", Files.exists(sug));

@@ -290,6 +290,13 @@ public class TestSpellChecking extends LuceneTestCase {
}
}

private static String suggest(Suggester suggester, List<String> wrongWords) {
return wrongWords.stream()
.map(s -> String.join(", ", suggester.suggestNoTimeout(s, () -> {})))
.filter(s -> !s.isEmpty())
.collect(Collectors.joining("\n"));
}

private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
Set<String> everythingGenerated = new HashSet<>();
boolean generatedEverything = true;