hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in) (#11893)

hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in)
2022-11-09 08:20:50 +01:00 · 2022-11-09 08:20:50 +01:00 · f7417d5961
parent c66a559050
commit f7417d5961
11 changed files with 490 additions and 193 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -63,7 +63,7 @@ Improvements
 Optimizations
 ---------------------
-* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
+* GITHUB#11857, GITHUB#11859, GITHUB#11893: Hunspell: improved suggestion performance
 Bug Fixes
 ---------------------
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -28,7 +28,11 @@ import java.util.Objects;
 import java.util.PriorityQueue;
 import java.util.Set;
 import java.util.TreeSet;
 import java.util.function.BiConsumer;
 import java.util.function.IntPredicate;
 import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.fst.FST;
@ -43,10 +47,12 @@ class GeneratingSuggester {
  private static final int MAX_ROOT_LENGTH_DIFF = 4;
  private final Dictionary dictionary;
  private final Hunspell speller;
  private final SuggestibleEntryCache entryCache;
-  GeneratingSuggester(Hunspell speller) {
+  GeneratingSuggester(Hunspell speller, SuggestibleEntryCache entryCache) {
    this.dictionary = speller.dictionary;
    this.speller = speller;
    this.entryCache = entryCache;
  }
  List<String> suggest(String word, WordCase originalCase, Set<Suggestion> prevSuggestions) {
@ -60,7 +66,11 @@ class GeneratingSuggester {
      String word, WordCase originalCase) {
    Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
    PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
-    EntryFilter filter = new EntryFilter(dictionary);
+
    char[] excludeFlags = dictionary.allNonSuggestibleFlags();
    FlagEnumerator.Lookup flagLookup = dictionary.flagLookup;
    IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);
    boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
    TrigramAutomaton automaton =
        new TrigramAutomaton(word) {
@ -70,10 +80,10 @@ class GeneratingSuggester {
          }
        };
-    dictionary.words.processSuggestibleWords(
+    processSuggestibleWords(
        Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
        word.length() + MAX_ROOT_LENGTH_DIFF,
-        (rootChars, forms) -> {
+        (rootChars, formSupplier) -> {
          if (ignoreTitleCaseRoots
              && Character.isUpperCase(rootChars.charAt(0))
              && WordCase.caseOf(rootChars) == WordCase.TITLE) {
@ -87,44 +97,34 @@ class GeneratingSuggester {
          sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
-          if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
+          boolean overflow = roots.size() == MAX_ROOTS;
          if (overflow && sc <= roots.peek().score) {
            return;
          }
          speller.checkCanceled.run();
          String root = rootChars.toString();
-          int suitable = filter.findSuitableFormIndex(forms, 0);
+          IntsRef forms = formSupplier.get();
-          do {
+          for (int i = 0; i < forms.length; i++) {
-            roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
+            if (isSuggestible.test(forms.ints[forms.offset + i])) {
-            suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
+              roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
-          } while (suitable > 0);
+              if (overflow) {
-          while (roots.size() > MAX_ROOTS) {
+                roots.poll();
-            roots.poll();
+              }
            }
          }
        });
    return roots.stream().sorted().collect(Collectors.toList());
  }
-  private static class EntryFilter {
+  private void processSuggestibleWords(
-    private final int formStep;
+      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
-    private final FlagEnumerator.Lookup flagLookup;
+    if (entryCache != null) {
-    private final char[] excludeFlags;
+      entryCache.processSuggestibleWords(minLength, maxLength, processor);
-
+    } else {
-    EntryFilter(Dictionary dic) {
+      dictionary.words.processSuggestibleWords(minLength, maxLength, processor);
      formStep = dic.formStep();
      flagLookup = dic.flagLookup;
      excludeFlags = dic.allNonSuggestibleFlags();
    }
    int findSuitableFormIndex(IntsRef forms, int start) {
      for (int i = start; i < forms.length; i += formStep) {
        if (!flagLookup.hasAnyFlag(forms.ints[forms.offset + i], excludeFlags)) {
          return i;
        }
      }
      return -1;
    }
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@ -26,16 +26,8 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@ -56,7 +48,6 @@ import org.apache.lucene.util.IntsRef;
 */
 public class Hunspell {
  static final long SUGGEST_TIME_LIMIT = 250;
  final Dictionary dictionary;
  final Stemmer stemmer;
  private final TimeoutPolicy policy;
@ -75,7 +66,7 @@ public class Hunspell {
    this.dictionary = dictionary;
    this.policy = policy;
    this.checkCanceled = checkCanceled;
-    stemmer = new Stemmer(dictionary);
+    this.stemmer = new Stemmer(dictionary);
  }
  /**
@ -568,6 +559,7 @@ public class Hunspell {
   * @return suggestions for the given misspelled word
   * @throws SuggestionTimeoutException if the computation takes too long and {@link
   *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
   * @see Suggester for finer-grained APIs and performance optimizations
   */
  public List<String> suggest(String word) throws SuggestionTimeoutException {
    return suggest(word, SUGGEST_TIME_LIMIT);
@ -579,140 +571,19 @@ public class Hunspell {
   *     TimeoutPolicy}'s effects (exception or partial result) may kick in
   * @throws SuggestionTimeoutException if the computation takes too long and {@link
   *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
   * @see Suggester for finer-grained APIs and performance optimizations
   */
  public List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
-    checkCanceled.run();
+    Suggester suggester = new Suggester(dictionary);
-    if (word.length() >= 100) return Collections.emptyList();
+    if (policy == NO_TIMEOUT) return suggester.suggestNoTimeout(word, checkCanceled);
    if (dictionary.needsInputCleaning(word)) {
      word = dictionary.cleanInput(word, new StringBuilder()).toString();
    }
    WordCase wordCase = WordCase.caseOf(word);
    if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
      String title = dictionary.toTitleCase(word);
      if (spell(title)) {
        return Collections.singletonList(title);
      }
    }
    LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
    Runnable checkCanceled =
        policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, suggestions, timeLimitMs);
    try {
-      doSuggest(word, wordCase, suggestions, checkCanceled);
+      return suggester.suggestWithTimeout(word, timeLimitMs, checkCanceled);
    } catch (SuggestionTimeoutException e) {
-      if (policy != RETURN_PARTIAL_RESULT) {
+      if (policy == RETURN_PARTIAL_RESULT) {
-        throw e;
+        return e.getPartialResult();
      }
      throw e;
    }
    return postprocess(suggestions);
  }
  private void doSuggest(
      String word,
      WordCase wordCase,
      LinkedHashSet<Suggestion> suggestions,
      Runnable checkCanceled) {
    Hunspell suggestionSpeller =
        new Hunspell(dictionary, policy, checkCanceled) {
          // Cache for expensive "findStem" requests issued when trying to split a compound word.
          // The suggestion algorithm issues many of them, often with the same text.
          // The cache can be large, but will be GC-ed after the "suggest" call.
          final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
          @Override
          boolean acceptsStem(int formID) {
            return !dictionary.hasFlag(formID, dictionary.noSuggest)
                && !dictionary.hasFlag(formID, dictionary.subStandard);
          }
          @Override
          Root<CharsRef> findStem(
              char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
            if (context == COMPOUND_BEGIN && originalCase == null) {
              return compoundCache
                  .computeIfAbsent(
                      new String(chars, offset, length),
                      __ ->
                          Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
                  .orElse(null);
            }
            return super.findStem(chars, offset, length, originalCase, context);
          }
        };
    boolean hasGoodSuggestions =
        new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
    if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
      List<String> generated =
          new GeneratingSuggester(suggestionSpeller)
              .suggest(dictionary.toLowerCase(word), wordCase, suggestions);
      for (String raw : generated) {
        suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
      }
    }
    if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
      for (String raw : modifyChunksBetweenDashes(word)) {
        suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
      }
    }
  }
  private Runnable checkTimeLimit(String word, Set<Suggestion> suggestions, long timeLimitMs) {
    return new Runnable() {
      final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
      int invocationCounter = 100;
      @Override
      public void run() {
        checkCanceled.run();
        if (--invocationCounter <= 0) {
          if (System.nanoTime() - deadline > 0) {
            stop();
          }
          invocationCounter = 100;
        }
      }
      private void stop() {
        List<String> partialResult =
            policy == RETURN_PARTIAL_RESULT ? null : postprocess(suggestions);
        String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
        throw new SuggestionTimeoutException(message, partialResult);
      }
    };
  }
  private List<String> postprocess(Collection<Suggestion> suggestions) {
    return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
  }
  private List<String> modifyChunksBetweenDashes(String word) {
    List<String> result = new ArrayList<>();
    int chunkStart = 0;
    while (chunkStart < word.length()) {
      int chunkEnd = word.indexOf('-', chunkStart);
      if (chunkEnd < 0) {
        chunkEnd = word.length();
      }
      if (chunkEnd > chunkStart) {
        String chunk = word.substring(chunkStart, chunkEnd);
        if (!spell(chunk)) {
          for (String chunkSug : suggest(chunk)) {
            String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
            if (spell(replaced)) {
              result.add(replaced);
            }
          }
        }
      }
      chunkStart = chunkEnd + 1;
    }
    return result;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggester.java
@ -0,0 +1,237 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import org.apache.lucene.util.CharsRef;
 /**
 * Computes corrections for misspelled words, driven by the Hunspell flags of a {@link Dictionary}.
 * Candidates are looked for in two main ways:
 *
 * <ol>
 *   <li>Modification: trying to insert/remove/delete/swap parts of the word to get something
 *       acceptable. The performance of this part depends heavily on the contents of TRY, MAP, REP,
 *       KEY directives in the .aff file.
 *   <li>Enumeration: if the modification hasn't produced "good enough" suggestions, the whole
 *       dictionary is scanned and simple affixes are added onto the entries to check if that
 *       produces anything similar to the given misspelled word. This depends on the dictionary size
 *       and the affix count, and it can take a noticeable amount of time. To speed this up, {@link
 *       #withSuggestibleEntryCache()} can be used.
 * </ol>
 */
 public class Suggester {
  private final Dictionary dictionary;
  // null means "no cache": the enumeration phase then iterates the dictionary's own word storage
  private final SuggestibleEntryCache suggestibleCache;
  /**
   * Creates a suggester without a suggestible entry cache.
   *
   * @param dictionary the dictionary to take the suggestions from
   */
  public Suggester(Dictionary dictionary) {
    this(dictionary, null);
  }
  private Suggester(Dictionary dictionary, SuggestibleEntryCache suggestibleCache) {
    this.dictionary = dictionary;
    this.suggestibleCache = suggestibleCache;
  }
  /**
   * Returns a copy of this suggester instance with better "Enumeration" phase performance (see
   * {@link Suggester} documentation), but using more memory. With this option, the dictionary
   * entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
   *
   * @return a new suggester over the same dictionary, backed by a freshly built entry cache
   */
  public Suggester withSuggestibleEntryCache() {
    SuggestibleEntryCache cache = SuggestibleEntryCache.buildCache(dictionary.words);
    return new Suggester(dictionary, cache);
  }
  /**
   * Compute suggestions for the given misspelled word, without any time limit of its own.
   *
   * @param word the misspelled word to calculate suggestions for
   * @param checkCanceled an object that's periodically called, allowing to interrupt the suggestion
   *     generation by throwing an exception
   * @return the suggestions for the given word
   */
  public List<String> suggestNoTimeout(String word, Runnable checkCanceled) {
    LinkedHashSet<Suggestion> collected = new LinkedHashSet<>();
    Runnable guardedCheck = handleCustomTimeoutException(checkCanceled, collected);
    return suggest(word, collected, guardedCheck);
  }
  /**
   * Wraps the given callback so that any {@link SuggestionTimeoutException} it throws without a
   * partial result is replaced by one carrying the suggestions collected so far. Exceptions that
   * already have a partial result are rethrown untouched.
   */
  private Runnable handleCustomTimeoutException(
      Runnable checkCanceled, LinkedHashSet<Suggestion> suggestions) {
    return () -> {
      try {
        checkCanceled.run();
      } catch (SuggestionTimeoutException timeout) {
        if (timeout.getPartialResult() == null) {
          throw new SuggestionTimeoutException(timeout.getMessage(), postprocess(suggestions));
        }
        throw timeout;
      }
    };
  }
  /**
   * Compute suggestions for the given misspelled word within the given time limit.
   *
   * @param word the misspelled word to calculate suggestions for
   * @param timeLimitMs the duration limit in milliseconds after which the computation is
   *     interrupted by an exception
   * @param checkCanceled an object that's periodically called, allowing to interrupt the suggestion
   *     generation by throwing an exception
   * @return the suggestions for the given word
   * @throws SuggestionTimeoutException if the computation takes too long. Use {@link
   *     SuggestionTimeoutException#getPartialResult()} to get the suggestions computed up to that
   *     point
   */
  public List<String> suggestWithTimeout(String word, long timeLimitMs, Runnable checkCanceled)
      throws SuggestionTimeoutException {
    LinkedHashSet<Suggestion> collected = new LinkedHashSet<>();
    Runnable timedCheck = checkTimeLimit(word, collected, timeLimitMs, checkCanceled);
    Runnable guardedCheck = handleCustomTimeoutException(timedCheck, collected);
    return suggest(word, collected, guardedCheck);
  }
  /**
   * The common implementation behind the public entry points: runs the modification phase, then
   * (if needed and enabled) the enumeration phase, then the per-chunk handling of dashed words,
   * accumulating everything into {@code suggestions}.
   */
  private List<String> suggest(
      String word, LinkedHashSet<Suggestion> suggestions, Runnable checkCanceled)
      throws SuggestionTimeoutException {
    checkCanceled.run();
    // the length limit is applied to the word as passed in, before any input cleaning
    if (word.length() >= 100) {
      return Collections.emptyList();
    }
    String cleaned =
        dictionary.needsInputCleaning(word)
            ? dictionary.cleanInput(word, new StringBuilder()).toString()
            : word;
    Hunspell suggestionSpeller = newSuggestionSpeller(checkCanceled);
    WordCase wordCase = WordCase.caseOf(cleaned);
    if (wordCase == WordCase.LOWER && dictionary.forceUCase != FLAG_UNSET) {
      String title = dictionary.toTitleCase(cleaned);
      if (suggestionSpeller.spell(title)) {
        return Collections.singletonList(title);
      }
    }
    ModifyingSuggester modifier =
        new ModifyingSuggester(suggestionSpeller, suggestions, cleaned, wordCase);
    boolean hasGoodSuggestions = modifier.suggest();
    if (dictionary.maxNGramSuggestions > 0 && !hasGoodSuggestions) {
      GeneratingSuggester generator = new GeneratingSuggester(suggestionSpeller, suggestibleCache);
      List<String> generated =
          generator.suggest(dictionary.toLowerCase(cleaned), wordCase, suggestions);
      addRawSuggestions(generated, cleaned, wordCase, suggestionSpeller, suggestions);
    }
    // only fall back to fixing separate chunks when nothing with a dash was suggested so far
    if (cleaned.contains("-") && !suggestions.stream().anyMatch(s -> s.raw.contains("-"))) {
      List<String> dashed = modifyChunksBetweenDashes(cleaned, suggestionSpeller, checkCanceled);
      addRawSuggestions(dashed, cleaned, wordCase, suggestionSpeller, suggestions);
    }
    return postprocess(suggestions);
  }
  /**
   * Creates the speller used while suggesting: it rejects stems flagged with the dictionary's
   * {@code noSuggest} or {@code subStandard} flags and caches compound-beginning stem lookups.
   */
  private Hunspell newSuggestionSpeller(Runnable checkCanceled) {
    return new Hunspell(dictionary, NO_TIMEOUT, checkCanceled) {
      // Cache for expensive "findStem" requests issued when trying to split a compound word.
      // The suggestion algorithm issues many of them, often with the same text.
      // The cache can be large, but will be GC-ed after the "suggest" call.
      final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
      @Override
      boolean acceptsStem(int formID) {
        return !(dictionary.hasFlag(formID, dictionary.noSuggest)
            || dictionary.hasFlag(formID, dictionary.subStandard));
      }
      @Override
      Root<CharsRef> findStem(
          char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
        boolean cacheable = context == COMPOUND_BEGIN && originalCase == null;
        if (!cacheable) {
          return super.findStem(chars, offset, length, originalCase, context);
        }
        String key = new String(chars, offset, length);
        Optional<Root<CharsRef>> cached =
            compoundCache.computeIfAbsent(
                key,
                ignored ->
                    Optional.ofNullable(super.findStem(chars, offset, length, null, context)));
        return cached.orElse(null);
      }
    };
  }
  /** Wraps each raw string into a {@link Suggestion} and adds it to {@code target}. */
  private static void addRawSuggestions(
      List<String> raws,
      String word,
      WordCase wordCase,
      Hunspell speller,
      Set<Suggestion> target) {
    for (String raw : raws) {
      target.add(new Suggestion(raw, word, wordCase, speller));
    }
  }
  /**
   * Returns a callback that first delegates to {@code checkCanceled} and, on every 100th
   * invocation, compares the clock against the deadline, throwing a {@link
   * SuggestionTimeoutException} with the suggestions collected so far when the deadline has passed.
   */
  private Runnable checkTimeLimit(
      String word, Set<Suggestion> suggestions, long timeLimitMs, Runnable checkCanceled) {
    long deadlineNanos = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
    return new Runnable() {
      int callsUntilClockCheck = 100;
      @Override
      public void run() {
        checkCanceled.run();
        callsUntilClockCheck--;
        if (callsUntilClockCheck > 0) {
          return;
        }
        if (System.nanoTime() - deadlineNanos > 0) {
          String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
          throw new SuggestionTimeoutException(message, postprocess(suggestions));
        }
        // the counter is only re-armed when the deadline check has passed
        callsUntilClockCheck = 100;
      }
    };
  }
  /** Flattens the suggestions into their result strings, dropping duplicates. */
  private List<String> postprocess(Collection<Suggestion> suggestions) {
    return suggestions.stream()
        .map(suggestion -> suggestion.result)
        .flatMap(Arrays::stream)
        .distinct()
        .toList();
  }
  /**
   * For each misspelled non-empty chunk between dashes, substitutes every suggestion for that chunk
   * into the whole word and keeps the substitutions that the speller accepts.
   */
  private List<String> modifyChunksBetweenDashes(
      String word, Hunspell speller, Runnable checkCanceled) {
    List<String> replacements = new ArrayList<>();
    int wordLength = word.length();
    for (int start = 0, end; start < wordLength; start = end + 1) {
      int dash = word.indexOf('-', start);
      end = dash < 0 ? wordLength : dash;
      if (end <= start) {
        continue; // an empty chunk, e.g. a leading dash or two dashes in a row
      }
      String chunk = word.substring(start, end);
      if (speller.spell(chunk)) {
        continue;
      }
      String head = word.substring(0, start);
      String tail = word.substring(end);
      for (String chunkFix : suggestNoTimeout(chunk, checkCanceled)) {
        String candidate = head + chunkFix + tail;
        if (speller.spell(candidate)) {
          replacements.add(candidate);
        }
      }
    }
    return replacements;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestibleEntryCache.java
@ -0,0 +1,101 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import java.util.function.BiConsumer;
 import java.util.function.Supplier;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 /**
 * A cache allowing for CPU-cache-friendlier iteration over {@link WordStorage} entries that can be
 * used for suggestions. The words and the form data are stored in plain contiguous arrays with no
 * compression.
 */
 class SuggestibleEntryCache {
  // Two values per entry: the root length, then the number of ints of form data
  private final short[] lengths;
  // All roots concatenated in iteration order; entry boundaries come from "lengths"
  private final char[] roots;
  // All form data concatenated in iteration order; entry boundaries come from "lengths"
  private final int[] formData;
  private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
    this.lengths = lengths;
    this.roots = roots;
    this.formData = formData;
  }
  /**
   * Builds a cache from all entries that {@link WordStorage#processSuggestibleWords} reports for
   * the length range from 1 to {@link Integer#MAX_VALUE}.
   *
   * @param storage the word storage to copy the entries from
   * @return the cache holding a flat copy of the reported entries
   * @throws UnsupportedOperationException if a root or the form data of an entry is too long for
   *     its length to be stored in a {@code short}
   */
  static SuggestibleEntryCache buildCache(WordStorage storage) {
    var consumer =
        new BiConsumer<CharsRef, Supplier<IntsRef>>() {
          short[] lengths = new short[10];
          final StringBuilder roots = new StringBuilder();
          int[] formData = new int[10];
          int lenOffset = 0;
          int formDataOffset = 0;
          @Override
          public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
            if (root.length > Short.MAX_VALUE) {
              throw new UnsupportedOperationException(
                  "Too long dictionary entry, please report this to dev@lucene.apache.org");
            }
            IntsRef forms = formSupplier.get();
            // The form data length is narrowed to short below as well; without this check an
            // oversized entry would be silently truncated and shift all the subsequent offsets
            if (forms.length > Short.MAX_VALUE) {
              throw new UnsupportedOperationException(
                  "Too many forms in a dictionary entry, "
                      + "please report this to dev@lucene.apache.org");
            }
            lengths = ArrayUtil.grow(lengths, lenOffset + 2);
            lengths[lenOffset] = (short) root.length;
            lengths[lenOffset + 1] = (short) forms.length;
            lenOffset += 2;
            roots.append(root.chars, root.offset, root.length);
            formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
            System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
            formDataOffset += forms.length;
          }
        };
    storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);
    // the growable buffers are trimmed to the part that was actually filled
    return new SuggestibleEntryCache(
        ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
        consumer.roots.toString().toCharArray(),
        ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
  }
  /**
   * Calls the processor for every cached entry whose root length is within the given bounds. The
   * same {@link CharsRef} and {@link IntsRef} instances are passed to every call, re-pointed at
   * the current entry, and they reference the cache's own arrays.
   *
   * @param minLength the minimal root length to report, inclusive
   * @param maxLength the maximal root length to report, inclusive
   * @param processor the callback receiving the root and a supplier of its form data
   */
  void processSuggestibleWords(
      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
    CharsRef chars = new CharsRef(roots, 0, 0);
    IntsRef forms = new IntsRef(formData, 0, 0);
    Supplier<IntsRef> formSupplier = () -> forms;
    int rootOffset = 0;
    int formDataOffset = 0;
    for (int i = 0; i < lengths.length; i += 2) {
      int rootLength = lengths[i];
      int formDataLength = lengths[i + 1];
      if (rootLength >= minLength && rootLength <= maxLength) {
        chars.offset = rootOffset;
        chars.length = rootLength;
        forms.offset = formDataOffset;
        forms.length = formDataLength;
        processor.accept(chars, formSupplier);
      }
      // the offsets advance for skipped entries too, as all entries are stored back to back
      rootOffset += rootLength;
      formDataOffset += formDataLength;
    }
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordStorage.java
@ -20,6 +20,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.function.BiConsumer;
 import java.util.function.Supplier;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
 import org.apache.lucene.store.DataOutput;
@ -54,7 +55,8 @@ class WordStorage {
  private static final int COLLISION_MASK = 0x40;
  private static final int SUGGESTIBLE_MASK = 0x20;
  private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
-
+  private final int maxEntryLength;
  private final boolean hasCustomMorphData;
  /**
   * A map from word's hash (modulo array's length) into an int containing:
   *
@ -89,7 +91,10 @@ class WordStorage {
   */
  private final byte[] wordData;
-  private WordStorage(int[] hashTable, byte[] wordData) {
+  private WordStorage(
      int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
    this.maxEntryLength = maxEntryLength;
    this.hasCustomMorphData = hasCustomMorphData;
    this.hashTable = hashTable;
    this.wordData = wordData;
  }
@ -153,11 +158,13 @@ class WordStorage {
   * can be modified in any way, but may not be saved for later by the processor
   */
  void processSuggestibleWords(
-      int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
+      int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
    assert minLength <= maxLength;
    maxLength = Math.min(maxEntryLength, maxLength);
    CharsRef chars = new CharsRef(maxLength);
    IntsRef forms = new IntsRef();
    ByteArrayDataInput in = new ByteArrayDataInput(wordData);
    var formSupplier = new LazyFormReader(in);
    for (int entryCode : hashTable) {
      int pos = entryCode & OFFSET_MASK;
      int mask = entryCode >>> OFFSET_BITS;
@ -179,11 +186,7 @@ class WordStorage {
        }
        if (mightMatch) {
-          int dataLength = in.readVInt();
+          formSupplier.dataPos = in.getPosition();
          if (forms.ints.length < dataLength) {
            forms.ints = new int[dataLength];
          }
          readForms(forms, in, dataLength);
          while (prevPos != 0 && wordStart > 0) {
            in.setPosition(prevPos);
            chars.chars[--wordStart] = (char) in.readVInt();
@ -193,7 +196,7 @@ class WordStorage {
          if (prevPos == 0) {
            chars.offset = wordStart;
            chars.length = maxLength - wordStart;
-            processor.accept(chars, forms);
+            processor.accept(chars, formSupplier);
          }
        }
@ -257,6 +260,7 @@ class WordStorage {
    private final ByteArrayDataOutput dataWriter;
    private int commonPrefixLength, commonPrefixPos;
    private int actualWords;
    private int maxEntryLength;
    /**
     * @param wordCount an approximate number of the words in the resulting dictionary, used to
@ -297,6 +301,8 @@ class WordStorage {
     * {@link String#compareTo} rules.
     */
    void add(String entry, char[] flags, int morphDataID) throws IOException {
      maxEntryLength = Math.max(maxEntryLength, entry.length());
      if (!entry.equals(currentEntry)) {
        if (currentEntry != null) {
          if (entry.compareTo(currentEntry) < 0) {
@ -411,8 +417,36 @@ class WordStorage {
    WordStorage build() throws IOException {
      assert !group.isEmpty() : "build() should be only called once";
      flushGroup();
-      return new WordStorage(
+      byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
-          hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));
+      return new WordStorage(maxEntryLength, hasCustomMorphData, hashTable, trimmedData);
    }
  }
  /**
   * A supplier that decodes the forms of a word entry only when {@link #get()} is called, so that
   * callers that never ask for the forms don't pay for reading them. The returned {@link IntsRef}
   * is a single instance that is refilled on every call.
   */
  private class LazyFormReader implements Supplier<IntsRef> {
    // Where the entry's form data starts in the input; assigned from outside before each use
    int dataPos;
    private final ByteArrayDataInput in;
    private final IntsRef forms = new IntsRef();
    LazyFormReader(ByteArrayDataInput in) {
      this.in = in;
    }
    /**
     * Repositions the input at {@link #dataPos} and reads the entry's forms from there.
     *
     * @return the shared, refilled forms holder
     */
    @Override
    public IntsRef get() {
      in.setPosition(dataPos);
      int storedInts = in.readVInt();
      boolean pairedWithMorphData = hasCustomMorphData;
      // with custom morph data, each form occupies two stored ints, so the form count is halved
      int entryCount = pairedWithMorphData ? storedInts / 2 : storedInts;
      if (entryCount > forms.ints.length) {
        forms.ints = new int[entryCount];
      }
      int filled = 0;
      while (filled < entryCount) {
        forms.ints[filled++] = in.readVInt();
        if (pairedWithMorphData) {
          in.readVInt(); // the second int of the pair is read and discarded
        }
      }
      forms.length = entryCount;
      return forms;
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@ -149,6 +149,7 @@ public class TestAllDictionaries extends LuceneTestCase {
  }
  public void testDictionariesLoadSuccessfully() throws Exception {
    AtomicLong memoryWithCache = new AtomicLong();
    AtomicLong totalMemory = new AtomicLong();
    AtomicLong totalWords = new AtomicLong();
    int threads = Runtime.getRuntime().availableProcessors();
@ -159,7 +160,16 @@ public class TestAllDictionaries extends LuceneTestCase {
        (Path aff) -> {
          try {
            Dictionary dic = loadDictionary(aff);
            new Hunspell(dic).spell("aaaa");
            Suggester suggester = new Suggester(dic).withSuggestibleEntryCache();
            try {
              suggester.suggestWithTimeout("aaaaaaaaaa", Hunspell.SUGGEST_TIME_LIMIT, () -> {});
            } catch (
                @SuppressWarnings("unused")
                SuggestionTimeoutException e) {
            }
            totalMemory.addAndGet(RamUsageTester.ramUsed(dic));
            memoryWithCache.addAndGet(RamUsageTester.ramUsed(suggester));
            totalWords.addAndGet(RamUsageTester.ramUsed(dic.words));
            System.out.println(aff + "\t" + memoryUsageSummary(dic));
          } catch (Throwable e) {
@ -195,6 +205,9 @@ public class TestAllDictionaries extends LuceneTestCase {
    System.out.println("Total memory: " + RamUsageEstimator.humanReadableUnits(totalMemory.get()));
    System.out.println(
        "Total memory for word storage: " + RamUsageEstimator.humanReadableUnits(totalWords.get()));
    System.out.println(
        "Additional memory if withSuggestibleEntryCache is enabled: "
            + RamUsageEstimator.humanReadableUnits(memoryWithCache.get() - totalMemory.get()));
  }
  private static String memoryUsageSummary(Dictionary dic) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@ -110,6 +110,12 @@ public class TestDictionary extends LuceneTestCase {
    Set<String> processed = new HashSet<>();
    dictionary.words.processSuggestibleWords(
        minLength, maxLength, (word, __) -> processed.add(word.toString()));
    Set<String> cached = new HashSet<>();
    SuggestibleEntryCache.buildCache(dictionary.words)
        .processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
    assertEquals(processed, cached);
    return processed;
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
@ -29,6 +29,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CancellationException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.lucene.tests.util.LuceneTestCase;
@ -54,6 +55,33 @@ public class TestHunspell extends LuceneTestCase {
    assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
  }
  /**
   * Checks how a {@link SuggestionTimeoutException} thrown from a custom cancellation callback is
   * handled: a {@link Hunspell} created with {@code RETURN_PARTIAL_RESULT} still returns the
   * expected suggestion, while {@code Suggester.suggestNoTimeout} lets the exception escape with
   * the partial result and the original message attached.
   */
  public void testCustomCheckCanceledGivesPartialResult() throws Exception {
    Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
    List<String> expected = List.of("apach");

    // Baseline: with a no-op callback and no timeout the suggestion list is the expected one.
    Hunspell unlimited = new Hunspell(dictionary, NO_TIMEOUT, () -> {});
    assertEquals(expected, unlimited.suggest("apac"));

    String msg = "msg";
    AtomicInteger invocations = new AtomicInteger();
    // The callback tolerates 400 invocations and throws on every call after that.
    Runnable checkCanceled =
        () -> {
          if (invocations.incrementAndGet() <= 400) {
            return;
          }
          throw new SuggestionTimeoutException(msg, null);
        };

    Hunspell partialSpeller = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, checkCanceled);
    assertEquals(expected, partialSpeller.suggest("apac"));

    // Start counting from scratch so the second scenario gets the same invocation budget.
    invocations.set(0);
    SuggestionTimeoutException timeout =
        assertThrows(
            SuggestionTimeoutException.class,
            () -> {
              new Suggester(dictionary).suggestNoTimeout("apac", checkCanceled);
            });
    assertEquals(expected, timeout.getPartialResult());
    assertEquals("msg", timeout.getMessage());
  }
  public void testSuggestionTimeLimit() throws IOException, ParseException {
    int timeLimitMs = 10;
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@ -86,7 +86,7 @@ public class TestPerformance extends LuceneTestCase {
  @Test
  public void de_suggest() throws Exception {
-    checkSuggestionPerformance("de", 100);
+    checkSuggestionPerformance("de", 150);
  }
  @Test
@ -163,33 +163,33 @@ public class TestPerformance extends LuceneTestCase {
  private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
    Dictionary dictionary = loadDictionary(code);
    Suggester suggester = new Suggester(dictionary).withSuggestibleEntryCache();
    Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
    List<String> words =
        loadWords(code, wordCount, dictionary).stream()
            .distinct()
-            .filter(w -> hasQuickSuggestions(speller, w))
+            .filter(w -> hasQuickSuggestions(speller, suggester, w))
            .collect(Collectors.toList());
    System.out.println("Checking " + words.size() + " misspelled words");
    Hunspell fullSpeller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
    measure(
        "Suggestions for " + code,
        words.size(),
        blackHole -> {
          for (String word : words) {
-            blackHole.accept(fullSpeller.suggest(word));
+            blackHole.accept(suggester.suggestNoTimeout(word, () -> {}));
          }
        });
    System.out.println();
  }
-  private boolean hasQuickSuggestions(Hunspell speller, String word) {
+  private boolean hasQuickSuggestions(Hunspell speller, Suggester suggester, String word) {
    if (speller.spell(word)) {
      return false;
    }
    try {
-      speller.suggest(word);
+      suggester.suggestWithTimeout(word, Hunspell.SUGGEST_TIME_LIMIT, () -> {});
    } catch (
        @SuppressWarnings("unused")
        SuggestionTimeoutException e) {
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@ -249,10 +249,14 @@ public class TestSpellChecking extends LuceneTestCase {
    InputStream dictStream = Files.newInputStream(dicFile);
    Hunspell speller;
    Suggester defaultSuggester;
    Suggester cachingSuggester;
    try {
      Dictionary dictionary =
          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
      speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
      defaultSuggester = new Suggester(dictionary);
      cachingSuggester = new Suggester(dictionary).withSuggestibleEntryCache();
    } finally {
      IOUtils.closeWhileHandlingException(affixStream);
      IOUtils.closeWhileHandlingException(dictStream);
@ -273,12 +277,8 @@ public class TestSpellChecking extends LuceneTestCase {
        assertFalse("Unexpectedly considered correct: " + word, speller.spell(word.trim()));
      }
      if (Files.exists(sug)) {
-        String suggestions =
+        assertEquals(Files.readString(sug).trim(), suggest(defaultSuggester, wrongWords));
-            wrongWords.stream()
+        assertEquals(Files.readString(sug).trim(), suggest(cachingSuggester, wrongWords));
                .map(s -> String.join(", ", speller.suggest(s)))
                .filter(s -> !s.isEmpty())
                .collect(Collectors.joining("\n"));
        assertEquals(Files.readString(sug).trim(), suggestions);
      }
    } else {
      assertFalse(".sug file without .wrong file!", Files.exists(sug));
@ -290,6 +290,13 @@ public class TestSpellChecking extends LuceneTestCase {
    }
  }
  /**
   * Renders the suggestions for every given word as text: one line per word, with that word's
   * suggestions separated by {@code ", "}. Words that yield an empty line are left out, and the
   * remaining lines are separated by {@code "\n"}.
   *
   * @param suggester the suggester queried via {@code suggestNoTimeout} with a no-op callback
   * @param wrongWords the words to look up, processed in list order
   * @return the joined lines, or an empty string when no word produced any suggestion
   */
  private static String suggest(Suggester suggester, List<String> wrongWords) {
    StringBuilder result = new StringBuilder();
    for (String wrongWord : wrongWords) {
      String line = String.join(", ", suggester.suggestNoTimeout(wrongWord, () -> {}));
      if (line.isEmpty()) {
        continue;
      }
      // Every appended line is non-empty, so a non-empty builder means a line precedes this one.
      if (result.length() > 0) {
        result.append("\n");
      }
      result.append(line);
    }
    return result.toString();
  }
  private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
    Set<String> everythingGenerated = new HashSet<>();
    boolean generatedEverything = true;