LUCENE-10451 Hunspell: don't perform potentially expensive spellchecking after timeout (#721)

move all expensive operations closer to the suggestion creation, encapsulate case and output conversion into a new Suggestion class
2022-03-15 18:43:56 +01:00 · 2022-03-15 18:43:56 +01:00 · 92a20c24e9
parent b6c1024f55
commit 92a20c24e9
4 changed files with 157 additions and 89 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -52,7 +52,7 @@ class GeneratingSuggester {
    this.speller = speller;
  }
-  List<String> suggest(String word, WordCase originalCase, Set<String> prevSuggestions) {
+  List<String> suggest(String word, WordCase originalCase, Set<Suggestion> prevSuggestions) {
    List<Weighted<Root<String>>> roots = findSimilarDictionaryEntries(word, originalCase);
    List<Weighted<String>> expanded = expandRoots(word, roots);
    TreeSet<Weighted<String>> bySimilarity = rankBySimilarity(word, expanded);
@ -331,7 +331,7 @@ class GeneratingSuggester {
  }
  private List<String> getMostRelevantSuggestions(
-      TreeSet<Weighted<String>> bySimilarity, Set<String> prevSuggestions) {
+      TreeSet<Weighted<String>> bySimilarity, Set<Suggestion> prevSuggestions) {
    List<String> result = new ArrayList<>();
    boolean hasExcellent = false;
    for (Weighted<String> weighted : bySimilarity) {
@ -347,7 +347,7 @@ class GeneratingSuggester {
        break;
      }
-      if (prevSuggestions.stream().noneMatch(weighted.word::contains)
+      if (prevSuggestions.stream().noneMatch(s -> weighted.word.contains(s.raw))
          && result.stream().noneMatch(weighted.word::contains)
          && speller.checkWord(weighted.word)) {
        result.add(weighted.word);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@ -17,7 +17,8 @@
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
-import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
+import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
@ -25,11 +26,11 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
@ -543,25 +544,25 @@ public class Hunspell {
      }
    }
-    LinkedHashSet<String> suggestions = new LinkedHashSet<>();
+    LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
    Runnable checkCanceled =
-        policy == NO_TIMEOUT
+        policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, suggestions, timeLimitMs);
            ? this.checkCanceled
            : checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
    try {
      doSuggest(word, wordCase, suggestions, checkCanceled);
    } catch (SuggestionTimeoutException e) {
-      if (policy == RETURN_PARTIAL_RESULT) {
+      if (policy != RETURN_PARTIAL_RESULT) {
-        return postprocess(word, wordCase, suggestions);
+        throw e;
      }
      throw e;
    }
-    return postprocess(word, wordCase, suggestions);
+    return postprocess(suggestions);
  }
  private void doSuggest(
-      String word, WordCase wordCase, LinkedHashSet<String> suggestions, Runnable checkCanceled) {
+      String word,
      WordCase wordCase,
      LinkedHashSet<Suggestion> suggestions,
      Runnable checkCanceled) {
    Hunspell suggestionSpeller =
        new Hunspell(dictionary, policy, checkCanceled) {
          @Override
@ -570,22 +571,26 @@ public class Hunspell {
                && !dictionary.hasFlag(formID, dictionary.subStandard);
          }
        };
-    ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
+    boolean hasGoodSuggestions =
-    boolean hasGoodSuggestions = modifier.suggest(word, wordCase);
+        new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
    if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
-      suggestions.addAll(
+      List<String> generated =
          new GeneratingSuggester(suggestionSpeller)
-              .suggest(dictionary.toLowerCase(word), wordCase, suggestions));
+              .suggest(dictionary.toLowerCase(word), wordCase, suggestions);
      for (String raw : generated) {
        suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
      }
    }
-    if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
+    if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
-      suggestions.addAll(modifyChunksBetweenDashes(word));
+      for (String raw : modifyChunksBetweenDashes(word)) {
        suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
      }
    }
  }
-  private Runnable checkTimeLimit(
+  private Runnable checkTimeLimit(String word, Set<Suggestion> suggestions, long timeLimitMs) {
      String word, WordCase wordCase, Set<String> suggestions, long timeLimitMs) {
    return new Runnable() {
      final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
      int invocationCounter = 100;
@ -603,38 +608,15 @@ public class Hunspell {
      private void stop() {
        List<String> partialResult =
-            policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
+            policy == RETURN_PARTIAL_RESULT ? null : postprocess(suggestions);
        String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
        throw new SuggestionTimeoutException(message, partialResult);
      }
    };
  }
-  private List<String> postprocess(String word, WordCase wordCase, Collection<String> suggestions) {
+  private List<String> postprocess(Collection<Suggestion> suggestions) {
-    Set<String> result = new LinkedHashSet<>();
+    return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
    for (String candidate : suggestions) {
      result.add(adjustSuggestionCase(candidate, wordCase, word));
      if (wordCase == WordCase.UPPER && dictionary.checkSharpS && candidate.contains("ß")) {
        result.add(candidate);
      }
    }
    return result.stream().map(this::cleanOutput).collect(Collectors.toList());
  }
  private String adjustSuggestionCase(String candidate, WordCase originalCase, String original) {
    if (originalCase == WordCase.UPPER) {
      String upper = candidate.toUpperCase(Locale.ROOT);
      if (upper.contains(" ") || spell(upper)) {
        return upper;
      }
    }
    if (Character.isUpperCase(original.charAt(0))) {
      String title = Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
      if (title.contains(" ") || spell(title)) {
        return title;
      }
    }
    return candidate;
  }
  private List<String> modifyChunksBetweenDashes(String word) {
@ -662,12 +644,4 @@ public class Hunspell {
    }
    return result;
  }
  private String cleanOutput(String s) {
    if (dictionary.oconv == null) return s;
    StringBuilder sb = new StringBuilder(s);
    dictionary.oconv.applyMappings(sb);
    return sb.toString();
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@ -25,42 +25,50 @@ import java.util.Locale;
 /** A class that modifies the given misspelled word in various ways to get correct suggestions */
 class ModifyingSuggester {
  private static final int MAX_CHAR_DISTANCE = 4;
-  private final LinkedHashSet<String> result;
+  private final LinkedHashSet<Suggestion> result;
  private final String misspelled;
  private final WordCase wordCase;
  private final char[] tryChars;
  private final Hunspell speller;
-  ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
+  ModifyingSuggester(
      Hunspell speller, LinkedHashSet<Suggestion> result, String misspelled, WordCase wordCase) {
    this.speller = speller;
    tryChars = speller.dictionary.tryChars.toCharArray();
    this.result = result;
    this.misspelled = misspelled;
    this.wordCase = wordCase;
  }
  /** @return whether any of the added suggestions are considered "good" */
-  boolean suggest(String word, WordCase wordCase) {
+  boolean suggest() {
-    String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
+    String low =
        wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(misspelled) : misspelled;
    if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
      trySuggestion(low);
    }
-    boolean hasGoodSuggestions = tryVariationsOf(word);
+    boolean hasGoodSuggestions = tryVariationsOf(misspelled);
    if (wordCase == WordCase.TITLE) {
      hasGoodSuggestions |= tryVariationsOf(low);
    } else if (wordCase == WordCase.UPPER) {
      hasGoodSuggestions |= tryVariationsOf(low);
-      hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(word));
+      hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(misspelled));
    } else if (wordCase == WordCase.MIXED) {
-      int dot = word.indexOf('.');
+      int dot = misspelled.indexOf('.');
-      if (dot > 0
+      if (dot > 0 && dot < misspelled.length() - 1) {
-          && dot < word.length() - 1
+        String afterDot = misspelled.substring(dot + 1);
-          && WordCase.caseOf(word.substring(dot + 1)) == WordCase.TITLE) {
+        if (WordCase.caseOf(afterDot) == WordCase.TITLE) {
-        result.add(word.substring(0, dot + 1) + " " + word.substring(dot + 1));
+          result.add(createSuggestion(misspelled.substring(0, dot + 1) + " " + afterDot));
        }
      }
-      boolean capitalized = Character.isUpperCase(word.charAt(0));
+      char first = misspelled.charAt(0);
      boolean capitalized = Character.isUpperCase(first);
      if (capitalized) {
        hasGoodSuggestions |=
-            tryVariationsOf(speller.dictionary.caseFold(word.charAt(0)) + word.substring(1));
+            tryVariationsOf(speller.dictionary.caseFold(first) + misspelled.substring(1));
      }
      hasGoodSuggestions |= tryVariationsOf(low);
@ -69,29 +77,38 @@ class ModifyingSuggester {
        hasGoodSuggestions |= tryVariationsOf(speller.dictionary.toTitleCase(low));
      }
-      List<String> adjusted = new ArrayList<>();
+      List<Suggestion> reordered = new ArrayList<>();
-      for (String candidate : result) {
+      for (Suggestion candidate : result) {
-        String s = capitalizeAfterSpace(word, candidate);
+        Suggestion changed = capitalizeAfterSpace(candidate.raw);
-        adjusted.add(s.equals(candidate) ? adjusted.size() : 0, s);
+        if (changed == null) {
          reordered.add(candidate);
        } else {
          reordered.add(0, changed);
        }
      }
      result.clear();
-      result.addAll(adjusted);
+      result.addAll(reordered);
    }
    return hasGoodSuggestions;
  }
  private Suggestion createSuggestion(String candidate) {
    return new Suggestion(candidate, misspelled, wordCase, speller);
  }
  // aNew -> "a New" (instead of "a new")
-  private String capitalizeAfterSpace(String misspelled, String candidate) {
+  private Suggestion capitalizeAfterSpace(String candidate) {
    int space = candidate.indexOf(' ');
    int tail = candidate.length() - space - 1;
    if (space > 0
        && !misspelled.regionMatches(misspelled.length() - tail, candidate, space + 1, tail)) {
-      return candidate.substring(0, space + 1)
+      return createSuggestion(
-          + Character.toUpperCase(candidate.charAt(space + 1))
+          candidate.substring(0, space + 1)
-          + candidate.substring(space + 2);
+              + Character.toUpperCase(candidate.charAt(space + 1))
              + candidate.substring(space + 2));
    }
-    return candidate;
+    return null;
  }
  private boolean tryVariationsOf(String word) {
@ -111,9 +128,9 @@ class ModifyingSuggester {
    tryReplacingChar(word);
    tryTwoDuplicateChars(word);
-    List<String> goodSplit = checkDictionaryForSplitSuggestions(word);
+    List<Suggestion> goodSplit = checkDictionaryForSplitSuggestions(word);
    if (!goodSplit.isEmpty()) {
-      List<String> copy = new ArrayList<>(result);
+      List<Suggestion> copy = new ArrayList<>(result);
      result.clear();
      result.addAll(goodSplit);
      if (hasGoodSuggestions) {
@ -139,7 +156,7 @@ class ModifyingSuggester {
        if (candidate.contains(" ")
            && Arrays.stream(candidate.split(" ")).allMatch(this::checkSimpleWord)) {
-          result.add(candidate);
+          result.add(createSuggestion(candidate));
        }
      }
    }
@ -294,19 +311,19 @@ class ModifyingSuggester {
    }
  }
-  private List<String> checkDictionaryForSplitSuggestions(String word) {
+  private List<Suggestion> checkDictionaryForSplitSuggestions(String word) {
-    List<String> result = new ArrayList<>();
+    List<Suggestion> result = new ArrayList<>();
    for (int i = 1; i < word.length() - 1; i++) {
      String w1 = word.substring(0, i);
      String w2 = word.substring(i);
      String spaced = w1 + " " + w2;
      if (speller.checkWord(spaced)) {
-        result.add(spaced);
+        result.add(createSuggestion(spaced));
      }
      if (shouldSplitByDash()) {
        String dashed = w1 + "-" + w2;
        if (speller.checkWord(dashed)) {
-          result.add(dashed);
+          result.add(createSuggestion(dashed));
        }
      }
    }
@ -318,9 +335,9 @@ class ModifyingSuggester {
      String w1 = word.substring(0, i);
      String w2 = word.substring(i);
      if (checkSimpleWord(w1) && checkSimpleWord(w2)) {
-        result.add(w1 + " " + w2);
+        result.add(createSuggestion(w1 + " " + w2));
        if (w1.length() > 1 && w2.length() > 1 && shouldSplitByDash()) {
-          result.add(w1 + "-" + w2);
+          result.add(createSuggestion(w1 + "-" + w2));
        }
      }
    }
@ -331,6 +348,6 @@ class ModifyingSuggester {
  }
  private boolean trySuggestion(String candidate) {
-    return speller.checkWord(candidate) && result.add(candidate);
+    return speller.checkWord(candidate) && result.add(createSuggestion(candidate));
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggestion.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Suggestion.java
@ -0,0 +1,77 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 import java.util.Objects;
 class Suggestion {
  final String raw;
  final String[] result;
  Suggestion(String raw, String misspelled, WordCase originalCase, Hunspell speller) {
    this.raw = raw;
    List<String> result = new ArrayList<>();
    String adjusted = adjustSuggestionCase(raw, misspelled, originalCase);
    result.add(
        cleanOutput(speller, adjusted.contains(" ") || speller.spell(adjusted) ? adjusted : raw));
    if (originalCase == WordCase.UPPER && speller.dictionary.checkSharpS && raw.contains("ß")) {
      result.add(cleanOutput(speller, raw));
    }
    this.result = result.toArray(new String[0]);
  }
  private String adjustSuggestionCase(String candidate, String misspelled, WordCase originalCase) {
    if (originalCase == WordCase.UPPER) {
      return candidate.toUpperCase(Locale.ROOT);
    }
    if (Character.isUpperCase(misspelled.charAt(0))) {
      return Character.toUpperCase(candidate.charAt(0)) + candidate.substring(1);
    }
    return candidate;
  }
  private String cleanOutput(Hunspell speller, String s) {
    if (speller.dictionary.oconv == null) return s;
    StringBuilder sb = new StringBuilder(s);
    speller.dictionary.oconv.applyMappings(sb);
    return sb.toString();
  }
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof Suggestion)) return false;
    Suggestion that = (Suggestion) o;
    return raw.equals(that.raw) && Arrays.equals(result, that.result);
  }
  @Override
  public int hashCode() {
    return 31 * Objects.hash(raw) + Arrays.hashCode(result);
  }
  @Override
  public String toString() {
    return raw;
  }
 }