LUCENE-9800: Hunspell: put a time limit on suggestion calculation (#2414)

* LUCENE-9800: Hunspell: put a time limit on suggestion calculation * fix review remarks
2021-02-22 20:06:24 +01:00 · 2021-02-22 20:06:24 +01:00 · 42da2b45e6
parent bfce5f36da
commit 42da2b45e6
10 changed files with 266 additions and 76 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@ -17,6 +17,7 @@
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
@ -24,11 +25,13 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@ -49,20 +52,25 @@ import org.apache.lucene.util.IntsRef;
 * shared by multiple spell-checkers in different threads).
 */
 public class Hunspell {
  static final long SUGGEST_TIME_LIMIT = 250;
  final Dictionary dictionary;
  final Stemmer stemmer;
  private final TimeoutPolicy policy;
  final Runnable checkCanceled;
  public Hunspell(Dictionary dictionary) {
-    this(dictionary, () -> {});
+    this(dictionary, RETURN_PARTIAL_RESULT, () -> {});
  }
  /**
   * @param policy a strategy determining what to do when API calls take too much time
   * @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
   *     or suggestion generation by throwing an exception
   */
-  public Hunspell(Dictionary dictionary, Runnable checkCanceled) {
+  public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) {
    this.dictionary = dictionary;
    this.policy = policy;
    this.checkCanceled = checkCanceled;
    stemmer = new Stemmer(dictionary);
  }
@ -504,7 +512,16 @@ public class Hunspell {
        && spell(word.substring(breakPos + breakStr.length()));
  }
-  public List<String> suggest(String word) {
+  /**
   * @return suggestions for the given misspelled word
   * @throws SuggestionTimeoutException if the computation takes too long and {@link
   *     TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
   */
  public List<String> suggest(String word) throws SuggestionTimeoutException {
    return suggest(word, SUGGEST_TIME_LIMIT);
  }
  List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
    checkCanceled.run();
    if (word.length() >= 100) return Collections.emptyList();
@ -520,16 +537,35 @@ public class Hunspell {
      }
    }
    LinkedHashSet<String> suggestions = new LinkedHashSet<>();
    Runnable checkCanceled =
        policy == NO_TIMEOUT
            ? this.checkCanceled
            : checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
    try {
      doSuggest(word, wordCase, suggestions, checkCanceled);
    } catch (SuggestionTimeoutException e) {
      if (policy == RETURN_PARTIAL_RESULT) {
        return postprocess(word, wordCase, suggestions);
      }
      throw e;
    }
    return postprocess(word, wordCase, suggestions);
  }
  private void doSuggest(
      String word, WordCase wordCase, LinkedHashSet<String> suggestions, Runnable checkCanceled) {
    Hunspell suggestionSpeller =
-        new Hunspell(dictionary, checkCanceled) {
+        new Hunspell(dictionary, policy, checkCanceled) {
          @Override
          boolean acceptsStem(int formID) {
            return !dictionary.hasFlag(formID, dictionary.noSuggest)
                && !dictionary.hasFlag(formID, dictionary.subStandard);
          }
        };
-    ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
+    ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
-    Set<String> suggestions = modifier.suggest(word, wordCase);
+    modifier.suggest(word, wordCase);
    if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
      suggestions.addAll(
@ -540,7 +576,35 @@ public class Hunspell {
    if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
      suggestions.addAll(modifyChunksBetweenDashes(word));
    }
  }
  private Runnable checkTimeLimit(
      String word, WordCase wordCase, Set<String> suggestions, long timeLimitMs) {
    return new Runnable() {
      final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
      int invocationCounter = 100;
      @Override
      public void run() {
        checkCanceled.run();
        if (--invocationCounter <= 0) {
          if (System.nanoTime() - deadline > 0) {
            stop();
          }
          invocationCounter = 100;
        }
      }
      private void stop() {
        List<String> partialResult =
            policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
        String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
        throw new SuggestionTimeoutException(message, partialResult);
      }
    };
  }
  private List<String> postprocess(String word, WordCase wordCase, Collection<String> suggestions) {
    Set<String> result = new LinkedHashSet<>();
    for (String candidate : suggestions) {
      result.add(adjustSuggestionCase(candidate, wordCase, word));
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ModifyingSuggester.java
@ -26,17 +26,18 @@ import java.util.stream.Collectors;
 /** A class that modifies the given misspelled word in various ways to get correct suggestions */
 class ModifyingSuggester {
  private static final int MAX_CHAR_DISTANCE = 4;
-  private final LinkedHashSet<String> result = new LinkedHashSet<>();
+  private final LinkedHashSet<String> result;
  private final char[] tryChars;
  private final Hunspell speller;
  boolean hasGoodSuggestions;
-  ModifyingSuggester(Hunspell speller) {
+  ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
    this.speller = speller;
    tryChars = speller.dictionary.tryChars.toCharArray();
    this.result = result;
  }
-  LinkedHashSet<String> suggest(String word, WordCase wordCase) {
+  void suggest(String word, WordCase wordCase) {
    String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
    if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
      trySuggestion(low);
@ -68,12 +69,11 @@ class ModifyingSuggester {
        tryVariationsOf(speller.dictionary.toTitleCase(low));
      }
-      return result.stream()
+      List<String> adjusted =
-          .map(s -> capitalizeAfterSpace(low, s))
+          result.stream().map(s -> capitalizeAfterSpace(low, s)).collect(Collectors.toList());
-          .collect(Collectors.toCollection(LinkedHashSet::new));
+      result.clear();
      result.addAll(adjusted);
    }
    return result;
  }
  // aNew -> "a New" (instead of "a new")
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestionTimeoutException.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SuggestionTimeoutException.java
@ -0,0 +1,41 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import java.util.Collections;
 import java.util.List;
 /**
 * An exception thrown when {@link Hunspell#suggest} call takes too long, if {@link
 * TimeoutPolicy#THROW_EXCEPTION} is used.
 */
 public class SuggestionTimeoutException extends RuntimeException {
  private final List<String> partialResult;
  SuggestionTimeoutException(String message, List<String> partialResult) {
    super(message);
    this.partialResult = partialResult == null ? null : Collections.unmodifiableList(partialResult);
  }
  /**
   * @return partial result calculated by {@link Hunspell#suggest} before the time limit was
   *     exceeded
   */
  public List<String> getPartialResult() {
    return partialResult;
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TimeoutPolicy.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/TimeoutPolicy.java
@ -0,0 +1,33 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 /** A strategy determining what to do when Hunspell API calls take too much time */
 public enum TimeoutPolicy {
  /** Let the computation complete even if it takes ages */
  NO_TIMEOUT,
  /** Just stop the calculation and return whatever has been computed so far */
  RETURN_PARTIAL_RESULT,
  /**
   * Throw an exception (e.g. {@link SuggestionTimeoutException}) to make it more clear to the
   * caller that the timeout happened and the returned results might not be reliable or
   * reproducible.
   */
  THROW_EXCEPTION
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellTest.java
@ -1,45 +0,0 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
 import java.util.Collections;
 import java.util.concurrent.CancellationException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.lucene.util.LuceneTestCase;
 public class HunspellTest extends LuceneTestCase {
  public void testCheckCanceled() throws Exception {
    AtomicBoolean canceled = new AtomicBoolean();
    Runnable checkCanceled =
        () -> {
          if (canceled.get()) {
            throw new CancellationException();
          }
        };
    Hunspell hunspell =
        new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"), checkCanceled);
    assertTrue(hunspell.spell("apache"));
    assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
    canceled.set(true);
    assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
    assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspell.java
@ -0,0 +1,80 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
 import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
 import java.io.IOException;
 import java.text.ParseException;
 import java.util.Collections;
 import java.util.concurrent.CancellationException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.lucene.util.LuceneTestCase;
 import org.junit.Test;
 public class TestHunspell extends LuceneTestCase {
  public void testCheckCanceled() throws Exception {
    AtomicBoolean canceled = new AtomicBoolean();
    Runnable checkCanceled =
        () -> {
          if (canceled.get()) {
            throw new CancellationException();
          }
        };
    Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
    Hunspell hunspell = new Hunspell(dictionary, NO_TIMEOUT, checkCanceled);
    assertTrue(hunspell.spell("apache"));
    assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
    canceled.set(true);
    assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
    assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
  }
  public void testSuggestionTimeLimit() throws IOException, ParseException {
    int timeLimitMs = 10;
    Dictionary dictionary = loadDictionary(false, "timelimit.aff", "simple.dic");
    String word = "qazwsxedcrfvtgbyhnujm";
    Hunspell incomplete = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, () -> {});
    assertFalse(incomplete.spell(word));
    assertEquals(Collections.emptyList(), incomplete.suggest(word, timeLimitMs));
    Hunspell throwing = new Hunspell(dictionary, THROW_EXCEPTION, () -> {});
    assertFalse(throwing.spell(word));
    SuggestionTimeoutException exception =
        assertThrows(SuggestionTimeoutException.class, () -> throwing.suggest(word, timeLimitMs));
    assertEquals(Collections.emptyList(), exception.getPartialResult());
    Hunspell noLimit = new Hunspell(dictionary, NO_TIMEOUT, () -> {});
    assertEquals(Collections.emptyList(), noLimit.suggest(word.substring(0, 5), timeLimitMs));
  }
  @Test
  public void testStemmingApi() throws Exception {
    Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
    Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
    assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
    assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestPerformance.java
@ -28,6 +28,7 @@ import java.nio.file.Paths;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@ -81,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
  @Test
  public void fr_suggest() throws Exception {
-    checkSuggestionPerformance("fr", 10);
+    checkSuggestionPerformance("fr", 100);
  }
  private Dictionary loadDictionary(String code) throws IOException, ParseException {
@ -97,7 +98,7 @@ public class TestPerformance extends LuceneTestCase {
    List<String> words = loadWords(code, wordCount, dictionary);
    Stemmer stemmer = new Stemmer(dictionary);
-    Hunspell speller = new Hunspell(dictionary);
+    Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
    measure(
        "Stemming " + code,
        blackHole -> {
@ -117,10 +118,10 @@ public class TestPerformance extends LuceneTestCase {
  private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
    Dictionary dictionary = loadDictionary(code);
-    Hunspell speller = new Hunspell(dictionary);
+    Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
    List<String> words =
        loadWords(code, wordCount, dictionary).stream()
-            .filter(w -> !speller.spell(w))
+            .filter(w -> hasQuickSuggestions(speller, w))
            .collect(Collectors.toList());
    System.out.println("Checking " + words.size() + " misspelled words");
@ -134,6 +135,25 @@ public class TestPerformance extends LuceneTestCase {
    System.out.println();
  }
  private boolean hasQuickSuggestions(Hunspell speller, String word) {
    if (speller.spell(word)) {
      return false;
    }
    long start = System.nanoTime();
    try {
      speller.suggest(word);
    } catch (SuggestionTimeoutException e) {
      System.out.println("Timeout happened for " + word + ", skipping");
      return false;
    }
    long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    if (elapsed > Hunspell.SUGGEST_TIME_LIMIT * 4 / 5) {
      System.out.println(elapsed + "ms for " + word + ", too close to time limit, skipping");
    }
    return true;
  }
  private Path findAffFile(String code) throws IOException {
    return TestAllDictionaries.findAllAffixFiles()
        .filter(
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@ -25,8 +25,9 @@ import java.util.List;
 import java.util.stream.Collectors;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
-public class TestSpellChecking extends StemmerTestBase {
+public class TestSpellChecking extends LuceneTestCase {
  public void testBase() throws Exception {
    doTest("base");
@ -233,7 +234,7 @@ public class TestSpellChecking extends StemmerTestBase {
    try {
      Dictionary dictionary =
          new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
-      speller = new Hunspell(dictionary);
+      speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
    } finally {
      IOUtils.closeWhileHandlingException(affixStream);
      IOUtils.closeWhileHandlingException(dictStream);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
@ -16,11 +16,7 @@
 */
 package org.apache.lucene.analysis.hunspell;
 import java.io.IOException;
 import java.text.ParseException;
 import java.util.Collections;
 import org.junit.BeforeClass;
 import org.junit.Test;
 public class TestStemmer extends StemmerTestBase {
@ -62,13 +58,6 @@ public class TestStemmer extends StemmerTestBase {
    assertStemsTo("solr", "olr");
  }
  @Test
  public void testHunspellStemmingApi() throws IOException, ParseException {
    Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"));
    assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
    assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
  }
  // some bogus stuff that should not stem (empty lists)!
  public void testBogusStems() {
    assertStemsTo("abs");
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/timelimit.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/timelimit.aff
@ -0,0 +1,7 @@
 # A very expensive MAP: each character from each group has to be replaced with every other character from the same group,
 # so for long words the enumeration is very long
 MAP 3
 MAP qwertyuiop
 MAP asdfghjkl
 MAP zxcvbnm