LUCENE-9800: Hunspell: put a time limit on suggestion calculation (#2414)

* LUCENE-9800: Hunspell: put a time limit on suggestion calculation

* fix review remarks
This commit is contained in:
Peter Gromov 2021-02-22 20:06:24 +01:00 committed by GitHub
parent bfce5f36da
commit 42da2b45e6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 266 additions and 76 deletions

View File

@ -17,6 +17,7 @@
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET; import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE; import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
@ -24,11 +25,13 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD; import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedHashSet; import java.util.LinkedHashSet;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
@ -49,20 +52,25 @@ import org.apache.lucene.util.IntsRef;
* shared by multiple spell-checkers in different threads). * shared by multiple spell-checkers in different threads).
*/ */
public class Hunspell { public class Hunspell {
static final long SUGGEST_TIME_LIMIT = 250;
final Dictionary dictionary; final Dictionary dictionary;
final Stemmer stemmer; final Stemmer stemmer;
private final TimeoutPolicy policy;
final Runnable checkCanceled; final Runnable checkCanceled;
public Hunspell(Dictionary dictionary) { public Hunspell(Dictionary dictionary) {
this(dictionary, () -> {}); this(dictionary, RETURN_PARTIAL_RESULT, () -> {});
} }
/** /**
* @param policy a strategy determining what to do when API calls take too much time
* @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking * @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
* or suggestion generation by throwing an exception * or suggestion generation by throwing an exception
*/ */
public Hunspell(Dictionary dictionary, Runnable checkCanceled) { public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) {
this.dictionary = dictionary; this.dictionary = dictionary;
this.policy = policy;
this.checkCanceled = checkCanceled; this.checkCanceled = checkCanceled;
stemmer = new Stemmer(dictionary); stemmer = new Stemmer(dictionary);
} }
@ -504,7 +512,16 @@ public class Hunspell {
&& spell(word.substring(breakPos + breakStr.length())); && spell(word.substring(breakPos + breakStr.length()));
} }
public List<String> suggest(String word) { /**
* @return suggestions for the given misspelled word
* @throws SuggestionTimeoutException if the computation takes too long and {@link
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
*/
public List<String> suggest(String word) throws SuggestionTimeoutException {
return suggest(word, SUGGEST_TIME_LIMIT);
}
List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
checkCanceled.run(); checkCanceled.run();
if (word.length() >= 100) return Collections.emptyList(); if (word.length() >= 100) return Collections.emptyList();
@ -520,16 +537,35 @@ public class Hunspell {
} }
} }
LinkedHashSet<String> suggestions = new LinkedHashSet<>();
Runnable checkCanceled =
policy == NO_TIMEOUT
? this.checkCanceled
: checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
try {
doSuggest(word, wordCase, suggestions, checkCanceled);
} catch (SuggestionTimeoutException e) {
if (policy == RETURN_PARTIAL_RESULT) {
return postprocess(word, wordCase, suggestions);
}
throw e;
}
return postprocess(word, wordCase, suggestions);
}
private void doSuggest(
String word, WordCase wordCase, LinkedHashSet<String> suggestions, Runnable checkCanceled) {
Hunspell suggestionSpeller = Hunspell suggestionSpeller =
new Hunspell(dictionary, checkCanceled) { new Hunspell(dictionary, policy, checkCanceled) {
@Override @Override
boolean acceptsStem(int formID) { boolean acceptsStem(int formID) {
return !dictionary.hasFlag(formID, dictionary.noSuggest) return !dictionary.hasFlag(formID, dictionary.noSuggest)
&& !dictionary.hasFlag(formID, dictionary.subStandard); && !dictionary.hasFlag(formID, dictionary.subStandard);
} }
}; };
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller); ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
Set<String> suggestions = modifier.suggest(word, wordCase); modifier.suggest(word, wordCase);
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) { if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
suggestions.addAll( suggestions.addAll(
@ -540,7 +576,35 @@ public class Hunspell {
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) { if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
suggestions.addAll(modifyChunksBetweenDashes(word)); suggestions.addAll(modifyChunksBetweenDashes(word));
} }
}
private Runnable checkTimeLimit(
String word, WordCase wordCase, Set<String> suggestions, long timeLimitMs) {
return new Runnable() {
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
int invocationCounter = 100;
@Override
public void run() {
checkCanceled.run();
if (--invocationCounter <= 0) {
if (System.nanoTime() - deadline > 0) {
stop();
}
invocationCounter = 100;
}
}
private void stop() {
List<String> partialResult =
policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
throw new SuggestionTimeoutException(message, partialResult);
}
};
}
private List<String> postprocess(String word, WordCase wordCase, Collection<String> suggestions) {
Set<String> result = new LinkedHashSet<>(); Set<String> result = new LinkedHashSet<>();
for (String candidate : suggestions) { for (String candidate : suggestions) {
result.add(adjustSuggestionCase(candidate, wordCase, word)); result.add(adjustSuggestionCase(candidate, wordCase, word));

View File

@ -26,17 +26,18 @@ import java.util.stream.Collectors;
/** A class that modifies the given misspelled word in various ways to get correct suggestions */ /** A class that modifies the given misspelled word in various ways to get correct suggestions */
class ModifyingSuggester { class ModifyingSuggester {
private static final int MAX_CHAR_DISTANCE = 4; private static final int MAX_CHAR_DISTANCE = 4;
private final LinkedHashSet<String> result = new LinkedHashSet<>(); private final LinkedHashSet<String> result;
private final char[] tryChars; private final char[] tryChars;
private final Hunspell speller; private final Hunspell speller;
boolean hasGoodSuggestions; boolean hasGoodSuggestions;
ModifyingSuggester(Hunspell speller) { ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
this.speller = speller; this.speller = speller;
tryChars = speller.dictionary.tryChars.toCharArray(); tryChars = speller.dictionary.tryChars.toCharArray();
this.result = result;
} }
LinkedHashSet<String> suggest(String word, WordCase wordCase) { void suggest(String word, WordCase wordCase) {
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word; String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) { if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
trySuggestion(low); trySuggestion(low);
@ -68,12 +69,11 @@ class ModifyingSuggester {
tryVariationsOf(speller.dictionary.toTitleCase(low)); tryVariationsOf(speller.dictionary.toTitleCase(low));
} }
return result.stream() List<String> adjusted =
.map(s -> capitalizeAfterSpace(low, s)) result.stream().map(s -> capitalizeAfterSpace(low, s)).collect(Collectors.toList());
.collect(Collectors.toCollection(LinkedHashSet::new)); result.clear();
result.addAll(adjusted);
} }
return result;
} }
// aNew -> "a New" (instead of "a new") // aNew -> "a New" (instead of "a new")

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.util.Collections;
import java.util.List;
/**
* An exception thrown when {@link Hunspell#suggest} call takes too long, if {@link
* TimeoutPolicy#THROW_EXCEPTION} is used.
*/
public class SuggestionTimeoutException extends RuntimeException {
private final List<String> partialResult;
SuggestionTimeoutException(String message, List<String> partialResult) {
super(message);
this.partialResult = partialResult == null ? null : Collections.unmodifiableList(partialResult);
}
/**
* @return partial result calculated by {@link Hunspell#suggest} before the time limit was
* exceeded
*/
public List<String> getPartialResult() {
return partialResult;
}
}

View File

@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
/** A strategy determining what to do when Hunspell API calls take too much time */
public enum TimeoutPolicy {
/** Let the computation complete even if it takes ages */
NO_TIMEOUT,
/** Just stop the calculation and return whatever has been computed so far */
RETURN_PARTIAL_RESULT,
/**
* Throw an exception (e.g. {@link SuggestionTimeoutException}) to make it more clear to the
* caller that the timeout happened and the returned results might not be reliable or
* reproducible.
*/
THROW_EXCEPTION
}

View File

@ -1,45 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
import java.util.Collections;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.util.LuceneTestCase;
public class HunspellTest extends LuceneTestCase {
public void testCheckCanceled() throws Exception {
AtomicBoolean canceled = new AtomicBoolean();
Runnable checkCanceled =
() -> {
if (canceled.get()) {
throw new CancellationException();
}
};
Hunspell hunspell =
new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"), checkCanceled);
assertTrue(hunspell.spell("apache"));
assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
canceled.set(true);
assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
}
}

View File

@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
import java.io.IOException;
import java.text.ParseException;
import java.util.Collections;
import java.util.concurrent.CancellationException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;
public class TestHunspell extends LuceneTestCase {
public void testCheckCanceled() throws Exception {
AtomicBoolean canceled = new AtomicBoolean();
Runnable checkCanceled =
() -> {
if (canceled.get()) {
throw new CancellationException();
}
};
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
Hunspell hunspell = new Hunspell(dictionary, NO_TIMEOUT, checkCanceled);
assertTrue(hunspell.spell("apache"));
assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
canceled.set(true);
assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
}
public void testSuggestionTimeLimit() throws IOException, ParseException {
int timeLimitMs = 10;
Dictionary dictionary = loadDictionary(false, "timelimit.aff", "simple.dic");
String word = "qazwsxedcrfvtgbyhnujm";
Hunspell incomplete = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, () -> {});
assertFalse(incomplete.spell(word));
assertEquals(Collections.emptyList(), incomplete.suggest(word, timeLimitMs));
Hunspell throwing = new Hunspell(dictionary, THROW_EXCEPTION, () -> {});
assertFalse(throwing.spell(word));
SuggestionTimeoutException exception =
assertThrows(SuggestionTimeoutException.class, () -> throwing.suggest(word, timeLimitMs));
assertEquals(Collections.emptyList(), exception.getPartialResult());
Hunspell noLimit = new Hunspell(dictionary, NO_TIMEOUT, () -> {});
assertEquals(Collections.emptyList(), noLimit.suggest(word.substring(0, 5), timeLimitMs));
}
@Test
public void testStemmingApi() throws Exception {
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
}
}

View File

@ -28,6 +28,7 @@ import java.nio.file.Paths;
import java.text.ParseException; import java.text.ParseException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer; import java.util.function.Consumer;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -81,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
@Test @Test
public void fr_suggest() throws Exception { public void fr_suggest() throws Exception {
checkSuggestionPerformance("fr", 10); checkSuggestionPerformance("fr", 100);
} }
private Dictionary loadDictionary(String code) throws IOException, ParseException { private Dictionary loadDictionary(String code) throws IOException, ParseException {
@ -97,7 +98,7 @@ public class TestPerformance extends LuceneTestCase {
List<String> words = loadWords(code, wordCount, dictionary); List<String> words = loadWords(code, wordCount, dictionary);
Stemmer stemmer = new Stemmer(dictionary); Stemmer stemmer = new Stemmer(dictionary);
Hunspell speller = new Hunspell(dictionary); Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
measure( measure(
"Stemming " + code, "Stemming " + code,
blackHole -> { blackHole -> {
@ -117,10 +118,10 @@ public class TestPerformance extends LuceneTestCase {
private void checkSuggestionPerformance(String code, int wordCount) throws Exception { private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
Dictionary dictionary = loadDictionary(code); Dictionary dictionary = loadDictionary(code);
Hunspell speller = new Hunspell(dictionary); Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
List<String> words = List<String> words =
loadWords(code, wordCount, dictionary).stream() loadWords(code, wordCount, dictionary).stream()
.filter(w -> !speller.spell(w)) .filter(w -> hasQuickSuggestions(speller, w))
.collect(Collectors.toList()); .collect(Collectors.toList());
System.out.println("Checking " + words.size() + " misspelled words"); System.out.println("Checking " + words.size() + " misspelled words");
@ -134,6 +135,25 @@ public class TestPerformance extends LuceneTestCase {
System.out.println(); System.out.println();
} }
private boolean hasQuickSuggestions(Hunspell speller, String word) {
if (speller.spell(word)) {
return false;
}
long start = System.nanoTime();
try {
speller.suggest(word);
} catch (SuggestionTimeoutException e) {
System.out.println("Timeout happened for " + word + ", skipping");
return false;
}
long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
if (elapsed > Hunspell.SUGGEST_TIME_LIMIT * 4 / 5) {
System.out.println(elapsed + "ms for " + word + ", too close to time limit, skipping");
}
return true;
}
private Path findAffFile(String code) throws IOException { private Path findAffFile(String code) throws IOException {
return TestAllDictionaries.findAllAffixFiles() return TestAllDictionaries.findAllAffixFiles()
.filter( .filter(

View File

@ -25,8 +25,9 @@ import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
public class TestSpellChecking extends StemmerTestBase { public class TestSpellChecking extends LuceneTestCase {
public void testBase() throws Exception { public void testBase() throws Exception {
doTest("base"); doTest("base");
@ -233,7 +234,7 @@ public class TestSpellChecking extends StemmerTestBase {
try { try {
Dictionary dictionary = Dictionary dictionary =
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream); new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
speller = new Hunspell(dictionary); speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
} finally { } finally {
IOUtils.closeWhileHandlingException(affixStream); IOUtils.closeWhileHandlingException(affixStream);
IOUtils.closeWhileHandlingException(dictStream); IOUtils.closeWhileHandlingException(dictStream);

View File

@ -16,11 +16,7 @@
*/ */
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.text.ParseException;
import java.util.Collections;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test;
public class TestStemmer extends StemmerTestBase { public class TestStemmer extends StemmerTestBase {
@ -62,13 +58,6 @@ public class TestStemmer extends StemmerTestBase {
assertStemsTo("solr", "olr"); assertStemsTo("solr", "olr");
} }
@Test
public void testHunspellStemmingApi() throws IOException, ParseException {
Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"));
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
}
// some bogus stuff that should not stem (empty lists)! // some bogus stuff that should not stem (empty lists)!
public void testBogusStems() { public void testBogusStems() {
assertStemsTo("abs"); assertStemsTo("abs");

View File

@ -0,0 +1,7 @@
# A very expensive MAP: each character from each group has to be replaced with every other character from the same group,
# so for long words the enumeration is very long
MAP 3
MAP qwertyuiop
MAP asdfghjkl
MAP zxcvbnm