mirror of https://github.com/apache/lucene.git
LUCENE-9800: Hunspell: put a time limit on suggestion calculation (#2414)
* LUCENE-9800: Hunspell: put a time limit on suggestion calculation * fix review remarks
This commit is contained in:
parent
bfce5f36da
commit
42da2b45e6
|
@ -17,6 +17,7 @@
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.*;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
||||||
|
@ -24,11 +25,13 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
@ -49,20 +52,25 @@ import org.apache.lucene.util.IntsRef;
|
||||||
* shared by multiple spell-checkers in different threads).
|
* shared by multiple spell-checkers in different threads).
|
||||||
*/
|
*/
|
||||||
public class Hunspell {
|
public class Hunspell {
|
||||||
|
static final long SUGGEST_TIME_LIMIT = 250;
|
||||||
|
|
||||||
final Dictionary dictionary;
|
final Dictionary dictionary;
|
||||||
final Stemmer stemmer;
|
final Stemmer stemmer;
|
||||||
|
private final TimeoutPolicy policy;
|
||||||
final Runnable checkCanceled;
|
final Runnable checkCanceled;
|
||||||
|
|
||||||
public Hunspell(Dictionary dictionary) {
|
public Hunspell(Dictionary dictionary) {
|
||||||
this(dictionary, () -> {});
|
this(dictionary, RETURN_PARTIAL_RESULT, () -> {});
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* @param policy a strategy determining what to do when API calls take too much time
|
||||||
* @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
|
* @param checkCanceled an object that's periodically called, allowing to interrupt spell-checking
|
||||||
* or suggestion generation by throwing an exception
|
* or suggestion generation by throwing an exception
|
||||||
*/
|
*/
|
||||||
public Hunspell(Dictionary dictionary, Runnable checkCanceled) {
|
public Hunspell(Dictionary dictionary, TimeoutPolicy policy, Runnable checkCanceled) {
|
||||||
this.dictionary = dictionary;
|
this.dictionary = dictionary;
|
||||||
|
this.policy = policy;
|
||||||
this.checkCanceled = checkCanceled;
|
this.checkCanceled = checkCanceled;
|
||||||
stemmer = new Stemmer(dictionary);
|
stemmer = new Stemmer(dictionary);
|
||||||
}
|
}
|
||||||
|
@ -504,7 +512,16 @@ public class Hunspell {
|
||||||
&& spell(word.substring(breakPos + breakStr.length()));
|
&& spell(word.substring(breakPos + breakStr.length()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> suggest(String word) {
|
/**
|
||||||
|
* @return suggestions for the given misspelled word
|
||||||
|
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
||||||
|
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
||||||
|
*/
|
||||||
|
public List<String> suggest(String word) throws SuggestionTimeoutException {
|
||||||
|
return suggest(word, SUGGEST_TIME_LIMIT);
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
|
||||||
checkCanceled.run();
|
checkCanceled.run();
|
||||||
if (word.length() >= 100) return Collections.emptyList();
|
if (word.length() >= 100) return Collections.emptyList();
|
||||||
|
|
||||||
|
@ -520,16 +537,35 @@ public class Hunspell {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LinkedHashSet<String> suggestions = new LinkedHashSet<>();
|
||||||
|
Runnable checkCanceled =
|
||||||
|
policy == NO_TIMEOUT
|
||||||
|
? this.checkCanceled
|
||||||
|
: checkTimeLimit(word, wordCase, suggestions, timeLimitMs);
|
||||||
|
try {
|
||||||
|
doSuggest(word, wordCase, suggestions, checkCanceled);
|
||||||
|
} catch (SuggestionTimeoutException e) {
|
||||||
|
if (policy == RETURN_PARTIAL_RESULT) {
|
||||||
|
return postprocess(word, wordCase, suggestions);
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
return postprocess(word, wordCase, suggestions);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doSuggest(
|
||||||
|
String word, WordCase wordCase, LinkedHashSet<String> suggestions, Runnable checkCanceled) {
|
||||||
Hunspell suggestionSpeller =
|
Hunspell suggestionSpeller =
|
||||||
new Hunspell(dictionary, checkCanceled) {
|
new Hunspell(dictionary, policy, checkCanceled) {
|
||||||
@Override
|
@Override
|
||||||
boolean acceptsStem(int formID) {
|
boolean acceptsStem(int formID) {
|
||||||
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
||||||
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller);
|
ModifyingSuggester modifier = new ModifyingSuggester(suggestionSpeller, suggestions);
|
||||||
Set<String> suggestions = modifier.suggest(word, wordCase);
|
modifier.suggest(word, wordCase);
|
||||||
|
|
||||||
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
if (!modifier.hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
||||||
suggestions.addAll(
|
suggestions.addAll(
|
||||||
|
@ -540,7 +576,35 @@ public class Hunspell {
|
||||||
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
|
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.contains("-"))) {
|
||||||
suggestions.addAll(modifyChunksBetweenDashes(word));
|
suggestions.addAll(modifyChunksBetweenDashes(word));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Runnable checkTimeLimit(
|
||||||
|
String word, WordCase wordCase, Set<String> suggestions, long timeLimitMs) {
|
||||||
|
return new Runnable() {
|
||||||
|
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
|
||||||
|
int invocationCounter = 100;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
checkCanceled.run();
|
||||||
|
if (--invocationCounter <= 0) {
|
||||||
|
if (System.nanoTime() - deadline > 0) {
|
||||||
|
stop();
|
||||||
|
}
|
||||||
|
invocationCounter = 100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void stop() {
|
||||||
|
List<String> partialResult =
|
||||||
|
policy == RETURN_PARTIAL_RESULT ? null : postprocess(word, wordCase, suggestions);
|
||||||
|
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
|
||||||
|
throw new SuggestionTimeoutException(message, partialResult);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> postprocess(String word, WordCase wordCase, Collection<String> suggestions) {
|
||||||
Set<String> result = new LinkedHashSet<>();
|
Set<String> result = new LinkedHashSet<>();
|
||||||
for (String candidate : suggestions) {
|
for (String candidate : suggestions) {
|
||||||
result.add(adjustSuggestionCase(candidate, wordCase, word));
|
result.add(adjustSuggestionCase(candidate, wordCase, word));
|
||||||
|
|
|
@ -26,17 +26,18 @@ import java.util.stream.Collectors;
|
||||||
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
|
/** A class that modifies the given misspelled word in various ways to get correct suggestions */
|
||||||
class ModifyingSuggester {
|
class ModifyingSuggester {
|
||||||
private static final int MAX_CHAR_DISTANCE = 4;
|
private static final int MAX_CHAR_DISTANCE = 4;
|
||||||
private final LinkedHashSet<String> result = new LinkedHashSet<>();
|
private final LinkedHashSet<String> result;
|
||||||
private final char[] tryChars;
|
private final char[] tryChars;
|
||||||
private final Hunspell speller;
|
private final Hunspell speller;
|
||||||
boolean hasGoodSuggestions;
|
boolean hasGoodSuggestions;
|
||||||
|
|
||||||
ModifyingSuggester(Hunspell speller) {
|
ModifyingSuggester(Hunspell speller, LinkedHashSet<String> result) {
|
||||||
this.speller = speller;
|
this.speller = speller;
|
||||||
tryChars = speller.dictionary.tryChars.toCharArray();
|
tryChars = speller.dictionary.tryChars.toCharArray();
|
||||||
|
this.result = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
LinkedHashSet<String> suggest(String word, WordCase wordCase) {
|
void suggest(String word, WordCase wordCase) {
|
||||||
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
|
String low = wordCase != WordCase.LOWER ? speller.dictionary.toLowerCase(word) : word;
|
||||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
|
if (wordCase == WordCase.UPPER || wordCase == WordCase.MIXED) {
|
||||||
trySuggestion(low);
|
trySuggestion(low);
|
||||||
|
@ -68,12 +69,11 @@ class ModifyingSuggester {
|
||||||
tryVariationsOf(speller.dictionary.toTitleCase(low));
|
tryVariationsOf(speller.dictionary.toTitleCase(low));
|
||||||
}
|
}
|
||||||
|
|
||||||
return result.stream()
|
List<String> adjusted =
|
||||||
.map(s -> capitalizeAfterSpace(low, s))
|
result.stream().map(s -> capitalizeAfterSpace(low, s)).collect(Collectors.toList());
|
||||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
result.clear();
|
||||||
|
result.addAll(adjusted);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// aNew -> "a New" (instead of "a new")
|
// aNew -> "a New" (instead of "a new")
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Signals that a {@link Hunspell#suggest} call ran past its time limit. It reaches the caller when
 * {@link TimeoutPolicy#THROW_EXCEPTION} is used.
 */
public class SuggestionTimeoutException extends RuntimeException {
  private final List<String> partialResult;

  /**
   * @param message the detail message
   * @param partialResult the suggestions found before the timeout, or {@code null} if there are
   *     none to report; a non-null list is exposed through a read-only view
   */
  SuggestionTimeoutException(String message, List<String> partialResult) {
    super(message);
    if (partialResult == null) {
      this.partialResult = null;
    } else {
      this.partialResult = Collections.unmodifiableList(partialResult);
    }
  }

  /**
   * @return the partial result calculated by {@link Hunspell#suggest} before the time limit was
   *     exceeded, as an unmodifiable list; {@code null} if none was supplied on construction
   */
  public List<String> getPartialResult() {
    return partialResult;
  }
}
|
|
@ -0,0 +1,33 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
/** The available reactions to Hunspell API calls that take too much time. */
public enum TimeoutPolicy {

  /** No time limit: the computation runs to completion, however long that takes. */
  NO_TIMEOUT,

  /** The calculation is stopped and whatever has been computed so far is returned. */
  RETURN_PARTIAL_RESULT,

  /**
   * An exception (e.g. {@link SuggestionTimeoutException}) is thrown, making it clear to the
   * caller that the timeout happened and that the results might be neither reliable nor
   * reproducible.
   */
  THROW_EXCEPTION
}
|
|
@ -1,45 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.analysis.hunspell;
|
|
||||||
|
|
||||||
import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
|
|
||||||
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.concurrent.CancellationException;
|
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
|
||||||
|
|
||||||
public class HunspellTest extends LuceneTestCase {
|
|
||||||
public void testCheckCanceled() throws Exception {
|
|
||||||
AtomicBoolean canceled = new AtomicBoolean();
|
|
||||||
Runnable checkCanceled =
|
|
||||||
() -> {
|
|
||||||
if (canceled.get()) {
|
|
||||||
throw new CancellationException();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Hunspell hunspell =
|
|
||||||
new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"), checkCanceled);
|
|
||||||
|
|
||||||
assertTrue(hunspell.spell("apache"));
|
|
||||||
assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
|
|
||||||
|
|
||||||
canceled.set(true);
|
|
||||||
assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
|
|
||||||
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.hunspell.StemmerTestBase.loadDictionary;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.RETURN_PARTIAL_RESULT;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.THROW_EXCEPTION;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.concurrent.CancellationException;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class TestHunspell extends LuceneTestCase {
|
||||||
|
public void testCheckCanceled() throws Exception {
|
||||||
|
AtomicBoolean canceled = new AtomicBoolean();
|
||||||
|
Runnable checkCanceled =
|
||||||
|
() -> {
|
||||||
|
if (canceled.get()) {
|
||||||
|
throw new CancellationException();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
|
||||||
|
Hunspell hunspell = new Hunspell(dictionary, NO_TIMEOUT, checkCanceled);
|
||||||
|
|
||||||
|
assertTrue(hunspell.spell("apache"));
|
||||||
|
assertEquals(Collections.singletonList("apach"), hunspell.suggest("apac"));
|
||||||
|
|
||||||
|
canceled.set(true);
|
||||||
|
assertThrows(CancellationException.class, () -> hunspell.spell("apache"));
|
||||||
|
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSuggestionTimeLimit() throws IOException, ParseException {
|
||||||
|
int timeLimitMs = 10;
|
||||||
|
|
||||||
|
Dictionary dictionary = loadDictionary(false, "timelimit.aff", "simple.dic");
|
||||||
|
String word = "qazwsxedcrfvtgbyhnujm";
|
||||||
|
|
||||||
|
Hunspell incomplete = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, () -> {});
|
||||||
|
assertFalse(incomplete.spell(word));
|
||||||
|
assertEquals(Collections.emptyList(), incomplete.suggest(word, timeLimitMs));
|
||||||
|
|
||||||
|
Hunspell throwing = new Hunspell(dictionary, THROW_EXCEPTION, () -> {});
|
||||||
|
assertFalse(throwing.spell(word));
|
||||||
|
|
||||||
|
SuggestionTimeoutException exception =
|
||||||
|
assertThrows(SuggestionTimeoutException.class, () -> throwing.suggest(word, timeLimitMs));
|
||||||
|
assertEquals(Collections.emptyList(), exception.getPartialResult());
|
||||||
|
|
||||||
|
Hunspell noLimit = new Hunspell(dictionary, NO_TIMEOUT, () -> {});
|
||||||
|
assertEquals(Collections.emptyList(), noLimit.suggest(word.substring(0, 5), timeLimitMs));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testStemmingApi() throws Exception {
|
||||||
|
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
|
||||||
|
Hunspell hunspell = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
|
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
|
||||||
|
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
|
||||||
|
}
|
||||||
|
}
|
|
@ -28,6 +28,7 @@ import java.nio.file.Paths;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -81,7 +82,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void fr_suggest() throws Exception {
|
public void fr_suggest() throws Exception {
|
||||||
checkSuggestionPerformance("fr", 10);
|
checkSuggestionPerformance("fr", 100);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Dictionary loadDictionary(String code) throws IOException, ParseException {
|
private Dictionary loadDictionary(String code) throws IOException, ParseException {
|
||||||
|
@ -97,7 +98,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
List<String> words = loadWords(code, wordCount, dictionary);
|
List<String> words = loadWords(code, wordCount, dictionary);
|
||||||
|
|
||||||
Stemmer stemmer = new Stemmer(dictionary);
|
Stemmer stemmer = new Stemmer(dictionary);
|
||||||
Hunspell speller = new Hunspell(dictionary);
|
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
measure(
|
measure(
|
||||||
"Stemming " + code,
|
"Stemming " + code,
|
||||||
blackHole -> {
|
blackHole -> {
|
||||||
|
@ -117,10 +118,10 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
|
|
||||||
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
||||||
Dictionary dictionary = loadDictionary(code);
|
Dictionary dictionary = loadDictionary(code);
|
||||||
Hunspell speller = new Hunspell(dictionary);
|
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
||||||
List<String> words =
|
List<String> words =
|
||||||
loadWords(code, wordCount, dictionary).stream()
|
loadWords(code, wordCount, dictionary).stream()
|
||||||
.filter(w -> !speller.spell(w))
|
.filter(w -> hasQuickSuggestions(speller, w))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
System.out.println("Checking " + words.size() + " misspelled words");
|
System.out.println("Checking " + words.size() + " misspelled words");
|
||||||
|
|
||||||
|
@ -134,6 +135,25 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
System.out.println();
|
System.out.println();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean hasQuickSuggestions(Hunspell speller, String word) {
|
||||||
|
if (speller.spell(word)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
long start = System.nanoTime();
|
||||||
|
try {
|
||||||
|
speller.suggest(word);
|
||||||
|
} catch (SuggestionTimeoutException e) {
|
||||||
|
System.out.println("Timeout happened for " + word + ", skipping");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
long elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
|
||||||
|
if (elapsed > Hunspell.SUGGEST_TIME_LIMIT * 4 / 5) {
|
||||||
|
System.out.println(elapsed + "ms for " + word + ", too close to time limit, skipping");
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
private Path findAffFile(String code) throws IOException {
|
private Path findAffFile(String code) throws IOException {
|
||||||
return TestAllDictionaries.findAllAffixFiles()
|
return TestAllDictionaries.findAllAffixFiles()
|
||||||
.filter(
|
.filter(
|
||||||
|
|
|
@ -25,8 +25,9 @@ import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
public class TestSpellChecking extends StemmerTestBase {
|
public class TestSpellChecking extends LuceneTestCase {
|
||||||
|
|
||||||
public void testBase() throws Exception {
|
public void testBase() throws Exception {
|
||||||
doTest("base");
|
doTest("base");
|
||||||
|
@ -233,7 +234,7 @@ public class TestSpellChecking extends StemmerTestBase {
|
||||||
try {
|
try {
|
||||||
Dictionary dictionary =
|
Dictionary dictionary =
|
||||||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
||||||
speller = new Hunspell(dictionary);
|
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeWhileHandlingException(affixStream);
|
IOUtils.closeWhileHandlingException(affixStream);
|
||||||
IOUtils.closeWhileHandlingException(dictStream);
|
IOUtils.closeWhileHandlingException(dictStream);
|
||||||
|
|
|
@ -16,11 +16,7 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.text.ParseException;
|
|
||||||
import java.util.Collections;
|
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
import org.junit.Test;
|
|
||||||
|
|
||||||
public class TestStemmer extends StemmerTestBase {
|
public class TestStemmer extends StemmerTestBase {
|
||||||
|
|
||||||
|
@ -62,13 +58,6 @@ public class TestStemmer extends StemmerTestBase {
|
||||||
assertStemsTo("solr", "olr");
|
assertStemsTo("solr", "olr");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testHunspellStemmingApi() throws IOException, ParseException {
|
|
||||||
Hunspell hunspell = new Hunspell(loadDictionary(false, "simple.aff", "simple.dic"));
|
|
||||||
assertEquals(Collections.singletonList("apach"), hunspell.getRoots("apache"));
|
|
||||||
assertEquals(Collections.singletonList("foo"), hunspell.getRoots("foo"));
|
|
||||||
}
|
|
||||||
|
|
||||||
// some bogus stuff that should not stem (empty lists)!
|
// some bogus stuff that should not stem (empty lists)!
|
||||||
public void testBogusStems() {
|
public void testBogusStems() {
|
||||||
assertStemsTo("abs");
|
assertStemsTo("abs");
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
# A very expensive MAP: each character from each group has to be replaced with every other character from the same group,
|
||||||
|
# so for long words the enumeration is very long
|
||||||
|
|
||||||
|
MAP 3
|
||||||
|
MAP qwertyuiop
|
||||||
|
MAP asdfghjkl
|
||||||
|
MAP zxcvbnm
|
Loading…
Reference in New Issue