mirror of https://github.com/apache/lucene.git
hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in) (#11893)
hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in)
This commit is contained in:
parent
c66a559050
commit
f7417d5961
|
@ -63,7 +63,7 @@ Improvements
|
||||||
Optimizations
|
Optimizations
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
|
* GITHUB#11857, GITHUB#11859, GITHUB#11893: Hunspell: improved suggestion performance
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -28,7 +28,11 @@ import java.util.Objects;
|
||||||
import java.util.PriorityQueue;
|
import java.util.PriorityQueue;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
import java.util.function.IntPredicate;
|
||||||
|
import java.util.function.Supplier;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
|
||||||
|
@ -43,10 +47,12 @@ class GeneratingSuggester {
|
||||||
private static final int MAX_ROOT_LENGTH_DIFF = 4;
|
private static final int MAX_ROOT_LENGTH_DIFF = 4;
|
||||||
private final Dictionary dictionary;
|
private final Dictionary dictionary;
|
||||||
private final Hunspell speller;
|
private final Hunspell speller;
|
||||||
|
private final SuggestibleEntryCache entryCache;
|
||||||
|
|
||||||
GeneratingSuggester(Hunspell speller) {
|
GeneratingSuggester(Hunspell speller, SuggestibleEntryCache entryCache) {
|
||||||
this.dictionary = speller.dictionary;
|
this.dictionary = speller.dictionary;
|
||||||
this.speller = speller;
|
this.speller = speller;
|
||||||
|
this.entryCache = entryCache;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<String> suggest(String word, WordCase originalCase, Set<Suggestion> prevSuggestions) {
|
List<String> suggest(String word, WordCase originalCase, Set<Suggestion> prevSuggestions) {
|
||||||
|
@ -60,7 +66,11 @@ class GeneratingSuggester {
|
||||||
String word, WordCase originalCase) {
|
String word, WordCase originalCase) {
|
||||||
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
|
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
|
||||||
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
|
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
|
||||||
EntryFilter filter = new EntryFilter(dictionary);
|
|
||||||
|
char[] excludeFlags = dictionary.allNonSuggestibleFlags();
|
||||||
|
FlagEnumerator.Lookup flagLookup = dictionary.flagLookup;
|
||||||
|
IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);
|
||||||
|
|
||||||
boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
|
boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
|
||||||
TrigramAutomaton automaton =
|
TrigramAutomaton automaton =
|
||||||
new TrigramAutomaton(word) {
|
new TrigramAutomaton(word) {
|
||||||
|
@ -70,10 +80,10 @@ class GeneratingSuggester {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
dictionary.words.processSuggestibleWords(
|
processSuggestibleWords(
|
||||||
Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
|
Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
|
||||||
word.length() + MAX_ROOT_LENGTH_DIFF,
|
word.length() + MAX_ROOT_LENGTH_DIFF,
|
||||||
(rootChars, forms) -> {
|
(rootChars, formSupplier) -> {
|
||||||
if (ignoreTitleCaseRoots
|
if (ignoreTitleCaseRoots
|
||||||
&& Character.isUpperCase(rootChars.charAt(0))
|
&& Character.isUpperCase(rootChars.charAt(0))
|
||||||
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
|
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
|
||||||
|
@ -87,44 +97,34 @@ class GeneratingSuggester {
|
||||||
|
|
||||||
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
||||||
|
|
||||||
if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
|
boolean overflow = roots.size() == MAX_ROOTS;
|
||||||
|
if (overflow && sc <= roots.peek().score) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
speller.checkCanceled.run();
|
speller.checkCanceled.run();
|
||||||
|
|
||||||
String root = rootChars.toString();
|
String root = rootChars.toString();
|
||||||
int suitable = filter.findSuitableFormIndex(forms, 0);
|
IntsRef forms = formSupplier.get();
|
||||||
do {
|
for (int i = 0; i < forms.length; i++) {
|
||||||
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
|
if (isSuggestible.test(forms.ints[forms.offset + i])) {
|
||||||
suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
|
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
|
||||||
} while (suitable > 0);
|
if (overflow) {
|
||||||
while (roots.size() > MAX_ROOTS) {
|
roots.poll();
|
||||||
roots.poll();
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
return roots.stream().sorted().collect(Collectors.toList());
|
return roots.stream().sorted().collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class EntryFilter {
|
private void processSuggestibleWords(
|
||||||
private final int formStep;
|
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||||
private final FlagEnumerator.Lookup flagLookup;
|
if (entryCache != null) {
|
||||||
private final char[] excludeFlags;
|
entryCache.processSuggestibleWords(minLength, maxLength, processor);
|
||||||
|
} else {
|
||||||
EntryFilter(Dictionary dic) {
|
dictionary.words.processSuggestibleWords(minLength, maxLength, processor);
|
||||||
formStep = dic.formStep();
|
|
||||||
flagLookup = dic.flagLookup;
|
|
||||||
excludeFlags = dic.allNonSuggestibleFlags();
|
|
||||||
}
|
|
||||||
|
|
||||||
int findSuitableFormIndex(IntsRef forms, int start) {
|
|
||||||
for (int i = start; i < forms.length; i += formStep) {
|
|
||||||
if (!flagLookup.hasAnyFlag(forms.ints[forms.offset + i], excludeFlags)) {
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -26,16 +26,8 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.LinkedHashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
@ -56,7 +48,6 @@ import org.apache.lucene.util.IntsRef;
|
||||||
*/
|
*/
|
||||||
public class Hunspell {
|
public class Hunspell {
|
||||||
static final long SUGGEST_TIME_LIMIT = 250;
|
static final long SUGGEST_TIME_LIMIT = 250;
|
||||||
|
|
||||||
final Dictionary dictionary;
|
final Dictionary dictionary;
|
||||||
final Stemmer stemmer;
|
final Stemmer stemmer;
|
||||||
private final TimeoutPolicy policy;
|
private final TimeoutPolicy policy;
|
||||||
|
@ -75,7 +66,7 @@ public class Hunspell {
|
||||||
this.dictionary = dictionary;
|
this.dictionary = dictionary;
|
||||||
this.policy = policy;
|
this.policy = policy;
|
||||||
this.checkCanceled = checkCanceled;
|
this.checkCanceled = checkCanceled;
|
||||||
stemmer = new Stemmer(dictionary);
|
this.stemmer = new Stemmer(dictionary);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -568,6 +559,7 @@ public class Hunspell {
|
||||||
* @return suggestions for the given misspelled word
|
* @return suggestions for the given misspelled word
|
||||||
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
||||||
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
||||||
|
* @see Suggester for finer-grained APIs and performance optimizations
|
||||||
*/
|
*/
|
||||||
public List<String> suggest(String word) throws SuggestionTimeoutException {
|
public List<String> suggest(String word) throws SuggestionTimeoutException {
|
||||||
return suggest(word, SUGGEST_TIME_LIMIT);
|
return suggest(word, SUGGEST_TIME_LIMIT);
|
||||||
|
@ -579,140 +571,19 @@ public class Hunspell {
|
||||||
* TimeoutPolicy}'s effects (exception or partial result) may kick in
|
* TimeoutPolicy}'s effects (exception or partial result) may kick in
|
||||||
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
||||||
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
||||||
|
* @see Suggester for finer-grained APIs and performance optimizations
|
||||||
*/
|
*/
|
||||||
public List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
|
public List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
|
||||||
checkCanceled.run();
|
Suggester suggester = new Suggester(dictionary);
|
||||||
if (word.length() >= 100) return Collections.emptyList();
|
if (policy == NO_TIMEOUT) return suggester.suggestNoTimeout(word, checkCanceled);
|
||||||
|
|
||||||
if (dictionary.needsInputCleaning(word)) {
|
|
||||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
WordCase wordCase = WordCase.caseOf(word);
|
|
||||||
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
|
|
||||||
String title = dictionary.toTitleCase(word);
|
|
||||||
if (spell(title)) {
|
|
||||||
return Collections.singletonList(title);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
|
|
||||||
Runnable checkCanceled =
|
|
||||||
policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, suggestions, timeLimitMs);
|
|
||||||
try {
|
try {
|
||||||
doSuggest(word, wordCase, suggestions, checkCanceled);
|
return suggester.suggestWithTimeout(word, timeLimitMs, checkCanceled);
|
||||||
} catch (SuggestionTimeoutException e) {
|
} catch (SuggestionTimeoutException e) {
|
||||||
if (policy != RETURN_PARTIAL_RESULT) {
|
if (policy == RETURN_PARTIAL_RESULT) {
|
||||||
throw e;
|
return e.getPartialResult();
|
||||||
}
|
}
|
||||||
|
throw e;
|
||||||
}
|
}
|
||||||
|
|
||||||
return postprocess(suggestions);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void doSuggest(
|
|
||||||
String word,
|
|
||||||
WordCase wordCase,
|
|
||||||
LinkedHashSet<Suggestion> suggestions,
|
|
||||||
Runnable checkCanceled) {
|
|
||||||
Hunspell suggestionSpeller =
|
|
||||||
new Hunspell(dictionary, policy, checkCanceled) {
|
|
||||||
// Cache for expensive "findStem" requests issued when trying to split a compound word.
|
|
||||||
// The suggestion algorithm issues many of them, often with the same text.
|
|
||||||
// The cache can be large, but will be GC-ed after the "suggest" call.
|
|
||||||
final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
boolean acceptsStem(int formID) {
|
|
||||||
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
|
||||||
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
Root<CharsRef> findStem(
|
|
||||||
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
|
||||||
if (context == COMPOUND_BEGIN && originalCase == null) {
|
|
||||||
return compoundCache
|
|
||||||
.computeIfAbsent(
|
|
||||||
new String(chars, offset, length),
|
|
||||||
__ ->
|
|
||||||
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
|
|
||||||
.orElse(null);
|
|
||||||
}
|
|
||||||
return super.findStem(chars, offset, length, originalCase, context);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
boolean hasGoodSuggestions =
|
|
||||||
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
|
|
||||||
|
|
||||||
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
|
||||||
List<String> generated =
|
|
||||||
new GeneratingSuggester(suggestionSpeller)
|
|
||||||
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
|
|
||||||
for (String raw : generated) {
|
|
||||||
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
|
|
||||||
for (String raw : modifyChunksBetweenDashes(word)) {
|
|
||||||
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Runnable checkTimeLimit(String word, Set<Suggestion> suggestions, long timeLimitMs) {
|
|
||||||
return new Runnable() {
|
|
||||||
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
|
|
||||||
int invocationCounter = 100;
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
checkCanceled.run();
|
|
||||||
if (--invocationCounter <= 0) {
|
|
||||||
if (System.nanoTime() - deadline > 0) {
|
|
||||||
stop();
|
|
||||||
}
|
|
||||||
invocationCounter = 100;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void stop() {
|
|
||||||
List<String> partialResult =
|
|
||||||
policy == RETURN_PARTIAL_RESULT ? null : postprocess(suggestions);
|
|
||||||
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
|
|
||||||
throw new SuggestionTimeoutException(message, partialResult);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> postprocess(Collection<Suggestion> suggestions) {
|
|
||||||
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<String> modifyChunksBetweenDashes(String word) {
|
|
||||||
List<String> result = new ArrayList<>();
|
|
||||||
int chunkStart = 0;
|
|
||||||
while (chunkStart < word.length()) {
|
|
||||||
int chunkEnd = word.indexOf('-', chunkStart);
|
|
||||||
if (chunkEnd < 0) {
|
|
||||||
chunkEnd = word.length();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (chunkEnd > chunkStart) {
|
|
||||||
String chunk = word.substring(chunkStart, chunkEnd);
|
|
||||||
if (!spell(chunk)) {
|
|
||||||
for (String chunkSug : suggest(chunk)) {
|
|
||||||
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
|
|
||||||
if (spell(replaced)) {
|
|
||||||
result.add(replaced);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
chunkStart = chunkEnd + 1;
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,237 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
|
||||||
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A generator for misspelled word corrections based on Hunspell flags. The suggestions are searched
|
||||||
|
* for in two main ways:
|
||||||
|
*
|
||||||
|
* <ol>
|
||||||
|
* <li>Modification: trying to insert/remove/delete/swap parts of the word to get something
|
||||||
|
* acceptable. The performance of this part depends heavily on the contents of TRY, MAP, REP,
|
||||||
|
* KEY directives in the .aff file.
|
||||||
|
* <li>Enumeration: if the modification hasn't produced "good enough" suggestions, the whole
|
||||||
|
* dictionary is scanned and simple affixes are added onto the entries to check if that
|
||||||
|
* produces anything similar to the given misspelled word. This depends on the dictionary size
|
||||||
|
* and the affix count, and it can take noticeable amount of time. To speed this up, {@link
|
||||||
|
* #withSuggestibleEntryCache()} can be used.
|
||||||
|
* </ol>
|
||||||
|
*/
|
||||||
|
public class Suggester {
|
||||||
|
private final Dictionary dictionary;
|
||||||
|
private final SuggestibleEntryCache suggestibleCache;
|
||||||
|
|
||||||
|
public Suggester(Dictionary dictionary) {
|
||||||
|
this(dictionary, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Suggester(Dictionary dictionary, SuggestibleEntryCache suggestibleCache) {
|
||||||
|
this.dictionary = dictionary;
|
||||||
|
this.suggestibleCache = suggestibleCache;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a copy of this suggester instance with better "Enumeration" phase performance (see
|
||||||
|
* {@link Suggester} documentation), but using more memory. With this option, the dictionary
|
||||||
|
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
|
||||||
|
*/
|
||||||
|
public Suggester withSuggestibleEntryCache() {
|
||||||
|
return new Suggester(dictionary, SuggestibleEntryCache.buildCache(dictionary.words));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute suggestions for the given misspelled word
|
||||||
|
*
|
||||||
|
* @param word the misspelled word to calculate suggestions for
|
||||||
|
* @param checkCanceled an object that's periodically called, allowing to interrupt or suggestion
|
||||||
|
* generation by throwing an exception
|
||||||
|
*/
|
||||||
|
public List<String> suggestNoTimeout(String word, Runnable checkCanceled) {
|
||||||
|
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
|
||||||
|
return suggest(word, suggestions, handleCustomTimeoutException(checkCanceled, suggestions));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Runnable handleCustomTimeoutException(
|
||||||
|
Runnable checkCanceled, LinkedHashSet<Suggestion> suggestions) {
|
||||||
|
return () -> {
|
||||||
|
try {
|
||||||
|
checkCanceled.run();
|
||||||
|
} catch (SuggestionTimeoutException e) {
|
||||||
|
if (e.getPartialResult() != null) {
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new SuggestionTimeoutException(e.getMessage(), postprocess(suggestions));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param word the misspelled word to calculate suggestions for
|
||||||
|
* @param timeLimitMs the duration limit in milliseconds after which the computation is interruped
|
||||||
|
* by an exception
|
||||||
|
* @param checkCanceled an object that's periodically called, allowing to interrupt or suggestion
|
||||||
|
* generation by throwing an exception
|
||||||
|
* @throws SuggestionTimeoutException if the computation takes too long. Use {@link
|
||||||
|
* SuggestionTimeoutException#getPartialResult()} to get the suggestions computed up to that
|
||||||
|
* point
|
||||||
|
*/
|
||||||
|
public List<String> suggestWithTimeout(String word, long timeLimitMs, Runnable checkCanceled)
|
||||||
|
throws SuggestionTimeoutException {
|
||||||
|
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
|
||||||
|
Runnable checkTime = checkTimeLimit(word, suggestions, timeLimitMs, checkCanceled);
|
||||||
|
return suggest(word, suggestions, handleCustomTimeoutException(checkTime, suggestions));
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> suggest(
|
||||||
|
String word, LinkedHashSet<Suggestion> suggestions, Runnable checkCanceled)
|
||||||
|
throws SuggestionTimeoutException {
|
||||||
|
checkCanceled.run();
|
||||||
|
if (word.length() >= 100) return Collections.emptyList();
|
||||||
|
|
||||||
|
if (dictionary.needsInputCleaning(word)) {
|
||||||
|
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
Hunspell suggestionSpeller =
|
||||||
|
new Hunspell(dictionary, NO_TIMEOUT, checkCanceled) {
|
||||||
|
// Cache for expensive "findStem" requests issued when trying to split a compound word.
|
||||||
|
// The suggestion algorithm issues many of them, often with the same text.
|
||||||
|
// The cache can be large, but will be GC-ed after the "suggest" call.
|
||||||
|
final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
boolean acceptsStem(int formID) {
|
||||||
|
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
||||||
|
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
Root<CharsRef> findStem(
|
||||||
|
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||||
|
if (context == COMPOUND_BEGIN && originalCase == null) {
|
||||||
|
return compoundCache
|
||||||
|
.computeIfAbsent(
|
||||||
|
new String(chars, offset, length),
|
||||||
|
__ ->
|
||||||
|
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
|
||||||
|
.orElse(null);
|
||||||
|
}
|
||||||
|
return super.findStem(chars, offset, length, originalCase, context);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
WordCase wordCase = WordCase.caseOf(word);
|
||||||
|
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
|
||||||
|
String title = dictionary.toTitleCase(word);
|
||||||
|
if (suggestionSpeller.spell(title)) {
|
||||||
|
return Collections.singletonList(title);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean hasGoodSuggestions =
|
||||||
|
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
|
||||||
|
|
||||||
|
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
||||||
|
List<String> generated =
|
||||||
|
new GeneratingSuggester(suggestionSpeller, suggestibleCache)
|
||||||
|
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
|
||||||
|
for (String raw : generated) {
|
||||||
|
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
|
||||||
|
for (String raw : modifyChunksBetweenDashes(word, suggestionSpeller, checkCanceled)) {
|
||||||
|
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return postprocess(suggestions);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Runnable checkTimeLimit(
|
||||||
|
String word, Set<Suggestion> suggestions, long timeLimitMs, Runnable checkCanceled) {
|
||||||
|
return new Runnable() {
|
||||||
|
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
|
||||||
|
int invocationCounter = 100;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
checkCanceled.run();
|
||||||
|
if (--invocationCounter <= 0) {
|
||||||
|
if (System.nanoTime() - deadline > 0) {
|
||||||
|
stop();
|
||||||
|
}
|
||||||
|
invocationCounter = 100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void stop() {
|
||||||
|
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
|
||||||
|
throw new SuggestionTimeoutException(message, postprocess(suggestions));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> postprocess(Collection<Suggestion> suggestions) {
|
||||||
|
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> modifyChunksBetweenDashes(
|
||||||
|
String word, Hunspell speller, Runnable checkCanceled) {
|
||||||
|
List<String> result = new ArrayList<>();
|
||||||
|
int chunkStart = 0;
|
||||||
|
while (chunkStart < word.length()) {
|
||||||
|
int chunkEnd = word.indexOf('-', chunkStart);
|
||||||
|
if (chunkEnd < 0) {
|
||||||
|
chunkEnd = word.length();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (chunkEnd > chunkStart) {
|
||||||
|
String chunk = word.substring(chunkStart, chunkEnd);
|
||||||
|
if (!speller.spell(chunk)) {
|
||||||
|
for (String chunkSug : suggestNoTimeout(chunk, checkCanceled)) {
|
||||||
|
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
|
||||||
|
if (speller.spell(replaced)) {
|
||||||
|
result.add(replaced);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
chunkStart = chunkEnd + 1;
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,101 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.util.function.BiConsumer;
|
||||||
|
import java.util.function.Supplier;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A cache allowing for CPU-cache-friendlier iteration over {@link WordStorage} entries that can be
|
||||||
|
* used for suggestions. The words and the form data are stored in plain contiguous arrays with no
|
||||||
|
* compression.
|
||||||
|
*/
|
||||||
|
class SuggestibleEntryCache {
|
||||||
|
private final short[] lengths;
|
||||||
|
private final char[] roots;
|
||||||
|
private final int[] formData;
|
||||||
|
|
||||||
|
private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
|
||||||
|
this.lengths = lengths;
|
||||||
|
this.roots = roots;
|
||||||
|
this.formData = formData;
|
||||||
|
}
|
||||||
|
|
||||||
|
static SuggestibleEntryCache buildCache(WordStorage storage) {
|
||||||
|
var consumer =
|
||||||
|
new BiConsumer<CharsRef, Supplier<IntsRef>>() {
|
||||||
|
short[] lengths = new short[10];
|
||||||
|
final StringBuilder roots = new StringBuilder();
|
||||||
|
int[] formData = new int[10];
|
||||||
|
int lenOffset = 0;
|
||||||
|
int formDataOffset = 0;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
|
||||||
|
if (root.length > Short.MAX_VALUE) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"Too long dictionary entry, please report this to dev@lucene.apache.org");
|
||||||
|
}
|
||||||
|
|
||||||
|
IntsRef forms = formSupplier.get();
|
||||||
|
|
||||||
|
lengths = ArrayUtil.grow(lengths, lenOffset + 2);
|
||||||
|
lengths[lenOffset] = (short) root.length;
|
||||||
|
lengths[lenOffset + 1] = (short) forms.length;
|
||||||
|
lenOffset += 2;
|
||||||
|
|
||||||
|
roots.append(root.chars, root.offset, root.length);
|
||||||
|
|
||||||
|
formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
|
||||||
|
System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
|
||||||
|
formDataOffset += forms.length;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);
|
||||||
|
|
||||||
|
return new SuggestibleEntryCache(
|
||||||
|
ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
|
||||||
|
consumer.roots.toString().toCharArray(),
|
||||||
|
ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
|
||||||
|
}
|
||||||
|
|
||||||
|
void processSuggestibleWords(
|
||||||
|
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||||
|
CharsRef chars = new CharsRef(roots, 0, 0);
|
||||||
|
IntsRef forms = new IntsRef(formData, 0, 0);
|
||||||
|
Supplier<IntsRef> formSupplier = () -> forms;
|
||||||
|
int rootOffset = 0;
|
||||||
|
int formDataOffset = 0;
|
||||||
|
for (int i = 0; i < lengths.length; i += 2) {
|
||||||
|
int rootLength = lengths[i];
|
||||||
|
short formDataLength = lengths[i + 1];
|
||||||
|
if (rootLength >= minLength && rootLength <= maxLength) {
|
||||||
|
chars.offset = rootOffset;
|
||||||
|
chars.length = rootLength;
|
||||||
|
forms.offset = formDataOffset;
|
||||||
|
forms.length = formDataLength;
|
||||||
|
processor.accept(chars, formSupplier);
|
||||||
|
}
|
||||||
|
rootOffset += rootLength;
|
||||||
|
formDataOffset += formDataLength;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.function.BiConsumer;
|
import java.util.function.BiConsumer;
|
||||||
|
import java.util.function.Supplier;
|
||||||
import org.apache.lucene.store.ByteArrayDataInput;
|
import org.apache.lucene.store.ByteArrayDataInput;
|
||||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||||
import org.apache.lucene.store.DataOutput;
|
import org.apache.lucene.store.DataOutput;
|
||||||
|
@ -54,7 +55,8 @@ class WordStorage {
|
||||||
private static final int COLLISION_MASK = 0x40;
|
private static final int COLLISION_MASK = 0x40;
|
||||||
private static final int SUGGESTIBLE_MASK = 0x20;
|
private static final int SUGGESTIBLE_MASK = 0x20;
|
||||||
private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
|
private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
|
||||||
|
private final int maxEntryLength;
|
||||||
|
private final boolean hasCustomMorphData;
|
||||||
/**
|
/**
|
||||||
* A map from word's hash (modulo array's length) into an int containing:
|
* A map from word's hash (modulo array's length) into an int containing:
|
||||||
*
|
*
|
||||||
|
@ -89,7 +91,10 @@ class WordStorage {
|
||||||
*/
|
*/
|
||||||
private final byte[] wordData;
|
private final byte[] wordData;
|
||||||
|
|
||||||
private WordStorage(int[] hashTable, byte[] wordData) {
|
private WordStorage(
|
||||||
|
int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
|
||||||
|
this.maxEntryLength = maxEntryLength;
|
||||||
|
this.hasCustomMorphData = hasCustomMorphData;
|
||||||
this.hashTable = hashTable;
|
this.hashTable = hashTable;
|
||||||
this.wordData = wordData;
|
this.wordData = wordData;
|
||||||
}
|
}
|
||||||
|
@ -153,11 +158,13 @@ class WordStorage {
|
||||||
* can be modified in any way, but may not be saved for later by the processor
|
* can be modified in any way, but may not be saved for later by the processor
|
||||||
*/
|
*/
|
||||||
void processSuggestibleWords(
|
void processSuggestibleWords(
|
||||||
int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
|
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||||
assert minLength <= maxLength;
|
assert minLength <= maxLength;
|
||||||
|
maxLength = Math.min(maxEntryLength, maxLength);
|
||||||
|
|
||||||
CharsRef chars = new CharsRef(maxLength);
|
CharsRef chars = new CharsRef(maxLength);
|
||||||
IntsRef forms = new IntsRef();
|
|
||||||
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
||||||
|
var formSupplier = new LazyFormReader(in);
|
||||||
for (int entryCode : hashTable) {
|
for (int entryCode : hashTable) {
|
||||||
int pos = entryCode & OFFSET_MASK;
|
int pos = entryCode & OFFSET_MASK;
|
||||||
int mask = entryCode >>> OFFSET_BITS;
|
int mask = entryCode >>> OFFSET_BITS;
|
||||||
|
@ -179,11 +186,7 @@ class WordStorage {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mightMatch) {
|
if (mightMatch) {
|
||||||
int dataLength = in.readVInt();
|
formSupplier.dataPos = in.getPosition();
|
||||||
if (forms.ints.length < dataLength) {
|
|
||||||
forms.ints = new int[dataLength];
|
|
||||||
}
|
|
||||||
readForms(forms, in, dataLength);
|
|
||||||
while (prevPos != 0 && wordStart > 0) {
|
while (prevPos != 0 && wordStart > 0) {
|
||||||
in.setPosition(prevPos);
|
in.setPosition(prevPos);
|
||||||
chars.chars[--wordStart] = (char) in.readVInt();
|
chars.chars[--wordStart] = (char) in.readVInt();
|
||||||
|
@ -193,7 +196,7 @@ class WordStorage {
|
||||||
if (prevPos == 0) {
|
if (prevPos == 0) {
|
||||||
chars.offset = wordStart;
|
chars.offset = wordStart;
|
||||||
chars.length = maxLength - wordStart;
|
chars.length = maxLength - wordStart;
|
||||||
processor.accept(chars, forms);
|
processor.accept(chars, formSupplier);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -257,6 +260,7 @@ class WordStorage {
|
||||||
private final ByteArrayDataOutput dataWriter;
|
private final ByteArrayDataOutput dataWriter;
|
||||||
private int commonPrefixLength, commonPrefixPos;
|
private int commonPrefixLength, commonPrefixPos;
|
||||||
private int actualWords;
|
private int actualWords;
|
||||||
|
private int maxEntryLength;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param wordCount an approximate number of the words in the resulting dictionary, used to
|
* @param wordCount an approximate number of the words in the resulting dictionary, used to
|
||||||
|
@ -297,6 +301,8 @@ class WordStorage {
|
||||||
* {@link String#compareTo} rules.
|
* {@link String#compareTo} rules.
|
||||||
*/
|
*/
|
||||||
void add(String entry, char[] flags, int morphDataID) throws IOException {
|
void add(String entry, char[] flags, int morphDataID) throws IOException {
|
||||||
|
maxEntryLength = Math.max(maxEntryLength, entry.length());
|
||||||
|
|
||||||
if (!entry.equals(currentEntry)) {
|
if (!entry.equals(currentEntry)) {
|
||||||
if (currentEntry != null) {
|
if (currentEntry != null) {
|
||||||
if (entry.compareTo(currentEntry) < 0) {
|
if (entry.compareTo(currentEntry) < 0) {
|
||||||
|
@ -411,8 +417,36 @@ class WordStorage {
|
||||||
WordStorage build() throws IOException {
|
WordStorage build() throws IOException {
|
||||||
assert !group.isEmpty() : "build() should be only called once";
|
assert !group.isEmpty() : "build() should be only called once";
|
||||||
flushGroup();
|
flushGroup();
|
||||||
return new WordStorage(
|
byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
|
||||||
hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));
|
return new WordStorage(maxEntryLength, hasCustomMorphData, hashTable, trimmedData);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * A reusable {@link Supplier} that decodes the forms of one entry only when {@link #get()} is
 * called. The caller assigns {@link #dataPos} before handing this object to a processor; each
 * {@code get()} call repositions the shared input there and decodes into the same {@link IntsRef},
 * which is returned every time.
 */
private class LazyFormReader implements Supplier<IntsRef> {
|
||||||
|
// Position in the shared input where the VInt count of the current entry starts.
int dataPos;
|
||||||
|
private final ByteArrayDataInput in;
|
||||||
|
// Result buffer reused between get() calls; its int array is replaced when too small.
private final IntsRef forms;
|
||||||
|
|
||||||
|
LazyFormReader(ByteArrayDataInput in) {
|
||||||
|
this.in = in;
|
||||||
|
forms = new IntsRef();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public IntsRef get() {
|
||||||
|
in.setPosition(dataPos);
|
||||||
|
// With custom morph data two VInts are read per entry below, hence the count is halved.
int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1);
|
||||||
|
if (forms.ints.length < entryCount) {
|
||||||
|
forms.ints = new int[entryCount];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < entryCount; i++) {
|
||||||
|
forms.ints[i] = in.readVInt();
|
||||||
|
if (hasCustomMorphData) {
|
||||||
|
// Read and discard the second VInt of the pair (presumably the morph data id - TODO confirm).
in.readVInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
forms.length = entryCount;
|
||||||
|
return forms;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -149,6 +149,7 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testDictionariesLoadSuccessfully() throws Exception {
|
public void testDictionariesLoadSuccessfully() throws Exception {
|
||||||
|
AtomicLong memoryWithCache = new AtomicLong();
|
||||||
AtomicLong totalMemory = new AtomicLong();
|
AtomicLong totalMemory = new AtomicLong();
|
||||||
AtomicLong totalWords = new AtomicLong();
|
AtomicLong totalWords = new AtomicLong();
|
||||||
int threads = Runtime.getRuntime().availableProcessors();
|
int threads = Runtime.getRuntime().availableProcessors();
|
||||||
|
@ -159,7 +160,16 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
(Path aff) -> {
|
(Path aff) -> {
|
||||||
try {
|
try {
|
||||||
Dictionary dic = loadDictionary(aff);
|
Dictionary dic = loadDictionary(aff);
|
||||||
|
new Hunspell(dic).spell("aaaa");
|
||||||
|
Suggester suggester = new Suggester(dic).withSuggestibleEntryCache();
|
||||||
|
try {
|
||||||
|
suggester.suggestWithTimeout("aaaaaaaaaa", Hunspell.SUGGEST_TIME_LIMIT, () -> {});
|
||||||
|
} catch (
|
||||||
|
@SuppressWarnings("unused")
|
||||||
|
SuggestionTimeoutException e) {
|
||||||
|
}
|
||||||
totalMemory.addAndGet(RamUsageTester.ramUsed(dic));
|
totalMemory.addAndGet(RamUsageTester.ramUsed(dic));
|
||||||
|
memoryWithCache.addAndGet(RamUsageTester.ramUsed(suggester));
|
||||||
totalWords.addAndGet(RamUsageTester.ramUsed(dic.words));
|
totalWords.addAndGet(RamUsageTester.ramUsed(dic.words));
|
||||||
System.out.println(aff + "\t" + memoryUsageSummary(dic));
|
System.out.println(aff + "\t" + memoryUsageSummary(dic));
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
|
@ -195,6 +205,9 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
System.out.println("Total memory: " + RamUsageEstimator.humanReadableUnits(totalMemory.get()));
|
System.out.println("Total memory: " + RamUsageEstimator.humanReadableUnits(totalMemory.get()));
|
||||||
System.out.println(
|
System.out.println(
|
||||||
"Total memory for word storage: " + RamUsageEstimator.humanReadableUnits(totalWords.get()));
|
"Total memory for word storage: " + RamUsageEstimator.humanReadableUnits(totalWords.get()));
|
||||||
|
System.out.println(
|
||||||
|
"Additional memory if withSuggestibleEntryCache is enabled: "
|
||||||
|
+ RamUsageEstimator.humanReadableUnits(memoryWithCache.get() - totalMemory.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String memoryUsageSummary(Dictionary dic) {
|
private static String memoryUsageSummary(Dictionary dic) {
|
||||||
|
|
|
@ -110,6 +110,12 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
Set<String> processed = new HashSet<>();
|
Set<String> processed = new HashSet<>();
|
||||||
dictionary.words.processSuggestibleWords(
|
dictionary.words.processSuggestibleWords(
|
||||||
minLength, maxLength, (word, __) -> processed.add(word.toString()));
|
minLength, maxLength, (word, __) -> processed.add(word.toString()));
|
||||||
|
|
||||||
|
Set<String> cached = new HashSet<>();
|
||||||
|
SuggestibleEntryCache.buildCache(dictionary.words)
|
||||||
|
.processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
|
||||||
|
assertEquals(processed, cached);
|
||||||
|
|
||||||
return processed;
|
return processed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.CancellationException;
|
import java.util.concurrent.CancellationException;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||||
|
@ -54,6 +55,33 @@ public class TestHunspell extends LuceneTestCase {
|
||||||
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
|
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCustomCheckCanceledGivesPartialResult() throws Exception {
|
||||||
|
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
|
||||||
|
|
||||||
|
List<String> expected = List.of("apach");
|
||||||
|
assertEquals(expected, new Hunspell(dictionary, NO_TIMEOUT, () -> {}).suggest("apac"));
|
||||||
|
|
||||||
|
AtomicInteger counter = new AtomicInteger();
|
||||||
|
String msg = "msg";
|
||||||
|
Runnable checkCanceled =
|
||||||
|
() -> {
|
||||||
|
if (counter.incrementAndGet() > 400) {
|
||||||
|
throw new SuggestionTimeoutException(msg, null);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Hunspell hunspell = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, checkCanceled);
|
||||||
|
assertEquals(expected, hunspell.suggest("apac"));
|
||||||
|
|
||||||
|
counter.set(0);
|
||||||
|
var e =
|
||||||
|
assertThrows(
|
||||||
|
SuggestionTimeoutException.class,
|
||||||
|
() -> new Suggester(dictionary).suggestNoTimeout("apac", checkCanceled));
|
||||||
|
assertEquals(expected, e.getPartialResult());
|
||||||
|
assertEquals("msg", e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
public void testSuggestionTimeLimit() throws IOException, ParseException {
|
public void testSuggestionTimeLimit() throws IOException, ParseException {
|
||||||
int timeLimitMs = 10;
|
int timeLimitMs = 10;
|
||||||
|
|
||||||
|
|
|
@ -86,7 +86,7 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void de_suggest() throws Exception {
|
public void de_suggest() throws Exception {
|
||||||
checkSuggestionPerformance("de", 100);
|
checkSuggestionPerformance("de", 150);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -163,33 +163,33 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
|
|
||||||
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
||||||
Dictionary dictionary = loadDictionary(code);
|
Dictionary dictionary = loadDictionary(code);
|
||||||
|
Suggester suggester = new Suggester(dictionary).withSuggestibleEntryCache();
|
||||||
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
||||||
List<String> words =
|
List<String> words =
|
||||||
loadWords(code, wordCount, dictionary).stream()
|
loadWords(code, wordCount, dictionary).stream()
|
||||||
.distinct()
|
.distinct()
|
||||||
.filter(w -> hasQuickSuggestions(speller, w))
|
.filter(w -> hasQuickSuggestions(speller, suggester, w))
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
System.out.println("Checking " + words.size() + " misspelled words");
|
System.out.println("Checking " + words.size() + " misspelled words");
|
||||||
|
|
||||||
Hunspell fullSpeller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
|
||||||
measure(
|
measure(
|
||||||
"Suggestions for " + code,
|
"Suggestions for " + code,
|
||||||
words.size(),
|
words.size(),
|
||||||
blackHole -> {
|
blackHole -> {
|
||||||
for (String word : words) {
|
for (String word : words) {
|
||||||
blackHole.accept(fullSpeller.suggest(word));
|
blackHole.accept(suggester.suggestNoTimeout(word, () -> {}));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
System.out.println();
|
System.out.println();
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasQuickSuggestions(Hunspell speller, String word) {
|
private boolean hasQuickSuggestions(Hunspell speller, Suggester suggester, String word) {
|
||||||
if (speller.spell(word)) {
|
if (speller.spell(word)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
speller.suggest(word);
|
suggester.suggestWithTimeout(word, Hunspell.SUGGEST_TIME_LIMIT, () -> {});
|
||||||
} catch (
|
} catch (
|
||||||
@SuppressWarnings("unused")
|
@SuppressWarnings("unused")
|
||||||
SuggestionTimeoutException e) {
|
SuggestionTimeoutException e) {
|
||||||
|
|
|
@ -249,10 +249,14 @@ public class TestSpellChecking extends LuceneTestCase {
|
||||||
InputStream dictStream = Files.newInputStream(dicFile);
|
InputStream dictStream = Files.newInputStream(dicFile);
|
||||||
|
|
||||||
Hunspell speller;
|
Hunspell speller;
|
||||||
|
Suggester defaultSuggester;
|
||||||
|
Suggester cachingSuggester;
|
||||||
try {
|
try {
|
||||||
Dictionary dictionary =
|
Dictionary dictionary =
|
||||||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
||||||
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
|
defaultSuggester = new Suggester(dictionary);
|
||||||
|
cachingSuggester = new Suggester(dictionary).withSuggestibleEntryCache();
|
||||||
} finally {
|
} finally {
|
||||||
IOUtils.closeWhileHandlingException(affixStream);
|
IOUtils.closeWhileHandlingException(affixStream);
|
||||||
IOUtils.closeWhileHandlingException(dictStream);
|
IOUtils.closeWhileHandlingException(dictStream);
|
||||||
|
@ -273,12 +277,8 @@ public class TestSpellChecking extends LuceneTestCase {
|
||||||
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word.trim()));
|
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word.trim()));
|
||||||
}
|
}
|
||||||
if (Files.exists(sug)) {
|
if (Files.exists(sug)) {
|
||||||
String suggestions =
|
assertEquals(Files.readString(sug).trim(), suggest(defaultSuggester, wrongWords));
|
||||||
wrongWords.stream()
|
assertEquals(Files.readString(sug).trim(), suggest(cachingSuggester, wrongWords));
|
||||||
.map(s -> String.join(", ", speller.suggest(s)))
|
|
||||||
.filter(s -> !s.isEmpty())
|
|
||||||
.collect(Collectors.joining("\n"));
|
|
||||||
assertEquals(Files.readString(sug).trim(), suggestions);
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
assertFalse(".sug file without .wrong file!", Files.exists(sug));
|
assertFalse(".sug file without .wrong file!", Files.exists(sug));
|
||||||
|
@ -290,6 +290,13 @@ public class TestSpellChecking extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static String suggest(Suggester suggester, List<String> wrongWords) {
|
||||||
|
return wrongWords.stream()
|
||||||
|
.map(s -> String.join(", ", suggester.suggestNoTimeout(s, () -> {})))
|
||||||
|
.filter(s -> !s.isEmpty())
|
||||||
|
.collect(Collectors.joining("\n"));
|
||||||
|
}
|
||||||
|
|
||||||
private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
|
private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
|
||||||
Set<String> everythingGenerated = new HashSet<>();
|
Set<String> everythingGenerated = new HashSet<>();
|
||||||
boolean generatedEverything = true;
|
boolean generatedEverything = true;
|
||||||
|
|
Loading…
Reference in New Issue