mirror of https://github.com/apache/lucene.git
hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in) (#11893)
hunspell: allow for faster dictionary iteration during 'suggest' by using more memory (opt-in)
This commit is contained in:
parent
c66a559050
commit
f7417d5961
|
@ -63,7 +63,7 @@ Improvements
|
|||
Optimizations
|
||||
---------------------
|
||||
|
||||
* GITHUB#11857, GITHUB#11859: Hunspell: improved suggestion performance
|
||||
* GITHUB#11857, GITHUB#11859, GITHUB#11893: Hunspell: improved suggestion performance
|
||||
|
||||
Bug Fixes
|
||||
---------------------
|
||||
|
|
|
@ -28,7 +28,11 @@ import java.util.Objects;
|
|||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.IntPredicate;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
||||
|
@ -43,10 +47,12 @@ class GeneratingSuggester {
|
|||
private static final int MAX_ROOT_LENGTH_DIFF = 4;
|
||||
private final Dictionary dictionary;
|
||||
private final Hunspell speller;
|
||||
private final SuggestibleEntryCache entryCache;
|
||||
|
||||
GeneratingSuggester(Hunspell speller) {
|
||||
GeneratingSuggester(Hunspell speller, SuggestibleEntryCache entryCache) {
|
||||
this.dictionary = speller.dictionary;
|
||||
this.speller = speller;
|
||||
this.entryCache = entryCache;
|
||||
}
|
||||
|
||||
List<String> suggest(String word, WordCase originalCase, Set<Suggestion> prevSuggestions) {
|
||||
|
@ -60,7 +66,11 @@ class GeneratingSuggester {
|
|||
String word, WordCase originalCase) {
|
||||
Comparator<Weighted<Root<String>>> natural = Comparator.naturalOrder();
|
||||
PriorityQueue<Weighted<Root<String>>> roots = new PriorityQueue<>(natural.reversed());
|
||||
EntryFilter filter = new EntryFilter(dictionary);
|
||||
|
||||
char[] excludeFlags = dictionary.allNonSuggestibleFlags();
|
||||
FlagEnumerator.Lookup flagLookup = dictionary.flagLookup;
|
||||
IntPredicate isSuggestible = formId -> !flagLookup.hasAnyFlag(formId, excludeFlags);
|
||||
|
||||
boolean ignoreTitleCaseRoots = originalCase == WordCase.LOWER && !dictionary.hasLanguage("de");
|
||||
TrigramAutomaton automaton =
|
||||
new TrigramAutomaton(word) {
|
||||
|
@ -70,10 +80,10 @@ class GeneratingSuggester {
|
|||
}
|
||||
};
|
||||
|
||||
dictionary.words.processSuggestibleWords(
|
||||
processSuggestibleWords(
|
||||
Math.max(1, word.length() - MAX_ROOT_LENGTH_DIFF),
|
||||
word.length() + MAX_ROOT_LENGTH_DIFF,
|
||||
(rootChars, forms) -> {
|
||||
(rootChars, formSupplier) -> {
|
||||
if (ignoreTitleCaseRoots
|
||||
&& Character.isUpperCase(rootChars.charAt(0))
|
||||
&& WordCase.caseOf(rootChars) == WordCase.TITLE) {
|
||||
|
@ -87,44 +97,34 @@ class GeneratingSuggester {
|
|||
|
||||
sc += commonPrefix(word, rootChars) - longerWorsePenalty(word.length(), rootChars.length);
|
||||
|
||||
if (roots.size() == MAX_ROOTS && sc <= roots.peek().score) {
|
||||
boolean overflow = roots.size() == MAX_ROOTS;
|
||||
if (overflow && sc <= roots.peek().score) {
|
||||
return;
|
||||
}
|
||||
|
||||
speller.checkCanceled.run();
|
||||
|
||||
String root = rootChars.toString();
|
||||
int suitable = filter.findSuitableFormIndex(forms, 0);
|
||||
do {
|
||||
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + suitable]), sc));
|
||||
suitable = filter.findSuitableFormIndex(forms, suitable + filter.formStep);
|
||||
} while (suitable > 0);
|
||||
while (roots.size() > MAX_ROOTS) {
|
||||
roots.poll();
|
||||
IntsRef forms = formSupplier.get();
|
||||
for (int i = 0; i < forms.length; i++) {
|
||||
if (isSuggestible.test(forms.ints[forms.offset + i])) {
|
||||
roots.add(new Weighted<>(new Root<>(root, forms.ints[forms.offset + i]), sc));
|
||||
if (overflow) {
|
||||
roots.poll();
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return roots.stream().sorted().collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static class EntryFilter {
|
||||
private final int formStep;
|
||||
private final FlagEnumerator.Lookup flagLookup;
|
||||
private final char[] excludeFlags;
|
||||
|
||||
EntryFilter(Dictionary dic) {
|
||||
formStep = dic.formStep();
|
||||
flagLookup = dic.flagLookup;
|
||||
excludeFlags = dic.allNonSuggestibleFlags();
|
||||
}
|
||||
|
||||
int findSuitableFormIndex(IntsRef forms, int start) {
|
||||
for (int i = start; i < forms.length; i += formStep) {
|
||||
if (!flagLookup.hasAnyFlag(forms.ints[forms.offset + i], excludeFlags)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
private void processSuggestibleWords(
|
||||
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||
if (entryCache != null) {
|
||||
entryCache.processSuggestibleWords(minLength, maxLength, processor);
|
||||
} else {
|
||||
dictionary.words.processSuggestibleWords(minLength, maxLength, processor);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -26,16 +26,8 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
|
|||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
@ -56,7 +48,6 @@ import org.apache.lucene.util.IntsRef;
|
|||
*/
|
||||
public class Hunspell {
|
||||
static final long SUGGEST_TIME_LIMIT = 250;
|
||||
|
||||
final Dictionary dictionary;
|
||||
final Stemmer stemmer;
|
||||
private final TimeoutPolicy policy;
|
||||
|
@ -75,7 +66,7 @@ public class Hunspell {
|
|||
this.dictionary = dictionary;
|
||||
this.policy = policy;
|
||||
this.checkCanceled = checkCanceled;
|
||||
stemmer = new Stemmer(dictionary);
|
||||
this.stemmer = new Stemmer(dictionary);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -568,6 +559,7 @@ public class Hunspell {
|
|||
* @return suggestions for the given misspelled word
|
||||
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
||||
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
||||
* @see Suggester for finer-grained APIs and performance optimizations
|
||||
*/
|
||||
public List<String> suggest(String word) throws SuggestionTimeoutException {
|
||||
return suggest(word, SUGGEST_TIME_LIMIT);
|
||||
|
@ -579,140 +571,19 @@ public class Hunspell {
|
|||
* TimeoutPolicy}'s effects (exception or partial result) may kick in
|
||||
* @throws SuggestionTimeoutException if the computation takes too long and {@link
|
||||
* TimeoutPolicy#THROW_EXCEPTION} was specified in the constructor
|
||||
* @see Suggester for finer-grained APIs and performance optimizations
|
||||
*/
|
||||
public List<String> suggest(String word, long timeLimitMs) throws SuggestionTimeoutException {
|
||||
checkCanceled.run();
|
||||
if (word.length() >= 100) return Collections.emptyList();
|
||||
Suggester suggester = new Suggester(dictionary);
|
||||
if (policy == NO_TIMEOUT) return suggester.suggestNoTimeout(word, checkCanceled);
|
||||
|
||||
if (dictionary.needsInputCleaning(word)) {
|
||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||
}
|
||||
|
||||
WordCase wordCase = WordCase.caseOf(word);
|
||||
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
|
||||
String title = dictionary.toTitleCase(word);
|
||||
if (spell(title)) {
|
||||
return Collections.singletonList(title);
|
||||
}
|
||||
}
|
||||
|
||||
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
|
||||
Runnable checkCanceled =
|
||||
policy == NO_TIMEOUT ? this.checkCanceled : checkTimeLimit(word, suggestions, timeLimitMs);
|
||||
try {
|
||||
doSuggest(word, wordCase, suggestions, checkCanceled);
|
||||
return suggester.suggestWithTimeout(word, timeLimitMs, checkCanceled);
|
||||
} catch (SuggestionTimeoutException e) {
|
||||
if (policy != RETURN_PARTIAL_RESULT) {
|
||||
throw e;
|
||||
if (policy == RETURN_PARTIAL_RESULT) {
|
||||
return e.getPartialResult();
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
|
||||
return postprocess(suggestions);
|
||||
}
|
||||
|
||||
private void doSuggest(
|
||||
String word,
|
||||
WordCase wordCase,
|
||||
LinkedHashSet<Suggestion> suggestions,
|
||||
Runnable checkCanceled) {
|
||||
Hunspell suggestionSpeller =
|
||||
new Hunspell(dictionary, policy, checkCanceled) {
|
||||
// Cache for expensive "findStem" requests issued when trying to split a compound word.
|
||||
// The suggestion algorithm issues many of them, often with the same text.
|
||||
// The cache can be large, but will be GC-ed after the "suggest" call.
|
||||
final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
|
||||
|
||||
@Override
|
||||
boolean acceptsStem(int formID) {
|
||||
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
||||
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
||||
}
|
||||
|
||||
@Override
|
||||
Root<CharsRef> findStem(
|
||||
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||
if (context == COMPOUND_BEGIN && originalCase == null) {
|
||||
return compoundCache
|
||||
.computeIfAbsent(
|
||||
new String(chars, offset, length),
|
||||
__ ->
|
||||
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
|
||||
.orElse(null);
|
||||
}
|
||||
return super.findStem(chars, offset, length, originalCase, context);
|
||||
}
|
||||
};
|
||||
boolean hasGoodSuggestions =
|
||||
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
|
||||
|
||||
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
||||
List<String> generated =
|
||||
new GeneratingSuggester(suggestionSpeller)
|
||||
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
|
||||
for (String raw : generated) {
|
||||
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
||||
}
|
||||
}
|
||||
|
||||
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
|
||||
for (String raw : modifyChunksBetweenDashes(word)) {
|
||||
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Runnable checkTimeLimit(String word, Set<Suggestion> suggestions, long timeLimitMs) {
|
||||
return new Runnable() {
|
||||
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
|
||||
int invocationCounter = 100;
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
checkCanceled.run();
|
||||
if (--invocationCounter <= 0) {
|
||||
if (System.nanoTime() - deadline > 0) {
|
||||
stop();
|
||||
}
|
||||
invocationCounter = 100;
|
||||
}
|
||||
}
|
||||
|
||||
private void stop() {
|
||||
List<String> partialResult =
|
||||
policy == RETURN_PARTIAL_RESULT ? null : postprocess(suggestions);
|
||||
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
|
||||
throw new SuggestionTimeoutException(message, partialResult);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private List<String> postprocess(Collection<Suggestion> suggestions) {
|
||||
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
|
||||
}
|
||||
|
||||
private List<String> modifyChunksBetweenDashes(String word) {
|
||||
List<String> result = new ArrayList<>();
|
||||
int chunkStart = 0;
|
||||
while (chunkStart < word.length()) {
|
||||
int chunkEnd = word.indexOf('-', chunkStart);
|
||||
if (chunkEnd < 0) {
|
||||
chunkEnd = word.length();
|
||||
}
|
||||
|
||||
if (chunkEnd > chunkStart) {
|
||||
String chunk = word.substring(chunkStart, chunkEnd);
|
||||
if (!spell(chunk)) {
|
||||
for (String chunkSug : suggest(chunk)) {
|
||||
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
|
||||
if (spell(replaced)) {
|
||||
result.add(replaced);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
chunkStart = chunkEnd + 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,237 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
|
||||
import static org.apache.lucene.analysis.hunspell.TimeoutPolicy.NO_TIMEOUT;
|
||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
/**
|
||||
* A generator for misspelled word corrections based on Hunspell flags. The suggestions are searched
|
||||
* for in two main ways:
|
||||
*
|
||||
* <ol>
|
||||
* <li>Modification: trying to insert/remove/delete/swap parts of the word to get something
|
||||
* acceptable. The performance of this part depends heavily on the contents of TRY, MAP, REP,
|
||||
* KEY directives in the .aff file.
|
||||
* <li>Enumeration: if the modification hasn't produced "good enough" suggestions, the whole
|
||||
* dictionary is scanned and simple affixes are added onto the entries to check if that
|
||||
* produces anything similar to the given misspelled word. This depends on the dictionary size
|
||||
* and the affix count, and it can take noticeable amount of time. To speed this up, {@link
|
||||
* #withSuggestibleEntryCache()} can be used.
|
||||
* </ol>
|
||||
*/
|
||||
public class Suggester {
|
||||
private final Dictionary dictionary;
|
||||
private final SuggestibleEntryCache suggestibleCache;
|
||||
|
||||
public Suggester(Dictionary dictionary) {
|
||||
this(dictionary, null);
|
||||
}
|
||||
|
||||
private Suggester(Dictionary dictionary, SuggestibleEntryCache suggestibleCache) {
|
||||
this.dictionary = dictionary;
|
||||
this.suggestibleCache = suggestibleCache;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a copy of this suggester instance with better "Enumeration" phase performance (see
|
||||
* {@link Suggester} documentation), but using more memory. With this option, the dictionary
|
||||
* entries are stored as fast-to-iterate plain words instead of highly compressed prefix trees.
|
||||
*/
|
||||
public Suggester withSuggestibleEntryCache() {
|
||||
return new Suggester(dictionary, SuggestibleEntryCache.buildCache(dictionary.words));
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute suggestions for the given misspelled word
|
||||
*
|
||||
* @param word the misspelled word to calculate suggestions for
|
||||
* @param checkCanceled an object that's periodically called, allowing to interrupt or suggestion
|
||||
* generation by throwing an exception
|
||||
*/
|
||||
public List<String> suggestNoTimeout(String word, Runnable checkCanceled) {
|
||||
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
|
||||
return suggest(word, suggestions, handleCustomTimeoutException(checkCanceled, suggestions));
|
||||
}
|
||||
|
||||
private Runnable handleCustomTimeoutException(
|
||||
Runnable checkCanceled, LinkedHashSet<Suggestion> suggestions) {
|
||||
return () -> {
|
||||
try {
|
||||
checkCanceled.run();
|
||||
} catch (SuggestionTimeoutException e) {
|
||||
if (e.getPartialResult() != null) {
|
||||
throw e;
|
||||
}
|
||||
|
||||
throw new SuggestionTimeoutException(e.getMessage(), postprocess(suggestions));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param word the misspelled word to calculate suggestions for
|
||||
* @param timeLimitMs the duration limit in milliseconds after which the computation is interruped
|
||||
* by an exception
|
||||
* @param checkCanceled an object that's periodically called, allowing to interrupt or suggestion
|
||||
* generation by throwing an exception
|
||||
* @throws SuggestionTimeoutException if the computation takes too long. Use {@link
|
||||
* SuggestionTimeoutException#getPartialResult()} to get the suggestions computed up to that
|
||||
* point
|
||||
*/
|
||||
public List<String> suggestWithTimeout(String word, long timeLimitMs, Runnable checkCanceled)
|
||||
throws SuggestionTimeoutException {
|
||||
LinkedHashSet<Suggestion> suggestions = new LinkedHashSet<>();
|
||||
Runnable checkTime = checkTimeLimit(word, suggestions, timeLimitMs, checkCanceled);
|
||||
return suggest(word, suggestions, handleCustomTimeoutException(checkTime, suggestions));
|
||||
}
|
||||
|
||||
private List<String> suggest(
|
||||
String word, LinkedHashSet<Suggestion> suggestions, Runnable checkCanceled)
|
||||
throws SuggestionTimeoutException {
|
||||
checkCanceled.run();
|
||||
if (word.length() >= 100) return Collections.emptyList();
|
||||
|
||||
if (dictionary.needsInputCleaning(word)) {
|
||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||
}
|
||||
|
||||
Hunspell suggestionSpeller =
|
||||
new Hunspell(dictionary, NO_TIMEOUT, checkCanceled) {
|
||||
// Cache for expensive "findStem" requests issued when trying to split a compound word.
|
||||
// The suggestion algorithm issues many of them, often with the same text.
|
||||
// The cache can be large, but will be GC-ed after the "suggest" call.
|
||||
final Map<String, Optional<Root<CharsRef>>> compoundCache = new HashMap<>();
|
||||
|
||||
@Override
|
||||
boolean acceptsStem(int formID) {
|
||||
return !dictionary.hasFlag(formID, dictionary.noSuggest)
|
||||
&& !dictionary.hasFlag(formID, dictionary.subStandard);
|
||||
}
|
||||
|
||||
@Override
|
||||
Root<CharsRef> findStem(
|
||||
char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||
if (context == COMPOUND_BEGIN && originalCase == null) {
|
||||
return compoundCache
|
||||
.computeIfAbsent(
|
||||
new String(chars, offset, length),
|
||||
__ ->
|
||||
Optional.ofNullable(super.findStem(chars, offset, length, null, context)))
|
||||
.orElse(null);
|
||||
}
|
||||
return super.findStem(chars, offset, length, originalCase, context);
|
||||
}
|
||||
};
|
||||
|
||||
WordCase wordCase = WordCase.caseOf(word);
|
||||
if (dictionary.forceUCase != FLAG_UNSET && wordCase == WordCase.LOWER) {
|
||||
String title = dictionary.toTitleCase(word);
|
||||
if (suggestionSpeller.spell(title)) {
|
||||
return Collections.singletonList(title);
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasGoodSuggestions =
|
||||
new ModifyingSuggester(suggestionSpeller, suggestions, word, wordCase).suggest();
|
||||
|
||||
if (!hasGoodSuggestions && dictionary.maxNGramSuggestions > 0) {
|
||||
List<String> generated =
|
||||
new GeneratingSuggester(suggestionSpeller, suggestibleCache)
|
||||
.suggest(dictionary.toLowerCase(word), wordCase, suggestions);
|
||||
for (String raw : generated) {
|
||||
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
||||
}
|
||||
}
|
||||
|
||||
if (word.contains("-") && suggestions.stream().noneMatch(s -> s.raw.contains("-"))) {
|
||||
for (String raw : modifyChunksBetweenDashes(word, suggestionSpeller, checkCanceled)) {
|
||||
suggestions.add(new Suggestion(raw, word, wordCase, suggestionSpeller));
|
||||
}
|
||||
}
|
||||
return postprocess(suggestions);
|
||||
}
|
||||
|
||||
private Runnable checkTimeLimit(
|
||||
String word, Set<Suggestion> suggestions, long timeLimitMs, Runnable checkCanceled) {
|
||||
return new Runnable() {
|
||||
final long deadline = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(timeLimitMs);
|
||||
int invocationCounter = 100;
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
checkCanceled.run();
|
||||
if (--invocationCounter <= 0) {
|
||||
if (System.nanoTime() - deadline > 0) {
|
||||
stop();
|
||||
}
|
||||
invocationCounter = 100;
|
||||
}
|
||||
}
|
||||
|
||||
private void stop() {
|
||||
String message = "Time limit of " + timeLimitMs + "ms exceeded for " + word;
|
||||
throw new SuggestionTimeoutException(message, postprocess(suggestions));
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private List<String> postprocess(Collection<Suggestion> suggestions) {
|
||||
return suggestions.stream().flatMap(s -> Arrays.stream(s.result)).distinct().toList();
|
||||
}
|
||||
|
||||
private List<String> modifyChunksBetweenDashes(
|
||||
String word, Hunspell speller, Runnable checkCanceled) {
|
||||
List<String> result = new ArrayList<>();
|
||||
int chunkStart = 0;
|
||||
while (chunkStart < word.length()) {
|
||||
int chunkEnd = word.indexOf('-', chunkStart);
|
||||
if (chunkEnd < 0) {
|
||||
chunkEnd = word.length();
|
||||
}
|
||||
|
||||
if (chunkEnd > chunkStart) {
|
||||
String chunk = word.substring(chunkStart, chunkEnd);
|
||||
if (!speller.spell(chunk)) {
|
||||
for (String chunkSug : suggestNoTimeout(chunk, checkCanceled)) {
|
||||
String replaced = word.substring(0, chunkStart) + chunkSug + word.substring(chunkEnd);
|
||||
if (speller.spell(replaced)) {
|
||||
result.add(replaced);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
chunkStart = chunkEnd + 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Supplier;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
/**
|
||||
* A cache allowing for CPU-cache-friendlier iteration over {@link WordStorage} entries that can be
|
||||
* used for suggestions. The words and the form data are stored in plain contiguous arrays with no
|
||||
* compression.
|
||||
*/
|
||||
class SuggestibleEntryCache {
|
||||
private final short[] lengths;
|
||||
private final char[] roots;
|
||||
private final int[] formData;
|
||||
|
||||
private SuggestibleEntryCache(short[] lengths, char[] roots, int[] formData) {
|
||||
this.lengths = lengths;
|
||||
this.roots = roots;
|
||||
this.formData = formData;
|
||||
}
|
||||
|
||||
static SuggestibleEntryCache buildCache(WordStorage storage) {
|
||||
var consumer =
|
||||
new BiConsumer<CharsRef, Supplier<IntsRef>>() {
|
||||
short[] lengths = new short[10];
|
||||
final StringBuilder roots = new StringBuilder();
|
||||
int[] formData = new int[10];
|
||||
int lenOffset = 0;
|
||||
int formDataOffset = 0;
|
||||
|
||||
@Override
|
||||
public void accept(CharsRef root, Supplier<IntsRef> formSupplier) {
|
||||
if (root.length > Short.MAX_VALUE) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Too long dictionary entry, please report this to dev@lucene.apache.org");
|
||||
}
|
||||
|
||||
IntsRef forms = formSupplier.get();
|
||||
|
||||
lengths = ArrayUtil.grow(lengths, lenOffset + 2);
|
||||
lengths[lenOffset] = (short) root.length;
|
||||
lengths[lenOffset + 1] = (short) forms.length;
|
||||
lenOffset += 2;
|
||||
|
||||
roots.append(root.chars, root.offset, root.length);
|
||||
|
||||
formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
|
||||
System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
|
||||
formDataOffset += forms.length;
|
||||
}
|
||||
};
|
||||
|
||||
storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);
|
||||
|
||||
return new SuggestibleEntryCache(
|
||||
ArrayUtil.copyOfSubArray(consumer.lengths, 0, consumer.lenOffset),
|
||||
consumer.roots.toString().toCharArray(),
|
||||
ArrayUtil.copyOfSubArray(consumer.formData, 0, consumer.formDataOffset));
|
||||
}
|
||||
|
||||
void processSuggestibleWords(
|
||||
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||
CharsRef chars = new CharsRef(roots, 0, 0);
|
||||
IntsRef forms = new IntsRef(formData, 0, 0);
|
||||
Supplier<IntsRef> formSupplier = () -> forms;
|
||||
int rootOffset = 0;
|
||||
int formDataOffset = 0;
|
||||
for (int i = 0; i < lengths.length; i += 2) {
|
||||
int rootLength = lengths[i];
|
||||
short formDataLength = lengths[i + 1];
|
||||
if (rootLength >= minLength && rootLength <= maxLength) {
|
||||
chars.offset = rootOffset;
|
||||
chars.length = rootLength;
|
||||
forms.offset = formDataOffset;
|
||||
forms.length = formDataLength;
|
||||
processor.accept(chars, formSupplier);
|
||||
}
|
||||
rootOffset += rootLength;
|
||||
formDataOffset += formDataLength;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -20,6 +20,7 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Supplier;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
|
@ -54,7 +55,8 @@ class WordStorage {
|
|||
private static final int COLLISION_MASK = 0x40;
|
||||
private static final int SUGGESTIBLE_MASK = 0x20;
|
||||
private static final int MAX_STORED_LENGTH = SUGGESTIBLE_MASK - 1;
|
||||
|
||||
private final int maxEntryLength;
|
||||
private final boolean hasCustomMorphData;
|
||||
/**
|
||||
* A map from word's hash (modulo array's length) into an int containing:
|
||||
*
|
||||
|
@ -89,7 +91,10 @@ class WordStorage {
|
|||
*/
|
||||
private final byte[] wordData;
|
||||
|
||||
private WordStorage(int[] hashTable, byte[] wordData) {
|
||||
private WordStorage(
|
||||
int maxEntryLength, boolean hasCustomMorphData, int[] hashTable, byte[] wordData) {
|
||||
this.maxEntryLength = maxEntryLength;
|
||||
this.hasCustomMorphData = hasCustomMorphData;
|
||||
this.hashTable = hashTable;
|
||||
this.wordData = wordData;
|
||||
}
|
||||
|
@ -153,11 +158,13 @@ class WordStorage {
|
|||
* can be modified in any way, but may not be saved for later by the processor
|
||||
*/
|
||||
void processSuggestibleWords(
|
||||
int minLength, int maxLength, BiConsumer<CharsRef, IntsRef> processor) {
|
||||
int minLength, int maxLength, BiConsumer<CharsRef, Supplier<IntsRef>> processor) {
|
||||
assert minLength <= maxLength;
|
||||
maxLength = Math.min(maxEntryLength, maxLength);
|
||||
|
||||
CharsRef chars = new CharsRef(maxLength);
|
||||
IntsRef forms = new IntsRef();
|
||||
ByteArrayDataInput in = new ByteArrayDataInput(wordData);
|
||||
var formSupplier = new LazyFormReader(in);
|
||||
for (int entryCode : hashTable) {
|
||||
int pos = entryCode & OFFSET_MASK;
|
||||
int mask = entryCode >>> OFFSET_BITS;
|
||||
|
@ -179,11 +186,7 @@ class WordStorage {
|
|||
}
|
||||
|
||||
if (mightMatch) {
|
||||
int dataLength = in.readVInt();
|
||||
if (forms.ints.length < dataLength) {
|
||||
forms.ints = new int[dataLength];
|
||||
}
|
||||
readForms(forms, in, dataLength);
|
||||
formSupplier.dataPos = in.getPosition();
|
||||
while (prevPos != 0 && wordStart > 0) {
|
||||
in.setPosition(prevPos);
|
||||
chars.chars[--wordStart] = (char) in.readVInt();
|
||||
|
@ -193,7 +196,7 @@ class WordStorage {
|
|||
if (prevPos == 0) {
|
||||
chars.offset = wordStart;
|
||||
chars.length = maxLength - wordStart;
|
||||
processor.accept(chars, forms);
|
||||
processor.accept(chars, formSupplier);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -257,6 +260,7 @@ class WordStorage {
|
|||
private final ByteArrayDataOutput dataWriter;
|
||||
private int commonPrefixLength, commonPrefixPos;
|
||||
private int actualWords;
|
||||
private int maxEntryLength;
|
||||
|
||||
/**
|
||||
* @param wordCount an approximate number of the words in the resulting dictionary, used to
|
||||
|
@ -297,6 +301,8 @@ class WordStorage {
|
|||
* {@link String#compareTo} rules.
|
||||
*/
|
||||
void add(String entry, char[] flags, int morphDataID) throws IOException {
|
||||
maxEntryLength = Math.max(maxEntryLength, entry.length());
|
||||
|
||||
if (!entry.equals(currentEntry)) {
|
||||
if (currentEntry != null) {
|
||||
if (entry.compareTo(currentEntry) < 0) {
|
||||
|
@ -411,8 +417,36 @@ class WordStorage {
|
|||
WordStorage build() throws IOException {
|
||||
assert !group.isEmpty() : "build() should be only called once";
|
||||
flushGroup();
|
||||
return new WordStorage(
|
||||
hashTable, ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition()));
|
||||
byte[] trimmedData = ArrayUtil.copyOfSubArray(wordData, 0, dataWriter.getPosition());
|
||||
return new WordStorage(maxEntryLength, hasCustomMorphData, hashTable, trimmedData);
|
||||
}
|
||||
}
|
||||
|
||||
private class LazyFormReader implements Supplier<IntsRef> {
|
||||
int dataPos;
|
||||
private final ByteArrayDataInput in;
|
||||
private final IntsRef forms;
|
||||
|
||||
LazyFormReader(ByteArrayDataInput in) {
|
||||
this.in = in;
|
||||
forms = new IntsRef();
|
||||
}
|
||||
|
||||
@Override
|
||||
public IntsRef get() {
|
||||
in.setPosition(dataPos);
|
||||
int entryCount = in.readVInt() / (hasCustomMorphData ? 2 : 1);
|
||||
if (forms.ints.length < entryCount) {
|
||||
forms.ints = new int[entryCount];
|
||||
}
|
||||
for (int i = 0; i < entryCount; i++) {
|
||||
forms.ints[i] = in.readVInt();
|
||||
if (hasCustomMorphData) {
|
||||
in.readVInt();
|
||||
}
|
||||
}
|
||||
forms.length = entryCount;
|
||||
return forms;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -149,6 +149,7 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void testDictionariesLoadSuccessfully() throws Exception {
|
||||
AtomicLong memoryWithCache = new AtomicLong();
|
||||
AtomicLong totalMemory = new AtomicLong();
|
||||
AtomicLong totalWords = new AtomicLong();
|
||||
int threads = Runtime.getRuntime().availableProcessors();
|
||||
|
@ -159,7 +160,16 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
(Path aff) -> {
|
||||
try {
|
||||
Dictionary dic = loadDictionary(aff);
|
||||
new Hunspell(dic).spell("aaaa");
|
||||
Suggester suggester = new Suggester(dic).withSuggestibleEntryCache();
|
||||
try {
|
||||
suggester.suggestWithTimeout("aaaaaaaaaa", Hunspell.SUGGEST_TIME_LIMIT, () -> {});
|
||||
} catch (
|
||||
@SuppressWarnings("unused")
|
||||
SuggestionTimeoutException e) {
|
||||
}
|
||||
totalMemory.addAndGet(RamUsageTester.ramUsed(dic));
|
||||
memoryWithCache.addAndGet(RamUsageTester.ramUsed(suggester));
|
||||
totalWords.addAndGet(RamUsageTester.ramUsed(dic.words));
|
||||
System.out.println(aff + "\t" + memoryUsageSummary(dic));
|
||||
} catch (Throwable e) {
|
||||
|
@ -195,6 +205,9 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
System.out.println("Total memory: " + RamUsageEstimator.humanReadableUnits(totalMemory.get()));
|
||||
System.out.println(
|
||||
"Total memory for word storage: " + RamUsageEstimator.humanReadableUnits(totalWords.get()));
|
||||
System.out.println(
|
||||
"Additional memory if withSuggestibleEntryCache is enabled: "
|
||||
+ RamUsageEstimator.humanReadableUnits(memoryWithCache.get() - totalMemory.get()));
|
||||
}
|
||||
|
||||
private static String memoryUsageSummary(Dictionary dic) {
|
||||
|
|
|
@ -110,6 +110,12 @@ public class TestDictionary extends LuceneTestCase {
|
|||
Set<String> processed = new HashSet<>();
|
||||
dictionary.words.processSuggestibleWords(
|
||||
minLength, maxLength, (word, __) -> processed.add(word.toString()));
|
||||
|
||||
Set<String> cached = new HashSet<>();
|
||||
SuggestibleEntryCache.buildCache(dictionary.words)
|
||||
.processSuggestibleWords(minLength, maxLength, (word, __) -> cached.add(word.toString()));
|
||||
assertEquals(processed, cached);
|
||||
|
||||
return processed;
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.CancellationException;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
|
@ -54,6 +55,33 @@ public class TestHunspell extends LuceneTestCase {
|
|||
assertThrows(CancellationException.class, () -> hunspell.suggest("apac"));
|
||||
}
|
||||
|
||||
public void testCustomCheckCanceledGivesPartialResult() throws Exception {
|
||||
Dictionary dictionary = loadDictionary(false, "simple.aff", "simple.dic");
|
||||
|
||||
List<String> expected = List.of("apach");
|
||||
assertEquals(expected, new Hunspell(dictionary, NO_TIMEOUT, () -> {}).suggest("apac"));
|
||||
|
||||
AtomicInteger counter = new AtomicInteger();
|
||||
String msg = "msg";
|
||||
Runnable checkCanceled =
|
||||
() -> {
|
||||
if (counter.incrementAndGet() > 400) {
|
||||
throw new SuggestionTimeoutException(msg, null);
|
||||
}
|
||||
};
|
||||
|
||||
Hunspell hunspell = new Hunspell(dictionary, RETURN_PARTIAL_RESULT, checkCanceled);
|
||||
assertEquals(expected, hunspell.suggest("apac"));
|
||||
|
||||
counter.set(0);
|
||||
var e =
|
||||
assertThrows(
|
||||
SuggestionTimeoutException.class,
|
||||
() -> new Suggester(dictionary).suggestNoTimeout("apac", checkCanceled));
|
||||
assertEquals(expected, e.getPartialResult());
|
||||
assertEquals("msg", e.getMessage());
|
||||
}
|
||||
|
||||
public void testSuggestionTimeLimit() throws IOException, ParseException {
|
||||
int timeLimitMs = 10;
|
||||
|
||||
|
|
|
@ -86,7 +86,7 @@ public class TestPerformance extends LuceneTestCase {
|
|||
|
||||
@Test
|
||||
public void de_suggest() throws Exception {
|
||||
checkSuggestionPerformance("de", 100);
|
||||
checkSuggestionPerformance("de", 150);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -163,33 +163,33 @@ public class TestPerformance extends LuceneTestCase {
|
|||
|
||||
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
||||
Dictionary dictionary = loadDictionary(code);
|
||||
Suggester suggester = new Suggester(dictionary).withSuggestibleEntryCache();
|
||||
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
||||
List<String> words =
|
||||
loadWords(code, wordCount, dictionary).stream()
|
||||
.distinct()
|
||||
.filter(w -> hasQuickSuggestions(speller, w))
|
||||
.filter(w -> hasQuickSuggestions(speller, suggester, w))
|
||||
.collect(Collectors.toList());
|
||||
System.out.println("Checking " + words.size() + " misspelled words");
|
||||
|
||||
Hunspell fullSpeller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||
measure(
|
||||
"Suggestions for " + code,
|
||||
words.size(),
|
||||
blackHole -> {
|
||||
for (String word : words) {
|
||||
blackHole.accept(fullSpeller.suggest(word));
|
||||
blackHole.accept(suggester.suggestNoTimeout(word, () -> {}));
|
||||
}
|
||||
});
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
private boolean hasQuickSuggestions(Hunspell speller, String word) {
|
||||
private boolean hasQuickSuggestions(Hunspell speller, Suggester suggester, String word) {
|
||||
if (speller.spell(word)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
speller.suggest(word);
|
||||
suggester.suggestWithTimeout(word, Hunspell.SUGGEST_TIME_LIMIT, () -> {});
|
||||
} catch (
|
||||
@SuppressWarnings("unused")
|
||||
SuggestionTimeoutException e) {
|
||||
|
|
|
@ -249,10 +249,14 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
InputStream dictStream = Files.newInputStream(dicFile);
|
||||
|
||||
Hunspell speller;
|
||||
Suggester defaultSuggester;
|
||||
Suggester cachingSuggester;
|
||||
try {
|
||||
Dictionary dictionary =
|
||||
new Dictionary(new ByteBuffersDirectory(), "dictionary", affixStream, dictStream);
|
||||
speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||
defaultSuggester = new Suggester(dictionary);
|
||||
cachingSuggester = new Suggester(dictionary).withSuggestibleEntryCache();
|
||||
} finally {
|
||||
IOUtils.closeWhileHandlingException(affixStream);
|
||||
IOUtils.closeWhileHandlingException(dictStream);
|
||||
|
@ -273,12 +277,8 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
assertFalse("Unexpectedly considered correct: " + word, speller.spell(word.trim()));
|
||||
}
|
||||
if (Files.exists(sug)) {
|
||||
String suggestions =
|
||||
wrongWords.stream()
|
||||
.map(s -> String.join(", ", speller.suggest(s)))
|
||||
.filter(s -> !s.isEmpty())
|
||||
.collect(Collectors.joining("\n"));
|
||||
assertEquals(Files.readString(sug).trim(), suggestions);
|
||||
assertEquals(Files.readString(sug).trim(), suggest(defaultSuggester, wrongWords));
|
||||
assertEquals(Files.readString(sug).trim(), suggest(cachingSuggester, wrongWords));
|
||||
}
|
||||
} else {
|
||||
assertFalse(".sug file without .wrong file!", Files.exists(sug));
|
||||
|
@ -290,6 +290,13 @@ public class TestSpellChecking extends LuceneTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
private static String suggest(Suggester suggester, List<String> wrongWords) {
|
||||
return wrongWords.stream()
|
||||
.map(s -> String.join(", ", suggester.suggestNoTimeout(s, () -> {})))
|
||||
.filter(s -> !s.isEmpty())
|
||||
.collect(Collectors.joining("\n"));
|
||||
}
|
||||
|
||||
private static Set<String> expandWholeDictionary(Path dic, Hunspell speller) throws IOException {
|
||||
Set<String> everythingGenerated = new HashSet<>();
|
||||
boolean generatedEverything = true;
|
||||
|
|
Loading…
Reference in New Issue