LUCENE-9852: Make Hunspell thread-safe (#24)

Peter Gromov, 2021-03-19 02:57:03 +01:00 (committed by GitHub)
parent 5b36af3cd7
commit 28edbf8fc6
3 changed files with 66 additions and 60 deletions

Hunspell.java

@@ -48,8 +48,7 @@ import org.apache.lucene.util.IntsRef;
  *   <li>PHONE affix file option for suggestions
  * </ul>
  *
- * <p>The objects of this class are not thread-safe (but a single underlying Dictionary can be
- * shared by multiple spell-checkers in different threads).
+ * <p>The objects of this class are thread-safe.
  */
 public class Hunspell {
   static final long SUGGEST_TIME_LIMIT = 250;
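With the javadoc change above, one Hunspell instance can be shared by many threads instead of being confined to a single thread. Below is a minimal sketch of that usage, assuming a Dictionary has already been loaded elsewhere; the constructor arguments and the spell(...) call are the ones visible in this diff, while the helper method name and the executor setup are illustrative only:

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

class SharedSpellerSketch {
  // Sketch: concurrent spell-checking with a single shared Hunspell instance.
  // `dictionary` is assumed to be an already-loaded Hunspell Dictionary.
  static void spellConcurrently(Dictionary dictionary, List<String> words) throws Exception {
    Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
    ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
    try {
      List<Future<?>> results = new ArrayList<>();
      for (String word : words) {
        // All tasks call the same speller; no per-thread copies are needed anymore.
        results.add(pool.submit(() -> speller.spell(word)));
      }
      for (Future<?> result : results) {
        result.get(); // propagate any failure from the worker threads
      }
    } finally {
      pool.shutdown();
    }
  }
}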

Stemmer.java

@@ -33,11 +33,6 @@ import org.apache.lucene.util.fst.FST;
  */
 final class Stemmer {
   private final Dictionary dictionary;
-  private final StringBuilder segment = new StringBuilder();
-
-  // used for normalization
-  private final StringBuilder scratchSegment = new StringBuilder();
-  private char[] scratchBuffer = new char[32];
 
   // it's '1' if we have no stem exceptions, otherwise every other form
   // is really an ID pointing to the exception table
@@ -50,16 +45,6 @@ final class Stemmer {
    */
   public Stemmer(Dictionary dictionary) {
     this.dictionary = dictionary;
-    prefixReader = dictionary.prefixes == null ? null : dictionary.prefixes.getBytesReader();
-    suffixReader = dictionary.suffixes == null ? null : dictionary.suffixes.getBytesReader();
-    for (int level = 0; level < 3; level++) {
-      if (dictionary.prefixes != null) {
-        prefixArcs[level] = new FST.Arc<>();
-      }
-      if (dictionary.suffixes != null) {
-        suffixArcs[level] = new FST.Arc<>();
-      }
-    }
     formStep = dictionary.formStep();
   }
 
@@ -82,11 +67,11 @@ final class Stemmer {
 
   public List<CharsRef> stem(char[] word, int length) {
     if (dictionary.mayNeedInputCleaning()) {
-      scratchSegment.setLength(0);
-      scratchSegment.append(word, 0, length);
+      CharsRef scratchSegment = new CharsRef(word, 0, length);
       if (dictionary.needsInputCleaning(scratchSegment)) {
-        CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
-        scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
+        StringBuilder segment = new StringBuilder();
+        dictionary.cleanInput(scratchSegment, segment);
+        char[] scratchBuffer = new char[segment.length()];
         length = segment.length();
         segment.getChars(0, length, scratchBuffer, 0);
         word = scratchBuffer;
@@ -122,8 +107,8 @@
   }
 
   boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
+    char[] titleBuffer = wordCase == WordCase.UPPER ? caseFoldTitle(word, length) : null;
     if (wordCase == WordCase.UPPER) {
-      caseFoldTitle(word, length);
       char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
       if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
         return false;
@@ -140,7 +125,7 @@
       return true;
     }
 
-    caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
+    char[] lowerBuffer = caseFoldLower(titleBuffer != null ? titleBuffer : word, length);
     if (!processor.process(lowerBuffer, length, wordCase)) {
       return false;
     }
@@ -152,10 +137,6 @@
     return true;
   }
 
-  // temporary buffers for case variants
-  private char[] lowerBuffer = new char[8];
-  private char[] titleBuffer = new char[8];
-
   /** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
   WordCase caseOf(char[] word, int length) {
     if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
@@ -166,19 +147,21 @@
   }
 
   /** folds titlecase variant of word to titleBuffer */
-  private void caseFoldTitle(char[] word, int length) {
-    titleBuffer = ArrayUtil.grow(titleBuffer, length);
+  private char[] caseFoldTitle(char[] word, int length) {
+    char[] titleBuffer = new char[length];
     System.arraycopy(word, 0, titleBuffer, 0, length);
     for (int i = 1; i < length; i++) {
       titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
     }
+    return titleBuffer;
   }
 
   /** folds lowercase variant of word (title cased) to lowerBuffer */
-  private void caseFoldLower(char[] word, int length) {
-    lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
+  private char[] caseFoldLower(char[] word, int length) {
+    char[] lowerBuffer = new char[length];
     System.arraycopy(word, 0, lowerBuffer, 0, length);
     lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
+    return lowerBuffer;
   }
 
   // Special prefix handling for Catalan, French, Italian:
@@ -315,7 +298,7 @@
 
     String exception = stemException(morphDataId);
     if (dictionary.oconv != null) {
-      scratchSegment.setLength(0);
+      StringBuilder scratchSegment = new StringBuilder();
       if (exception != null) {
         scratchSegment.append(exception);
       } else {
@@ -334,16 +317,6 @@
     }
   }
 
-  // some state for traversing FSTs
-  private final FST.BytesReader prefixReader;
-  private final FST.BytesReader suffixReader;
-
-  @SuppressWarnings({"unchecked", "rawtypes"})
-  private final FST.Arc<IntsRef>[] prefixArcs = new FST.Arc[3];
-
-  @SuppressWarnings({"unchecked", "rawtypes"})
-  private final FST.Arc<IntsRef>[] suffixArcs = new FST.Arc[3];
-
   /**
    * Generates a list of stems for the provided word
    *
@@ -372,15 +345,16 @@
       boolean doPrefix,
       boolean previousWasPrefix,
       RootProcessor processor) {
+    FST.Arc<IntsRef> arc = new FST.Arc<>();
     if (doPrefix && dictionary.prefixes != null) {
       FST<IntsRef> fst = dictionary.prefixes;
-      FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
+      FST.BytesReader reader = fst.getBytesReader();
       fst.getFirstArc(arc);
       IntsRef output = fst.outputs.getNoOutput();
       int limit = dictionary.fullStrip ? length + 1 : length;
       for (int i = 0; i < limit; i++) {
         if (i > 0) {
-          output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
+          output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i - 1]);
           if (output == null) {
             break;
           }
@@ -423,13 +397,13 @@
 
     if (dictionary.suffixes != null) {
       FST<IntsRef> fst = dictionary.suffixes;
-      FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
+      FST.BytesReader reader = fst.getBytesReader();
       fst.getFirstArc(arc);
       IntsRef output = fst.outputs.getNoOutput();
       int limit = dictionary.fullStrip ? 0 : 1;
       for (int i = length; i >= limit; i--) {
         if (i < length) {
-          output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
+          output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i]);
           if (output == null) {
             break;
           }
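All of the Stemmer changes above follow one pattern: mutable fields that were reused between calls (the StringBuilder and char[] scratch buffers, the cached FST.BytesReader and FST.Arc state) are replaced by locals allocated inside each call, so the Stemmer no longer carries mutable state across invocations. A generic sketch of that refactoring, using hypothetical classes rather than the Hunspell code itself:

// Before: a reused scratch buffer makes concurrent calls corrupt each other.
class UnsafeGreeter {
  private final StringBuilder scratch = new StringBuilder();

  String greet(String name) {
    scratch.setLength(0); // shared state: racy when called from two threads
    return scratch.append("Hello, ").append(name).append('!').toString();
  }
}

// After: the buffer is allocated per call, so the object holds no mutable state.
class SafeGreeter {
  String greet(String name) {
    StringBuilder scratch = new StringBuilder();
    return scratch.append("Hello, ").append(name).append('!').toString();
  }
}

The trade-off, as in the diff, is a small per-call allocation in place of buffer reuse.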

TestPerformance.java

@@ -28,11 +28,15 @@ import java.nio.file.Paths;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Consumer;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.NamedThreadFactory;
 import org.junit.Assume;
 import org.junit.AssumptionViolatedException;
 import org.junit.BeforeClass;
@@ -106,26 +110,55 @@ public class TestPerformance extends LuceneTestCase {
     Dictionary dictionary = loadDictionary(code);
     List<String> words = loadWords(code, wordCount, dictionary);
+    List<String> halfWords = words.subList(0, words.size() / 2);
 
     Stemmer stemmer = new Stemmer(dictionary);
     Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
-    measure(
-        "Stemming " + code,
-        blackHole -> {
-          for (String word : words) {
-            blackHole.accept(stemmer.stem(word));
-          }
-        });
-    measure(
-        "Spellchecking " + code,
-        blackHole -> {
-          for (String word : words) {
-            blackHole.accept(speller.spell(word));
-          }
-        });
+    int cpus = Runtime.getRuntime().availableProcessors();
+    ExecutorService executor =
+        Executors.newFixedThreadPool(cpus, new NamedThreadFactory("hunspellStemming-"));
+
+    try {
+      measure("Stemming " + code, blackHole -> stemWords(words, stemmer, blackHole));
+
+      measure(
+          "Multi-threaded stemming " + code,
+          blackHole -> {
+            List<Future<?>> futures = new ArrayList<>();
+            for (int i = 0; i < cpus; i++) {
+              Stemmer localStemmer = new Stemmer(dictionary);
+              futures.add(executor.submit(() -> stemWords(halfWords, localStemmer, blackHole)));
+            }
+            try {
+              for (Future<?> future : futures) {
+                future.get();
+              }
+            } catch (Exception e) {
+              throw new RuntimeException(e);
+            }
+          });
+
+      measure(
+          "Spellchecking " + code,
+          blackHole -> {
+            for (String word : words) {
+              blackHole.accept(speller.spell(word));
+            }
+          });
+    } finally {
+      executor.shutdown();
+      assertTrue(executor.awaitTermination(1, TimeUnit.MINUTES));
+    }
     System.out.println();
   }
 
+  private void stemWords(List<String> words, Stemmer stemmer, Consumer<Object> blackHole) {
+    for (String word : words) {
+      blackHole.accept(stemmer.stem(word));
+    }
+  }
+
   private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
     Dictionary dictionary = loadDictionary(code);
     Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});