mirror of https://github.com/apache/lucene.git
LUCENE-9852: Make Hunspell thread-safe (#24)
This commit is contained in:
parent
5b36af3cd7
commit
28edbf8fc6
|
@ -48,8 +48,7 @@ import org.apache.lucene.util.IntsRef;
|
||||||
* <li>PHONE affix file option for suggestions
|
* <li>PHONE affix file option for suggestions
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
* <p>The objects of this class are not thread-safe (but a single underlying Dictionary can be
|
* <p>The objects of this class are thread-safe.
|
||||||
* shared by multiple spell-checkers in different threads).
|
|
||||||
*/
|
*/
|
||||||
public class Hunspell {
|
public class Hunspell {
|
||||||
static final long SUGGEST_TIME_LIMIT = 250;
|
static final long SUGGEST_TIME_LIMIT = 250;
|
||||||
|
|
|
@ -33,11 +33,6 @@ import org.apache.lucene.util.fst.FST;
|
||||||
*/
|
*/
|
||||||
final class Stemmer {
|
final class Stemmer {
|
||||||
private final Dictionary dictionary;
|
private final Dictionary dictionary;
|
||||||
private final StringBuilder segment = new StringBuilder();
|
|
||||||
|
|
||||||
// used for normalization
|
|
||||||
private final StringBuilder scratchSegment = new StringBuilder();
|
|
||||||
private char[] scratchBuffer = new char[32];
|
|
||||||
|
|
||||||
// it's '1' if we have no stem exceptions, otherwise every other form
|
// it's '1' if we have no stem exceptions, otherwise every other form
|
||||||
// is really an ID pointing to the exception table
|
// is really an ID pointing to the exception table
|
||||||
|
@ -50,16 +45,6 @@ final class Stemmer {
|
||||||
*/
|
*/
|
||||||
public Stemmer(Dictionary dictionary) {
|
public Stemmer(Dictionary dictionary) {
|
||||||
this.dictionary = dictionary;
|
this.dictionary = dictionary;
|
||||||
prefixReader = dictionary.prefixes == null ? null : dictionary.prefixes.getBytesReader();
|
|
||||||
suffixReader = dictionary.suffixes == null ? null : dictionary.suffixes.getBytesReader();
|
|
||||||
for (int level = 0; level < 3; level++) {
|
|
||||||
if (dictionary.prefixes != null) {
|
|
||||||
prefixArcs[level] = new FST.Arc<>();
|
|
||||||
}
|
|
||||||
if (dictionary.suffixes != null) {
|
|
||||||
suffixArcs[level] = new FST.Arc<>();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
formStep = dictionary.formStep();
|
formStep = dictionary.formStep();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -82,11 +67,11 @@ final class Stemmer {
|
||||||
public List<CharsRef> stem(char[] word, int length) {
|
public List<CharsRef> stem(char[] word, int length) {
|
||||||
|
|
||||||
if (dictionary.mayNeedInputCleaning()) {
|
if (dictionary.mayNeedInputCleaning()) {
|
||||||
scratchSegment.setLength(0);
|
CharsRef scratchSegment = new CharsRef(word, 0, length);
|
||||||
scratchSegment.append(word, 0, length);
|
|
||||||
if (dictionary.needsInputCleaning(scratchSegment)) {
|
if (dictionary.needsInputCleaning(scratchSegment)) {
|
||||||
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
|
StringBuilder segment = new StringBuilder();
|
||||||
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
|
dictionary.cleanInput(scratchSegment, segment);
|
||||||
|
char[] scratchBuffer = new char[segment.length()];
|
||||||
length = segment.length();
|
length = segment.length();
|
||||||
segment.getChars(0, length, scratchBuffer, 0);
|
segment.getChars(0, length, scratchBuffer, 0);
|
||||||
word = scratchBuffer;
|
word = scratchBuffer;
|
||||||
|
@ -122,8 +107,8 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
|
boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
|
||||||
|
char[] titleBuffer = wordCase == WordCase.UPPER ? caseFoldTitle(word, length) : null;
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER) {
|
||||||
caseFoldTitle(word, length);
|
|
||||||
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
||||||
if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
|
if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -140,7 +125,7 @@ final class Stemmer {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
char[] lowerBuffer = caseFoldLower(titleBuffer != null ? titleBuffer : word, length);
|
||||||
if (!processor.process(lowerBuffer, length, wordCase)) {
|
if (!processor.process(lowerBuffer, length, wordCase)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -152,10 +137,6 @@ final class Stemmer {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// temporary buffers for case variants
|
|
||||||
private char[] lowerBuffer = new char[8];
|
|
||||||
private char[] titleBuffer = new char[8];
|
|
||||||
|
|
||||||
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
|
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
|
||||||
WordCase caseOf(char[] word, int length) {
|
WordCase caseOf(char[] word, int length) {
|
||||||
if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
|
if (dictionary.ignoreCase || length == 0 || Character.isLowerCase(word[0])) {
|
||||||
|
@ -166,19 +147,21 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** folds titlecase variant of word to titleBuffer */
|
/** folds titlecase variant of word to titleBuffer */
|
||||||
private void caseFoldTitle(char[] word, int length) {
|
private char[] caseFoldTitle(char[] word, int length) {
|
||||||
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
char[] titleBuffer = new char[length];
|
||||||
System.arraycopy(word, 0, titleBuffer, 0, length);
|
System.arraycopy(word, 0, titleBuffer, 0, length);
|
||||||
for (int i = 1; i < length; i++) {
|
for (int i = 1; i < length; i++) {
|
||||||
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
|
||||||
}
|
}
|
||||||
|
return titleBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
||||||
private void caseFoldLower(char[] word, int length) {
|
private char[] caseFoldLower(char[] word, int length) {
|
||||||
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
char[] lowerBuffer = new char[length];
|
||||||
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
||||||
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
||||||
|
return lowerBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special prefix handling for Catalan, French, Italian:
|
// Special prefix handling for Catalan, French, Italian:
|
||||||
|
@ -315,7 +298,7 @@ final class Stemmer {
|
||||||
String exception = stemException(morphDataId);
|
String exception = stemException(morphDataId);
|
||||||
|
|
||||||
if (dictionary.oconv != null) {
|
if (dictionary.oconv != null) {
|
||||||
scratchSegment.setLength(0);
|
StringBuilder scratchSegment = new StringBuilder();
|
||||||
if (exception != null) {
|
if (exception != null) {
|
||||||
scratchSegment.append(exception);
|
scratchSegment.append(exception);
|
||||||
} else {
|
} else {
|
||||||
|
@ -334,16 +317,6 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// some state for traversing FSTs
|
|
||||||
private final FST.BytesReader prefixReader;
|
|
||||||
private final FST.BytesReader suffixReader;
|
|
||||||
|
|
||||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
|
||||||
private final FST.Arc<IntsRef>[] prefixArcs = new FST.Arc[3];
|
|
||||||
|
|
||||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
|
||||||
private final FST.Arc<IntsRef>[] suffixArcs = new FST.Arc[3];
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates a list of stems for the provided word
|
* Generates a list of stems for the provided word
|
||||||
*
|
*
|
||||||
|
@ -372,15 +345,16 @@ final class Stemmer {
|
||||||
boolean doPrefix,
|
boolean doPrefix,
|
||||||
boolean previousWasPrefix,
|
boolean previousWasPrefix,
|
||||||
RootProcessor processor) {
|
RootProcessor processor) {
|
||||||
|
FST.Arc<IntsRef> arc = new FST.Arc<>();
|
||||||
if (doPrefix && dictionary.prefixes != null) {
|
if (doPrefix && dictionary.prefixes != null) {
|
||||||
FST<IntsRef> fst = dictionary.prefixes;
|
FST<IntsRef> fst = dictionary.prefixes;
|
||||||
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
|
FST.BytesReader reader = fst.getBytesReader();
|
||||||
fst.getFirstArc(arc);
|
fst.getFirstArc(arc);
|
||||||
IntsRef output = fst.outputs.getNoOutput();
|
IntsRef output = fst.outputs.getNoOutput();
|
||||||
int limit = dictionary.fullStrip ? length + 1 : length;
|
int limit = dictionary.fullStrip ? length + 1 : length;
|
||||||
for (int i = 0; i < limit; i++) {
|
for (int i = 0; i < limit; i++) {
|
||||||
if (i > 0) {
|
if (i > 0) {
|
||||||
output = Dictionary.nextArc(fst, arc, prefixReader, output, word[offset + i - 1]);
|
output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i - 1]);
|
||||||
if (output == null) {
|
if (output == null) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -423,13 +397,13 @@ final class Stemmer {
|
||||||
|
|
||||||
if (dictionary.suffixes != null) {
|
if (dictionary.suffixes != null) {
|
||||||
FST<IntsRef> fst = dictionary.suffixes;
|
FST<IntsRef> fst = dictionary.suffixes;
|
||||||
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
|
FST.BytesReader reader = fst.getBytesReader();
|
||||||
fst.getFirstArc(arc);
|
fst.getFirstArc(arc);
|
||||||
IntsRef output = fst.outputs.getNoOutput();
|
IntsRef output = fst.outputs.getNoOutput();
|
||||||
int limit = dictionary.fullStrip ? 0 : 1;
|
int limit = dictionary.fullStrip ? 0 : 1;
|
||||||
for (int i = length; i >= limit; i--) {
|
for (int i = length; i >= limit; i--) {
|
||||||
if (i < length) {
|
if (i < length) {
|
||||||
output = Dictionary.nextArc(fst, arc, suffixReader, output, word[offset + i]);
|
output = Dictionary.nextArc(fst, arc, reader, output, word[offset + i]);
|
||||||
if (output == null) {
|
if (output == null) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,11 +28,15 @@ import java.nio.file.Paths;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
import org.apache.lucene.util.NamedThreadFactory;
|
||||||
import org.junit.Assume;
|
import org.junit.Assume;
|
||||||
import org.junit.AssumptionViolatedException;
|
import org.junit.AssumptionViolatedException;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
@ -106,16 +110,34 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
Dictionary dictionary = loadDictionary(code);
|
Dictionary dictionary = loadDictionary(code);
|
||||||
|
|
||||||
List<String> words = loadWords(code, wordCount, dictionary);
|
List<String> words = loadWords(code, wordCount, dictionary);
|
||||||
|
List<String> halfWords = words.subList(0, words.size() / 2);
|
||||||
|
|
||||||
Stemmer stemmer = new Stemmer(dictionary);
|
Stemmer stemmer = new Stemmer(dictionary);
|
||||||
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.NO_TIMEOUT, () -> {});
|
||||||
|
int cpus = Runtime.getRuntime().availableProcessors();
|
||||||
|
ExecutorService executor =
|
||||||
|
Executors.newFixedThreadPool(cpus, new NamedThreadFactory("hunspellStemming-"));
|
||||||
|
|
||||||
|
try {
|
||||||
|
measure("Stemming " + code, blackHole -> stemWords(words, stemmer, blackHole));
|
||||||
|
|
||||||
measure(
|
measure(
|
||||||
"Stemming " + code,
|
"Multi-threaded stemming " + code,
|
||||||
blackHole -> {
|
blackHole -> {
|
||||||
for (String word : words) {
|
List<Future<?>> futures = new ArrayList<>();
|
||||||
blackHole.accept(stemmer.stem(word));
|
for (int i = 0; i < cpus; i++) {
|
||||||
|
Stemmer localStemmer = new Stemmer(dictionary);
|
||||||
|
futures.add(executor.submit(() -> stemWords(halfWords, localStemmer, blackHole)));
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
for (Future<?> future : futures) {
|
||||||
|
future.get();
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
measure(
|
measure(
|
||||||
"Spellchecking " + code,
|
"Spellchecking " + code,
|
||||||
blackHole -> {
|
blackHole -> {
|
||||||
|
@ -123,9 +145,20 @@ public class TestPerformance extends LuceneTestCase {
|
||||||
blackHole.accept(speller.spell(word));
|
blackHole.accept(speller.spell(word));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
} finally {
|
||||||
|
executor.shutdown();
|
||||||
|
assertTrue(executor.awaitTermination(1, TimeUnit.MINUTES));
|
||||||
|
}
|
||||||
|
|
||||||
System.out.println();
|
System.out.println();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void stemWords(List<String> words, Stemmer stemmer, Consumer<Object> blackHole) {
|
||||||
|
for (String word : words) {
|
||||||
|
blackHole.accept(stemmer.stem(word));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
private void checkSuggestionPerformance(String code, int wordCount) throws Exception {
|
||||||
Dictionary dictionary = loadDictionary(code);
|
Dictionary dictionary = loadDictionary(code);
|
||||||
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
Hunspell speller = new Hunspell(dictionary, TimeoutPolicy.THROW_EXCEPTION, () -> {});
|
||||||
|
|
Loading…
Reference in New Issue