LUCENE-9726: Hunspell: speed up spellchecking by stopping at a single… (#2295)

This commit is contained in:
Peter Gromov 2021-02-04 09:13:11 +01:00 committed by GitHub
parent e2cf6ee74d
commit 04167b27f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 170 additions and 131 deletions

View File

@ -16,7 +16,6 @@
*/ */
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
import java.util.List;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
@ -58,10 +57,9 @@ class CheckCompoundPattern {
} }
boolean prohibitsCompounding( boolean prohibitsCompounding(
CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) { CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
if (isNonAffixedPattern(endChars)) { if (isNonAffixedPattern(endChars)) {
if (stemsBefore.stream() if (!charsMatch(word, breakPos - stemBefore.length, stemBefore.chars)) {
.noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
return false; return false;
} }
} else if (!charsMatch(word, breakPos - endChars.length, endChars)) { } else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
@ -69,18 +67,18 @@ class CheckCompoundPattern {
} }
if (isNonAffixedPattern(beginChars)) { if (isNonAffixedPattern(beginChars)) {
if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) { if (!charsMatch(word, breakPos, stemAfter.chars)) {
return false; return false;
} }
} else if (!charsMatch(word, breakPos, beginChars)) { } else if (!charsMatch(word, breakPos, beginChars)) {
return false; return false;
} }
if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) { if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
return false; return false;
} }
//noinspection RedundantIfStatement //noinspection RedundantIfStatement
if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) { if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
return false; return false;
} }
@ -91,14 +89,9 @@ class CheckCompoundPattern {
return pattern.length == 1 && pattern[0] == '0'; return pattern.length == 1 && pattern[0] == '0';
} }
private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) { private boolean stemHasFlags(CharsRef stem, char[] flags) {
for (CharsRef stem : stems) { IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length); return forms != null && hasAllFlags(flags, forms);
if (forms != null && hasAllFlags(flags, forms)) {
return true;
}
}
return false;
} }
private boolean hasAllFlags(char[] flags, IntsRef forms) { private boolean hasAllFlags(char[] flags, IntsRef forms) {

View File

@ -140,7 +140,7 @@ public class SpellChecker {
return false; return false;
} }
if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) { if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
return true; return true;
} }
@ -156,8 +156,24 @@ public class SpellChecker {
return false; return false;
} }
/**
 * Asks the stemmer for stems of the given character range and keeps only the first one it
 * reports.
 *
 * @param wordChars buffer holding the characters to analyze
 * @param offset start of the range inside {@code wordChars}
 * @param length number of characters in the range
 * @param originalCase case information passed through to the stemmer unchanged
 * @param context word context passed through to the stemmer unchanged
 * @return the stem handed to the callback, or {@code null} if the callback was never invoked
 */
private CharsRef findStem(
    char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
  // One-element array: a mutable slot that the lambda below is allowed to write into.
  CharsRef[] firstFound = new CharsRef[1];
  stemmer.doStem(
      wordChars,
      offset,
      length,
      originalCase,
      context,
      (candidate, forms, formID) -> {
        firstFound[0] = candidate;
        // Returning false tells the stemmer not to continue: a single stem is enough here.
        return false;
      });
  return firstFound[0];
}
private boolean checkCompounds( private boolean checkCompounds(
CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) { CharsRef word, WordCase originalCase, int depth, Predicate<CharsRef> checkPatterns) {
if (depth > dictionary.compoundMax - 2) return false; if (depth > dictionary.compoundMax - 2) return false;
int limit = word.length - dictionary.compoundMin + 1; int limit = word.length - dictionary.compoundMin + 1;
@ -165,16 +181,15 @@ public class SpellChecker {
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE; WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakOffset = word.offset + breakPos; int breakOffset = word.offset + breakPos;
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) { if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
List<CharsRef> stems = CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context); if (stem == null
if (stems.isEmpty()
&& dictionary.simplifiedTriple && dictionary.simplifiedTriple
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) { && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context); stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
} }
if (!stems.isEmpty() && checkPatterns.test(stems)) { if (stem != null && checkPatterns.test(stem)) {
Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems); Predicate<CharsRef> nextCheck = checkNextPatterns(word, breakPos, stem);
if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) { if (checkCompoundsAfter(word, breakPos, originalCase, depth, stem, nextCheck)) {
return true; return true;
} }
} }
@ -195,12 +210,11 @@ public class SpellChecker {
if (expanded != null) { if (expanded != null) {
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE; WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
int breakPos = pos + pattern.endLength(); int breakPos = pos + pattern.endLength();
List<CharsRef> stems = CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context); if (stem != null) {
if (!stems.isEmpty()) { Predicate<CharsRef> nextCheck =
Predicate<List<CharsRef>> nextCheck = next -> pattern.prohibitsCompounding(expanded, breakPos, stem, next);
next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next); if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stem, nextCheck)) {
if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
return true; return true;
} }
} }
@ -209,11 +223,10 @@ public class SpellChecker {
return false; return false;
} }
private Predicate<List<CharsRef>> checkNextPatterns( private Predicate<CharsRef> checkNextPatterns(CharsRef word, int breakPos, CharsRef stems) {
CharsRef word, int breakPos, List<CharsRef> stems) { return nextStem ->
return nextStems ->
dictionary.checkCompoundPatterns.stream() dictionary.checkCompoundPatterns.stream()
.noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems)); .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStem));
} }
private boolean checkCompoundsAfter( private boolean checkCompoundsAfter(
@ -221,16 +234,16 @@ public class SpellChecker {
int breakPos, int breakPos,
WordCase originalCase, WordCase originalCase,
int depth, int depth,
List<CharsRef> prevStems, CharsRef prevStem,
Predicate<List<CharsRef>> checkPatterns) { Predicate<CharsRef> checkPatterns) {
int remainingLength = word.length - breakPos; int remainingLength = word.length - breakPos;
int breakOffset = word.offset + breakPos; int breakOffset = word.offset + breakPos;
List<CharsRef> tailStems = CharsRef tailStem =
stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END); findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
if (!tailStems.isEmpty() if (tailStem != null
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems)) && !(dictionary.checkCompoundDup && equalsIgnoreCase(prevStem, tailStem))
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase) && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
&& checkPatterns.test(tailStems)) { && checkPatterns.test(tailStem)) {
return true; return true;
} }

View File

@ -100,24 +100,41 @@ final class Stemmer {
return Collections.emptyList(); return Collections.emptyList();
} }
List<CharsRef> list = new ArrayList<>();
RootProcessor processor =
(stem, forms, formID) -> {
list.add(newStem(stem, forms, formID));
return true;
};
if (!doStem(word, 0, length, null, WordContext.SIMPLE_WORD, processor)) {
return list;
}
WordCase wordCase = caseOf(word, length); WordCase wordCase = caseOf(word, length);
List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) { if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
addCaseVariations(word, length, wordCase, list); addCaseVariations(word, length, wordCase, processor);
} }
return list; return list;
} }
private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) { private void addCaseVariations(
char[] word, int length, WordCase wordCase, RootProcessor processor) {
if (wordCase == WordCase.UPPER) { if (wordCase == WordCase.UPPER) {
caseFoldTitle(word, length); caseFoldTitle(word, length);
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length); char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
if (aposCase != null) { if (aposCase != null) {
list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD)); if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
return;
}
}
if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
return;
} }
list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
for (char[] variation : sharpSVariations(titleBuffer, length)) { for (char[] variation : sharpSVariations(titleBuffer, length)) {
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD)); if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
return;
}
} }
} }
@ -126,10 +143,14 @@ final class Stemmer {
} }
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length); caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD)); if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
return;
}
if (wordCase == WordCase.UPPER) { if (wordCase == WordCase.UPPER) {
for (char[] variation : sharpSVariations(lowerBuffer, length)) { for (char[] variation : sharpSVariations(lowerBuffer, length)) {
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD)); if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
return;
}
} }
} }
} }
@ -220,9 +241,13 @@ final class Stemmer {
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList()); return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
} }
List<CharsRef> doStem( boolean doStem(
char[] word, int offset, int length, WordCase originalCase, WordContext context) { char[] word,
List<CharsRef> stems = new ArrayList<>(); int offset,
int length,
WordCase originalCase,
WordContext context,
RootProcessor processor) {
IntsRef forms = dictionary.lookupWord(word, offset, length); IntsRef forms = dictionary.lookupWord(word, offset, length);
if (forms != null) { if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) { for (int i = 0; i < forms.length; i += formStep) {
@ -241,36 +266,37 @@ final class Stemmer {
if (context.isCompound()) { if (context.isCompound()) {
if (context != WordContext.COMPOUND_END if (context != WordContext.COMPOUND_END
&& Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) { && Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
return new ArrayList<>(); return false;
} }
if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag) if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) { && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
continue; continue;
} }
} }
stems.add(newStem(word, offset, length, forms, i)); if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) {
return false;
}
} }
} }
try { try {
stems.addAll( return stem(
stem( word,
word, offset,
offset, length,
length, context,
context, -1,
-1, Dictionary.FLAG_UNSET,
Dictionary.FLAG_UNSET, -1,
-1, 0,
0, true,
true, true,
true, false,
false, false,
false, originalCase,
originalCase)); processor);
} catch (IOException bogus) { } catch (IOException bogus) {
throw new RuntimeException(bogus); throw new RuntimeException(bogus);
} }
return stems;
} }
private boolean acceptCase( private boolean acceptCase(
@ -319,7 +345,12 @@ final class Stemmer {
return deduped; return deduped;
} }
private CharsRef newStem(char[] buffer, int offset, int length, IntsRef forms, int formID) { interface RootProcessor {
/** @return whether the processing should be continued */
boolean processRoot(CharsRef stem, IntsRef forms, int formID);
}
private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) {
final String exception; final String exception;
if (dictionary.hasStemExceptions) { if (dictionary.hasStemExceptions) {
int exceptionID = forms.ints[forms.offset + formID + 1]; int exceptionID = forms.ints[forms.offset + formID + 1];
@ -337,7 +368,7 @@ final class Stemmer {
if (exception != null) { if (exception != null) {
scratchSegment.append(exception); scratchSegment.append(exception);
} else { } else {
scratchSegment.append(buffer, offset, length); scratchSegment.append(stem.chars, stem.offset, stem.length);
} }
try { try {
Dictionary.applyMappings(dictionary.oconv, scratchSegment); Dictionary.applyMappings(dictionary.oconv, scratchSegment);
@ -351,7 +382,7 @@ final class Stemmer {
if (exception != null) { if (exception != null) {
return new CharsRef(exception); return new CharsRef(exception);
} else { } else {
return new CharsRef(buffer, offset, length); return stem;
} }
} }
} }
@ -385,9 +416,9 @@ final class Stemmer {
* most suffix must also contain circumfix flag. * most suffix must also contain circumfix flag.
* @param originalCase if non-null, represents original word case to disallow case variations of * @param originalCase if non-null, represents original word case to disallow case variations of
* word with KEEPCASE flags * word with KEEPCASE flags
* @return List of stems, or empty list if no stems are found * @return whether the processing should be continued
*/ */
private List<CharsRef> stem( private boolean stem(
char[] word, char[] word,
int offset, int offset,
int length, int length,
@ -400,12 +431,9 @@ final class Stemmer {
boolean doSuffix, boolean doSuffix,
boolean previousWasPrefix, boolean previousWasPrefix,
boolean circumfix, boolean circumfix,
WordCase originalCase) WordCase originalCase,
RootProcessor processor)
throws IOException { throws IOException {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<>();
if (doPrefix && dictionary.prefixes != null) { if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes; FST<IntsRef> fst = dictionary.prefixes;
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth]; FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
@ -440,19 +468,21 @@ final class Stemmer {
} }
boolean pureAffix = strippedWord == word; boolean pureAffix = strippedWord == word;
stems.addAll( if (!applyAffix(
applyAffix( strippedWord,
strippedWord, pureAffix ? offset + i : 0,
pureAffix ? offset + i : 0, pureAffix ? length - i : strippedWord.length,
pureAffix ? length - i : strippedWord.length, context,
context, prefix,
prefix, previous,
previous, -1,
-1, recursionDepth,
recursionDepth, true,
true, circumfix,
circumfix, originalCase,
originalCase)); processor)) {
return false;
}
} }
} }
} }
@ -493,25 +523,27 @@ final class Stemmer {
} }
boolean pureAffix = strippedWord == word; boolean pureAffix = strippedWord == word;
stems.addAll( if (!applyAffix(
applyAffix( strippedWord,
strippedWord, pureAffix ? offset : 0,
pureAffix ? offset : 0, pureAffix ? i : strippedWord.length,
pureAffix ? i : strippedWord.length, context,
context, suffix,
suffix, previous,
previous, prefixId,
prefixId, recursionDepth,
recursionDepth, false,
false, circumfix,
circumfix, originalCase,
originalCase)); processor)) {
return false;
}
} }
} }
} }
} }
return stems; return true;
} }
/** /**
@ -632,9 +664,9 @@ final class Stemmer {
* as a stem! * as a stem!
* @param recursionDepth current recursion depth * @param recursionDepth current recursion depth
* @param prefix true if we are removing a prefix (false if it's a suffix) * @param prefix true if we are removing a prefix (false if it's a suffix)
* @return List of stems for the word, or an empty list if none are found * @return whether the processing should be continued
*/ */
private List<CharsRef> applyAffix( private boolean applyAffix(
char[] strippedWord, char[] strippedWord,
int offset, int offset,
int length, int length,
@ -645,12 +677,11 @@ final class Stemmer {
int recursionDepth, int recursionDepth,
boolean prefix, boolean prefix,
boolean circumfix, boolean circumfix,
WordCase originalCase) WordCase originalCase,
RootProcessor processor)
throws IOException { throws IOException {
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG); char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
List<CharsRef> stems = new ArrayList<>();
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix); boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix);
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length); IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
if (forms != null) { if (forms != null) {
@ -694,7 +725,9 @@ final class Stemmer {
continue; continue;
} }
} }
stems.add(newStem(strippedWord, offset, length, forms, i)); if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) {
return false;
}
} }
} }
} }
@ -720,7 +753,7 @@ final class Stemmer {
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix // COMPLEXPREFIXES = false: combine with another suffix
} else { } else {
return stems; return true;
} }
} else { } else {
doPrefix = false; doPrefix = false;
@ -728,29 +761,29 @@ final class Stemmer {
prefixId = affix; prefixId = affix;
// we took away the second prefix: go look for another suffix // we took away the second prefix: go look for another suffix
} else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) { } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
return stems; return true;
} }
// we took away a prefix, then a suffix: go look for another suffix // we took away a prefix, then a suffix: go look for another suffix
} }
stems.addAll( return stem(
stem( strippedWord,
strippedWord, offset,
offset, length,
length, context,
context, affix,
affix, flag,
flag, prefixId,
prefixId, recursionDepth + 1,
recursionDepth + 1, doPrefix,
doPrefix, true,
true, prefix,
prefix, circumfix,
circumfix, originalCase,
originalCase)); processor);
} }
return stems; return true;
} }
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) { private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) {