mirror of https://github.com/apache/lucene.git
LUCENE-9726: Hunspell: speed up spellchecking by stopping at a single successful decomposition (#2295)
This commit is contained in:
parent
e2cf6ee74d
commit
04167b27f5
|
@ -16,7 +16,6 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.lucene.analysis.hunspell;
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CharsRef;
|
import org.apache.lucene.util.CharsRef;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
|
@ -58,10 +57,9 @@ class CheckCompoundPattern {
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean prohibitsCompounding(
|
boolean prohibitsCompounding(
|
||||||
CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
|
CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
|
||||||
if (isNonAffixedPattern(endChars)) {
|
if (isNonAffixedPattern(endChars)) {
|
||||||
if (stemsBefore.stream()
|
if (!charsMatch(word, breakPos - stemBefore.length, stemBefore.chars)) {
|
||||||
.noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
|
} else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
|
||||||
|
@ -69,18 +67,18 @@ class CheckCompoundPattern {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isNonAffixedPattern(beginChars)) {
|
if (isNonAffixedPattern(beginChars)) {
|
||||||
if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
|
if (!charsMatch(word, breakPos, stemAfter.chars)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else if (!charsMatch(word, breakPos, beginChars)) {
|
} else if (!charsMatch(word, breakPos, beginChars)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
|
if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
//noinspection RedundantIfStatement
|
//noinspection RedundantIfStatement
|
||||||
if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
|
if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,14 +89,9 @@ class CheckCompoundPattern {
|
||||||
return pattern.length == 1 && pattern[0] == '0';
|
return pattern.length == 1 && pattern[0] == '0';
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
|
private boolean stemHasFlags(CharsRef stem, char[] flags) {
|
||||||
for (CharsRef stem : stems) {
|
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
|
||||||
IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
|
return forms != null && hasAllFlags(flags, forms);
|
||||||
if (forms != null && hasAllFlags(flags, forms)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasAllFlags(char[] flags, IntsRef forms) {
|
private boolean hasAllFlags(char[] flags, IntsRef forms) {
|
||||||
|
|
|
@ -140,7 +140,7 @@ public class SpellChecker {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
|
if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -156,8 +156,24 @@ public class SpellChecker {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private CharsRef findStem(
|
||||||
|
char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
|
||||||
|
CharsRef[] result = {null};
|
||||||
|
stemmer.doStem(
|
||||||
|
wordChars,
|
||||||
|
offset,
|
||||||
|
length,
|
||||||
|
originalCase,
|
||||||
|
context,
|
||||||
|
(stem, forms, formID) -> {
|
||||||
|
result[0] = stem;
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
return result[0];
|
||||||
|
}
|
||||||
|
|
||||||
private boolean checkCompounds(
|
private boolean checkCompounds(
|
||||||
CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
|
CharsRef word, WordCase originalCase, int depth, Predicate<CharsRef> checkPatterns) {
|
||||||
if (depth > dictionary.compoundMax - 2) return false;
|
if (depth > dictionary.compoundMax - 2) return false;
|
||||||
|
|
||||||
int limit = word.length - dictionary.compoundMin + 1;
|
int limit = word.length - dictionary.compoundMin + 1;
|
||||||
|
@ -165,16 +181,15 @@ public class SpellChecker {
|
||||||
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
||||||
int breakOffset = word.offset + breakPos;
|
int breakOffset = word.offset + breakPos;
|
||||||
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
|
if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
|
||||||
List<CharsRef> stems =
|
CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
|
||||||
stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
|
if (stem == null
|
||||||
if (stems.isEmpty()
|
|
||||||
&& dictionary.simplifiedTriple
|
&& dictionary.simplifiedTriple
|
||||||
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) {
|
&& word.chars[breakOffset - 1] == word.chars[breakOffset]) {
|
||||||
stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
|
stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
|
||||||
}
|
}
|
||||||
if (!stems.isEmpty() && checkPatterns.test(stems)) {
|
if (stem != null && checkPatterns.test(stem)) {
|
||||||
Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
|
Predicate<CharsRef> nextCheck = checkNextPatterns(word, breakPos, stem);
|
||||||
if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
|
if (checkCompoundsAfter(word, breakPos, originalCase, depth, stem, nextCheck)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -195,12 +210,11 @@ public class SpellChecker {
|
||||||
if (expanded != null) {
|
if (expanded != null) {
|
||||||
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
|
||||||
int breakPos = pos + pattern.endLength();
|
int breakPos = pos + pattern.endLength();
|
||||||
List<CharsRef> stems =
|
CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
|
||||||
stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
|
if (stem != null) {
|
||||||
if (!stems.isEmpty()) {
|
Predicate<CharsRef> nextCheck =
|
||||||
Predicate<List<CharsRef>> nextCheck =
|
next -> pattern.prohibitsCompounding(expanded, breakPos, stem, next);
|
||||||
next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
|
if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stem, nextCheck)) {
|
||||||
if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -209,11 +223,10 @@ public class SpellChecker {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Predicate<List<CharsRef>> checkNextPatterns(
|
private Predicate<CharsRef> checkNextPatterns(CharsRef word, int breakPos, CharsRef stems) {
|
||||||
CharsRef word, int breakPos, List<CharsRef> stems) {
|
return nextStem ->
|
||||||
return nextStems ->
|
|
||||||
dictionary.checkCompoundPatterns.stream()
|
dictionary.checkCompoundPatterns.stream()
|
||||||
.noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
|
.noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStem));
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean checkCompoundsAfter(
|
private boolean checkCompoundsAfter(
|
||||||
|
@ -221,16 +234,16 @@ public class SpellChecker {
|
||||||
int breakPos,
|
int breakPos,
|
||||||
WordCase originalCase,
|
WordCase originalCase,
|
||||||
int depth,
|
int depth,
|
||||||
List<CharsRef> prevStems,
|
CharsRef prevStem,
|
||||||
Predicate<List<CharsRef>> checkPatterns) {
|
Predicate<CharsRef> checkPatterns) {
|
||||||
int remainingLength = word.length - breakPos;
|
int remainingLength = word.length - breakPos;
|
||||||
int breakOffset = word.offset + breakPos;
|
int breakOffset = word.offset + breakPos;
|
||||||
List<CharsRef> tailStems =
|
CharsRef tailStem =
|
||||||
stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
|
||||||
if (!tailStems.isEmpty()
|
if (tailStem != null
|
||||||
&& !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
|
&& !(dictionary.checkCompoundDup && equalsIgnoreCase(prevStem, tailStem))
|
||||||
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
|
&& !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
|
||||||
&& checkPatterns.test(tailStems)) {
|
&& checkPatterns.test(tailStem)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -100,24 +100,41 @@ final class Stemmer {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<CharsRef> list = new ArrayList<>();
|
||||||
|
RootProcessor processor =
|
||||||
|
(stem, forms, formID) -> {
|
||||||
|
list.add(newStem(stem, forms, formID));
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!doStem(word, 0, length, null, WordContext.SIMPLE_WORD, processor)) {
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
WordCase wordCase = caseOf(word, length);
|
WordCase wordCase = caseOf(word, length);
|
||||||
List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
|
|
||||||
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
|
||||||
addCaseVariations(word, length, wordCase, list);
|
addCaseVariations(word, length, wordCase, processor);
|
||||||
}
|
}
|
||||||
return list;
|
return list;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
|
private void addCaseVariations(
|
||||||
|
char[] word, int length, WordCase wordCase, RootProcessor processor) {
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER) {
|
||||||
caseFoldTitle(word, length);
|
caseFoldTitle(word, length);
|
||||||
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
|
||||||
if (aposCase != null) {
|
if (aposCase != null) {
|
||||||
list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
|
||||||
for (char[] variation : sharpSVariations(titleBuffer, length)) {
|
for (char[] variation : sharpSVariations(titleBuffer, length)) {
|
||||||
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
|
if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -126,10 +143,14 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
|
||||||
list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
|
if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (wordCase == WordCase.UPPER) {
|
if (wordCase == WordCase.UPPER) {
|
||||||
for (char[] variation : sharpSVariations(lowerBuffer, length)) {
|
for (char[] variation : sharpSVariations(lowerBuffer, length)) {
|
||||||
list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
|
if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -220,9 +241,13 @@ final class Stemmer {
|
||||||
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
|
return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
List<CharsRef> doStem(
|
boolean doStem(
|
||||||
char[] word, int offset, int length, WordCase originalCase, WordContext context) {
|
char[] word,
|
||||||
List<CharsRef> stems = new ArrayList<>();
|
int offset,
|
||||||
|
int length,
|
||||||
|
WordCase originalCase,
|
||||||
|
WordContext context,
|
||||||
|
RootProcessor processor) {
|
||||||
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
IntsRef forms = dictionary.lookupWord(word, offset, length);
|
||||||
if (forms != null) {
|
if (forms != null) {
|
||||||
for (int i = 0; i < forms.length; i += formStep) {
|
for (int i = 0; i < forms.length; i += formStep) {
|
||||||
|
@ -241,36 +266,37 @@ final class Stemmer {
|
||||||
if (context.isCompound()) {
|
if (context.isCompound()) {
|
||||||
if (context != WordContext.COMPOUND_END
|
if (context != WordContext.COMPOUND_END
|
||||||
&& Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
|
&& Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
|
||||||
return new ArrayList<>();
|
return false;
|
||||||
}
|
}
|
||||||
if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
|
if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
|
||||||
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
&& !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stems.add(newStem(word, offset, length, forms, i));
|
if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
stems.addAll(
|
return stem(
|
||||||
stem(
|
word,
|
||||||
word,
|
offset,
|
||||||
offset,
|
length,
|
||||||
length,
|
context,
|
||||||
context,
|
-1,
|
||||||
-1,
|
Dictionary.FLAG_UNSET,
|
||||||
Dictionary.FLAG_UNSET,
|
-1,
|
||||||
-1,
|
0,
|
||||||
0,
|
true,
|
||||||
true,
|
true,
|
||||||
true,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
originalCase,
|
||||||
originalCase));
|
processor);
|
||||||
} catch (IOException bogus) {
|
} catch (IOException bogus) {
|
||||||
throw new RuntimeException(bogus);
|
throw new RuntimeException(bogus);
|
||||||
}
|
}
|
||||||
return stems;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean acceptCase(
|
private boolean acceptCase(
|
||||||
|
@ -319,7 +345,12 @@ final class Stemmer {
|
||||||
return deduped;
|
return deduped;
|
||||||
}
|
}
|
||||||
|
|
||||||
private CharsRef newStem(char[] buffer, int offset, int length, IntsRef forms, int formID) {
|
interface RootProcessor {
|
||||||
|
/** @return whether the processing should be continued */
|
||||||
|
boolean processRoot(CharsRef stem, IntsRef forms, int formID);
|
||||||
|
}
|
||||||
|
|
||||||
|
private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) {
|
||||||
final String exception;
|
final String exception;
|
||||||
if (dictionary.hasStemExceptions) {
|
if (dictionary.hasStemExceptions) {
|
||||||
int exceptionID = forms.ints[forms.offset + formID + 1];
|
int exceptionID = forms.ints[forms.offset + formID + 1];
|
||||||
|
@ -337,7 +368,7 @@ final class Stemmer {
|
||||||
if (exception != null) {
|
if (exception != null) {
|
||||||
scratchSegment.append(exception);
|
scratchSegment.append(exception);
|
||||||
} else {
|
} else {
|
||||||
scratchSegment.append(buffer, offset, length);
|
scratchSegment.append(stem.chars, stem.offset, stem.length);
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
|
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
|
||||||
|
@ -351,7 +382,7 @@ final class Stemmer {
|
||||||
if (exception != null) {
|
if (exception != null) {
|
||||||
return new CharsRef(exception);
|
return new CharsRef(exception);
|
||||||
} else {
|
} else {
|
||||||
return new CharsRef(buffer, offset, length);
|
return stem;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -385,9 +416,9 @@ final class Stemmer {
|
||||||
* most suffix must also contain circumfix flag.
|
* most suffix must also contain circumfix flag.
|
||||||
* @param originalCase if non-null, represents original word case to disallow case variations of
|
* @param originalCase if non-null, represents original word case to disallow case variations of
|
||||||
* word with KEEPCASE flags
|
* word with KEEPCASE flags
|
||||||
* @return List of stems, or empty list if no stems are found
|
* @return whether the processing should be continued
|
||||||
*/
|
*/
|
||||||
private List<CharsRef> stem(
|
private boolean stem(
|
||||||
char[] word,
|
char[] word,
|
||||||
int offset,
|
int offset,
|
||||||
int length,
|
int length,
|
||||||
|
@ -400,12 +431,9 @@ final class Stemmer {
|
||||||
boolean doSuffix,
|
boolean doSuffix,
|
||||||
boolean previousWasPrefix,
|
boolean previousWasPrefix,
|
||||||
boolean circumfix,
|
boolean circumfix,
|
||||||
WordCase originalCase)
|
WordCase originalCase,
|
||||||
|
RootProcessor processor)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
|
||||||
// TODO: allow this stuff to be reused by tokenfilter
|
|
||||||
List<CharsRef> stems = new ArrayList<>();
|
|
||||||
|
|
||||||
if (doPrefix && dictionary.prefixes != null) {
|
if (doPrefix && dictionary.prefixes != null) {
|
||||||
FST<IntsRef> fst = dictionary.prefixes;
|
FST<IntsRef> fst = dictionary.prefixes;
|
||||||
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
|
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
|
||||||
|
@ -440,19 +468,21 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean pureAffix = strippedWord == word;
|
boolean pureAffix = strippedWord == word;
|
||||||
stems.addAll(
|
if (!applyAffix(
|
||||||
applyAffix(
|
strippedWord,
|
||||||
strippedWord,
|
pureAffix ? offset + i : 0,
|
||||||
pureAffix ? offset + i : 0,
|
pureAffix ? length - i : strippedWord.length,
|
||||||
pureAffix ? length - i : strippedWord.length,
|
context,
|
||||||
context,
|
prefix,
|
||||||
prefix,
|
previous,
|
||||||
previous,
|
-1,
|
||||||
-1,
|
recursionDepth,
|
||||||
recursionDepth,
|
true,
|
||||||
true,
|
circumfix,
|
||||||
circumfix,
|
originalCase,
|
||||||
originalCase));
|
processor)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -493,25 +523,27 @@ final class Stemmer {
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean pureAffix = strippedWord == word;
|
boolean pureAffix = strippedWord == word;
|
||||||
stems.addAll(
|
if (!applyAffix(
|
||||||
applyAffix(
|
strippedWord,
|
||||||
strippedWord,
|
pureAffix ? offset : 0,
|
||||||
pureAffix ? offset : 0,
|
pureAffix ? i : strippedWord.length,
|
||||||
pureAffix ? i : strippedWord.length,
|
context,
|
||||||
context,
|
suffix,
|
||||||
suffix,
|
previous,
|
||||||
previous,
|
prefixId,
|
||||||
prefixId,
|
recursionDepth,
|
||||||
recursionDepth,
|
false,
|
||||||
false,
|
circumfix,
|
||||||
circumfix,
|
originalCase,
|
||||||
originalCase));
|
processor)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return stems;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -632,9 +664,9 @@ final class Stemmer {
|
||||||
* as a stem!
|
* as a stem!
|
||||||
* @param recursionDepth current recursion depth
|
* @param recursionDepth current recursion depth
|
||||||
* @param prefix true if we are removing a prefix (false if it's a suffix)
|
* @param prefix true if we are removing a prefix (false if it's a suffix)
|
||||||
* @return List of stems for the word, or an empty list if none are found
|
* @return whether the processing should be continued
|
||||||
*/
|
*/
|
||||||
private List<CharsRef> applyAffix(
|
private boolean applyAffix(
|
||||||
char[] strippedWord,
|
char[] strippedWord,
|
||||||
int offset,
|
int offset,
|
||||||
int length,
|
int length,
|
||||||
|
@ -645,12 +677,11 @@ final class Stemmer {
|
||||||
int recursionDepth,
|
int recursionDepth,
|
||||||
boolean prefix,
|
boolean prefix,
|
||||||
boolean circumfix,
|
boolean circumfix,
|
||||||
WordCase originalCase)
|
WordCase originalCase,
|
||||||
|
RootProcessor processor)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||||
|
|
||||||
List<CharsRef> stems = new ArrayList<>();
|
|
||||||
|
|
||||||
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix);
|
boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix);
|
||||||
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
|
IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
|
||||||
if (forms != null) {
|
if (forms != null) {
|
||||||
|
@ -694,7 +725,9 @@ final class Stemmer {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
stems.add(newStem(strippedWord, offset, length, forms, i));
|
if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -720,7 +753,7 @@ final class Stemmer {
|
||||||
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
|
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
|
||||||
// COMPLEXPREFIXES = false: combine with another suffix
|
// COMPLEXPREFIXES = false: combine with another suffix
|
||||||
} else {
|
} else {
|
||||||
return stems;
|
return true;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
doPrefix = false;
|
doPrefix = false;
|
||||||
|
@ -728,29 +761,29 @@ final class Stemmer {
|
||||||
prefixId = affix;
|
prefixId = affix;
|
||||||
// we took away the second prefix: go look for another suffix
|
// we took away the second prefix: go look for another suffix
|
||||||
} else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
|
} else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
|
||||||
return stems;
|
return true;
|
||||||
}
|
}
|
||||||
// we took away a prefix, then a suffix: go look for another suffix
|
// we took away a prefix, then a suffix: go look for another suffix
|
||||||
}
|
}
|
||||||
|
|
||||||
stems.addAll(
|
return stem(
|
||||||
stem(
|
strippedWord,
|
||||||
strippedWord,
|
offset,
|
||||||
offset,
|
length,
|
||||||
length,
|
context,
|
||||||
context,
|
affix,
|
||||||
affix,
|
flag,
|
||||||
flag,
|
prefixId,
|
||||||
prefixId,
|
recursionDepth + 1,
|
||||||
recursionDepth + 1,
|
doPrefix,
|
||||||
doPrefix,
|
true,
|
||||||
true,
|
prefix,
|
||||||
prefix,
|
circumfix,
|
||||||
circumfix,
|
originalCase,
|
||||||
originalCase));
|
processor);
|
||||||
}
|
}
|
||||||
|
|
||||||
return stems;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) {
|
private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) {
|
||||||
|
|
Loading…
Reference in New Issue