LUCENE-9692: Hunspell: extract Stemmer.stripAffix from similar code in prefix/suffix processing (#2237)

This commit is contained in:
Peter Gromov 2021-01-25 09:11:11 +01:00 committed by GitHub
parent f64e7cbbda
commit e4ec3e3919
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 37 additions and 44 deletions

View File

@ -26,7 +26,6 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
@ -305,11 +304,10 @@ final class Stemmer {
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
Outputs<IntsRef> outputs = fst.outputs;
FST.BytesReader bytesReader = prefixReaders[recursionDepth];
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
fst.getFirstArc(arc);
IntsRef NO_OUTPUT = outputs.getNoOutput();
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
int limit = dictionary.fullStrip ? length + 1 : length;
for (int i = 0; i < limit; i++) {
@ -333,23 +331,12 @@ final class Stemmer {
}
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
int deAffixedLength = length - i;
int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(
prefix, dictionary.stripData, stripStart, stripLength, word, i, deAffixedLength)) {
char[] strippedWord = stripAffix(word, length, i, prefix, true);
if (strippedWord == null) {
continue;
}
char[] strippedWord = new char[stripLength + deAffixedLength];
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList =
stems.addAll(
applyAffix(
strippedWord,
strippedWord.length,
@ -358,9 +345,7 @@ final class Stemmer {
recursionDepth,
true,
circumfix,
caseVariant);
stems.addAll(stemList);
caseVariant));
}
}
}
@ -368,11 +353,10 @@ final class Stemmer {
if (doSuffix && dictionary.suffixes != null) {
FST<IntsRef> fst = dictionary.suffixes;
Outputs<IntsRef> outputs = fst.outputs;
FST.BytesReader bytesReader = suffixReaders[recursionDepth];
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
fst.getFirstArc(arc);
IntsRef NO_OUTPUT = outputs.getNoOutput();
IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) {
@ -396,25 +380,12 @@ final class Stemmer {
}
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(
suffix, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
if (strippedWord == null) {
continue;
}
char[] strippedWord = new char[stripLength + deAffixedLength];
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(
dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
List<CharsRef> stemList =
stems.addAll(
applyAffix(
strippedWord,
strippedWord.length,
@ -423,9 +394,7 @@ final class Stemmer {
recursionDepth,
false,
circumfix,
caseVariant);
stems.addAll(stemList);
caseVariant));
}
}
}
@ -434,6 +403,30 @@ final class Stemmer {
return stems;
}
/**
 * Removes an affix from {@code word} and puts the affix's strip characters back in its place.
 *
 * <p>The strip characters are located in {@code dictionary.stripData} through the offsets stored
 * in {@code dictionary.stripOffsets}, using the ordinal returned by {@code
 * dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD)}. Before anything is copied, {@code
 * checkCondition} is called with the two pieces in the order they will appear in the result:
 * strip characters followed by the rest of the word for a prefix, the rest of the word followed
 * by the strip characters for a suffix.
 *
 * @param word characters of the word being processed
 * @param length number of characters of {@code word} to consider
 * @param affixLen number of characters the affix occupies in {@code word}
 * @param affix affix identifier, used for the strip lookup and passed to {@code checkCondition}
 * @param isPrefix {@code true} to drop the first {@code affixLen} characters, {@code false} to
 *     drop the last {@code affixLen} characters
 * @return a newly allocated array of {@code stripLen + (length - affixLen)} characters, or {@code
 *     null} when {@code checkCondition} returns {@code false}
 */
private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
  int remainingLen = length - affixLen;

  // Locate the strip characters for this affix inside the shared strip buffer.
  int ord = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
  int stripFrom = dictionary.stripOffsets[ord];
  int stripTo = dictionary.stripOffsets[ord + 1];
  int stripLen = stripTo - stripFrom;
  char[] stripChars = dictionary.stripData;

  if (isPrefix) {
    // Prefix: strip characters come first, then the part of the word after the affix.
    if (!checkCondition(affix, stripChars, stripFrom, stripLen, word, affixLen, remainingLen)) {
      return null;
    }
    char[] result = new char[stripLen + remainingLen];
    System.arraycopy(word, affixLen, result, stripLen, remainingLen);
    System.arraycopy(stripChars, stripFrom, result, 0, stripLen);
    return result;
  }

  // Suffix: the part of the word before the affix comes first, then the strip characters.
  if (!checkCondition(affix, word, 0, remainingLen, stripChars, stripFrom, stripLen)) {
    return null;
  }
  char[] result = new char[stripLen + remainingLen];
  System.arraycopy(word, 0, result, 0, remainingLen);
  System.arraycopy(stripChars, stripFrom, result, remainingLen, stripLen);
  return result;
}
private boolean isAffixCompatible(
int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
@ -495,9 +488,9 @@ final class Stemmer {
* @param strippedWord Word the affix has been removed and the strip added
* @param length valid length of stripped word
* @param affix HunspellAffix representing the affix rule itself
* @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix,
* unless both are compatible so we must check dictionary form against both to add it as a
* stem!
* @param prefixId when we already stripped a prefix, we can't simply recurse and check the
* suffix, unless both are compatible so we must check dictionary form against both to add it
* as a stem!
* @param recursionDepth current recursion depth
* @param prefix true if we are removing a prefix (false if it's a suffix)
* @return List of stems for the word, or an empty list if none are found