LUCENE-9692: Hunspell: extract Stemmer.stripAffix from similar code in prefix/suffix processing (#2237)

This commit is contained in:
Peter Gromov 2021-01-25 09:11:11 +01:00 committed by GitHub
parent f64e7cbbda
commit e4ec3e3919
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 37 additions and 44 deletions

View File

@ -26,7 +26,6 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
/** /**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
@ -305,11 +304,10 @@ final class Stemmer {
if (doPrefix && dictionary.prefixes != null) { if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes; FST<IntsRef> fst = dictionary.prefixes;
Outputs<IntsRef> outputs = fst.outputs;
FST.BytesReader bytesReader = prefixReaders[recursionDepth]; FST.BytesReader bytesReader = prefixReaders[recursionDepth];
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth]; FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
fst.getFirstArc(arc); fst.getFirstArc(arc);
IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT; IntsRef output = NO_OUTPUT;
int limit = dictionary.fullStrip ? length + 1 : length; int limit = dictionary.fullStrip ? length + 1 : length;
for (int i = 0; i < limit; i++) { for (int i = 0; i < limit; i++) {
@ -333,23 +331,12 @@ final class Stemmer {
} }
if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) { if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
int deAffixedLength = length - i; char[] strippedWord = stripAffix(word, length, i, prefix, true);
if (strippedWord == null) {
int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(
prefix, dictionary.stripData, stripStart, stripLength, word, i, deAffixedLength)) {
continue; continue;
} }
char[] strippedWord = new char[stripLength + deAffixedLength]; stems.addAll(
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList =
applyAffix( applyAffix(
strippedWord, strippedWord,
strippedWord.length, strippedWord.length,
@ -358,9 +345,7 @@ final class Stemmer {
recursionDepth, recursionDepth,
true, true,
circumfix, circumfix,
caseVariant); caseVariant));
stems.addAll(stemList);
} }
} }
} }
@ -368,11 +353,10 @@ final class Stemmer {
if (doSuffix && dictionary.suffixes != null) { if (doSuffix && dictionary.suffixes != null) {
FST<IntsRef> fst = dictionary.suffixes; FST<IntsRef> fst = dictionary.suffixes;
Outputs<IntsRef> outputs = fst.outputs;
FST.BytesReader bytesReader = suffixReaders[recursionDepth]; FST.BytesReader bytesReader = suffixReaders[recursionDepth];
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth]; FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
fst.getFirstArc(arc); fst.getFirstArc(arc);
IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
IntsRef output = NO_OUTPUT; IntsRef output = NO_OUTPUT;
int limit = dictionary.fullStrip ? 0 : 1; int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) { for (int i = length; i >= limit; i--) {
@ -396,25 +380,12 @@ final class Stemmer {
} }
if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) { if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
int appendLength = length - i; char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
int deAffixedLength = length - appendLength; if (strippedWord == null) {
int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(
suffix, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
continue; continue;
} }
char[] strippedWord = new char[stripLength + deAffixedLength]; stems.addAll(
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(
dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
List<CharsRef> stemList =
applyAffix( applyAffix(
strippedWord, strippedWord,
strippedWord.length, strippedWord.length,
@ -423,9 +394,7 @@ final class Stemmer {
recursionDepth, recursionDepth,
false, false,
circumfix, circumfix,
caseVariant); caseVariant));
stems.addAll(stemList);
} }
} }
} }
@ -434,6 +403,30 @@ final class Stemmer {
return stems; return stems;
} }
/**
 * Builds the word form obtained by cutting an affix off {@code word} and putting the affix's
 * strip characters in its place.
 *
 * <p>For a prefix, the first {@code affixLen} characters of the word are dropped and the strip
 * characters are placed in front of the remainder. For a suffix, the last {@code affixLen}
 * characters are dropped and the strip characters are placed after the remainder.
 *
 * @param word buffer holding the word
 * @param length number of valid characters in {@code word}
 * @param affixLen number of characters occupied by the affix being removed
 * @param affix affix identifier, passed to {@code dictionary.affixData} and {@code checkCondition}
 * @param isPrefix true to remove from the start of the word, false to remove from the end
 * @return a newly allocated array of exactly the resulting length, or {@code null} when {@code
 *     checkCondition} rejects the combination
 */
private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
  // Number of characters of the original word that survive once the affix is cut off.
  int keptLen = length - affixLen;

  // The strip characters of this affix are the slice [stripFrom, stripFrom + stripCount) of the
  // shared dictionary.stripData array, located through the stripOffsets table.
  int ord = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
  int stripFrom = dictionary.stripOffsets[ord];
  int stripTo = dictionary.stripOffsets[ord + 1];
  int stripCount = stripTo - stripFrom;
  char[] stripChars = dictionary.stripData;

  boolean conditionHolds;
  int keptSrc; // where the surviving characters start in word
  int keptDst; // where the surviving characters go in the result
  int stripDst; // where the strip characters go in the result
  if (isPrefix) {
    // Condition is checked against strip characters followed by the rest of the word.
    conditionHolds =
        checkCondition(affix, stripChars, stripFrom, stripCount, word, affixLen, keptLen);
    keptSrc = affixLen;
    keptDst = stripCount;
    stripDst = 0;
  } else {
    // Condition is checked against the start of the word followed by the strip characters.
    conditionHolds = checkCondition(affix, word, 0, keptLen, stripChars, stripFrom, stripCount);
    keptSrc = 0;
    keptDst = 0;
    stripDst = keptLen;
  }
  if (!conditionHolds) {
    return null;
  }

  char[] result = new char[stripCount + keptLen];
  System.arraycopy(word, keptSrc, result, keptDst, keptLen);
  System.arraycopy(stripChars, stripFrom, result, stripDst, stripCount);
  return result;
}
private boolean isAffixCompatible( private boolean isAffixCompatible(
int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) { int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND); int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
@ -495,9 +488,9 @@ final class Stemmer {
* @param strippedWord Word the affix has been removed and the strip added * @param strippedWord Word the affix has been removed and the strip added
* @param length valid length of stripped word * @param length valid length of stripped word
* @param affix HunspellAffix representing the affix rule itself * @param affix HunspellAffix representing the affix rule itself
* @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix, * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
* unless both are compatible so we must check dictionary form against both to add it as a * suffix, unless both are compatible so we must check dictionary form against both to add it
* stem! * as a stem!
* @param recursionDepth current recursion depth * @param recursionDepth current recursion depth
* @param prefix true if we are removing a prefix (false if it's a suffix) * @param prefix true if we are removing a prefix (false if it's a suffix)
* @return List of stems for the word, or an empty list if none are found * @return List of stems for the word, or an empty list if none are found