mirror of https://github.com/apache/lucene.git
LUCENE-9688: Hunspell: consider prefix's continuation flags when applying suffix (#2229)
This commit is contained in:
parent
d7968130c3
commit
0a1a3f4c40
|
@ -251,8 +251,8 @@ final class Stemmer {
|
|||
* @param previous previous affix that was removed (so we dont remove same one twice)
|
||||
* @param prevFlag Flag from a previous stemming step that need to be cross-checked with any
|
||||
* affixes in this recursive step
|
||||
* @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, it's
|
||||
* also checked against the word
|
||||
* @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also
|
||||
* checked against the word
|
||||
* @param recursionDepth current recursiondepth
|
||||
* @param doPrefix true if we should remove prefixes
|
||||
* @param doSuffix true if we should remove suffixes
|
||||
|
@ -270,7 +270,7 @@ final class Stemmer {
|
|||
int length,
|
||||
int previous,
|
||||
int prevFlag,
|
||||
int prefixFlag,
|
||||
int prefixId,
|
||||
int recursionDepth,
|
||||
boolean doPrefix,
|
||||
boolean doSuffix,
|
||||
|
@ -398,7 +398,7 @@ final class Stemmer {
|
|||
strippedWord,
|
||||
strippedWord.length,
|
||||
suffix,
|
||||
prefixFlag,
|
||||
prefixId,
|
||||
recursionDepth,
|
||||
false,
|
||||
circumfix,
|
||||
|
@ -474,9 +474,9 @@ final class Stemmer {
|
|||
* @param strippedWord Word the affix has been removed and the strip added
|
||||
* @param length valid length of stripped word
|
||||
* @param affix HunspellAffix representing the affix rule itself
|
||||
* @param prefixFlag when we already stripped a prefix, we cant simply recurse and check the
|
||||
* suffix, unless both are compatible so we must check dictionary form against both to add it
|
||||
* as a stem!
|
||||
* @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix,
|
||||
* unless both are compatible so we must check dictionary form against both to add it as a
|
||||
* stem!
|
||||
* @param recursionDepth current recursion depth
|
||||
* @param prefix true if we are removing a prefix (false if it's a suffix)
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
|
@ -485,14 +485,13 @@ final class Stemmer {
|
|||
char[] strippedWord,
|
||||
int length,
|
||||
int affix,
|
||||
int prefixFlag,
|
||||
int prefixId,
|
||||
int recursionDepth,
|
||||
boolean prefix,
|
||||
boolean circumfix,
|
||||
boolean caseVariant)
|
||||
throws IOException {
|
||||
char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
|
||||
char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
|
||||
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
|
||||
|
@ -500,16 +499,15 @@ final class Stemmer {
|
|||
if (forms != null) {
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
|
||||
if (Dictionary.hasFlag(wordFlags, flag)) {
|
||||
if (Dictionary.hasFlag(wordFlags, flag) || isFlagAppendedByAffix(prefixId, flag)) {
|
||||
// confusing: in this one exception, we already chained the first prefix against the
|
||||
// second,
|
||||
// so it doesnt need to be checked against the word
|
||||
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
|
||||
if (!chainedPrefix
|
||||
&& prefixFlag >= 0
|
||||
&& !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
|
||||
// see if we can chain prefix thru the suffix continuation class (only if it has any!)
|
||||
if (!dictionary.hasFlag(append, (char) prefixFlag, scratch)) {
|
||||
if (!chainedPrefix && prefixId >= 0) {
|
||||
char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
|
||||
if (!Dictionary.hasFlag(wordFlags, prefixFlag)
|
||||
&& !isFlagAppendedByAffix(affix, prefixFlag)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -517,8 +515,7 @@ final class Stemmer {
|
|||
// if circumfix was previously set by a prefix, we must check this suffix,
|
||||
// to ensure it has it, and vice versa
|
||||
if (dictionary.circumfix != -1) {
|
||||
boolean suffixCircumfix =
|
||||
dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
|
||||
boolean suffixCircumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
|
||||
if (circumfix != suffixCircumfix) {
|
||||
continue;
|
||||
}
|
||||
|
@ -541,14 +538,14 @@ final class Stemmer {
|
|||
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we
|
||||
// have that flag
|
||||
if (dictionary.circumfix != -1 && !circumfix && prefix) {
|
||||
circumfix = dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
|
||||
circumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
|
||||
}
|
||||
|
||||
if (isCrossProduct(affix) && recursionDepth <= 1) {
|
||||
boolean doPrefix;
|
||||
if (recursionDepth == 0) {
|
||||
if (prefix) {
|
||||
prefixFlag = flag;
|
||||
prefixId = affix;
|
||||
doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
|
||||
// we took away the first prefix.
|
||||
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
|
||||
|
@ -564,7 +561,7 @@ final class Stemmer {
|
|||
} else {
|
||||
doPrefix = false;
|
||||
if (prefix && dictionary.complexPrefixes) {
|
||||
prefixFlag = flag;
|
||||
prefixId = affix;
|
||||
// we took away the second prefix: go look for another suffix
|
||||
} else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
|
||||
return stems;
|
||||
|
@ -578,7 +575,7 @@ final class Stemmer {
|
|||
length,
|
||||
affix,
|
||||
flag,
|
||||
prefixFlag,
|
||||
prefixId,
|
||||
recursionDepth + 1,
|
||||
doPrefix,
|
||||
true,
|
||||
|
@ -590,6 +587,12 @@ final class Stemmer {
|
|||
return stems;
|
||||
}
|
||||
|
||||
private boolean isFlagAppendedByAffix(int affixId, char flag) {
|
||||
if (affixId < 0) return false;
|
||||
int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
|
||||
return dictionary.hasFlag(appendId, flag, scratch);
|
||||
}
|
||||
|
||||
private boolean isCrossProduct(int affix) {
|
||||
return (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
|
||||
}
|
||||
|
|
|
@ -38,5 +38,8 @@ public class TestDependencies extends StemmerTestBase {
|
|||
assertStemsTo("hydration", "hydrate");
|
||||
assertStemsTo("dehydrate", "hydrate");
|
||||
assertStemsTo("dehydration", "hydrate");
|
||||
|
||||
assertStemsTo("calorie", "calorie", "calorie");
|
||||
assertStemsTo("calories", "calorie");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,3 +17,10 @@ PFX h 0 de .
|
|||
|
||||
SFX A Y 1
|
||||
SFX A te tion/S .
|
||||
|
||||
SFX s Y 1
|
||||
SFX s 0 s .
|
||||
|
||||
PFX p Y 1
|
||||
PFX p 0 0/s .
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
2
|
||||
4
|
||||
drink/RQ [verb]
|
||||
drink/S [noun]
|
||||
hydrate/hA
|
||||
calorie/p
|
Loading…
Reference in New Issue