From 0a1a3f4c4095bf5258e5898c2b23658747f12bba Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Fri, 22 Jan 2021 12:03:55 +0100 Subject: [PATCH] LUCENE-9688: Hunspell: consider prefix's continuation flags when applying suffix (#2229) --- .../lucene/analysis/hunspell/Stemmer.java | 45 ++++++++++--------- .../analysis/hunspell/TestDependencies.java | 3 ++ .../lucene/analysis/hunspell/dependencies.aff | 7 +++ .../lucene/analysis/hunspell/dependencies.dic | 5 ++- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index 3bb46a7a961..090742b5f7a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -251,8 +251,8 @@ final class Stemmer { * @param previous previous affix that was removed (so we dont remove same one twice) * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any * affixes in this recursive step - * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, it's - * also checked against the word + * @param prefixId ID of the most inner removed prefix, so that when removing a suffix, it's also + * checked against the word * @param recursionDepth current recursiondepth * @param doPrefix true if we should remove prefixes * @param doSuffix true if we should remove suffixes @@ -270,7 +270,7 @@ final class Stemmer { int length, int previous, int prevFlag, - int prefixFlag, + int prefixId, int recursionDepth, boolean doPrefix, boolean doSuffix, @@ -398,7 +398,7 @@ final class Stemmer { strippedWord, strippedWord.length, suffix, - prefixFlag, + prefixId, recursionDepth, false, circumfix, @@ -474,9 +474,9 @@ final class Stemmer { * @param strippedWord Word the affix has been removed and the strip added * @param length valid length of stripped word * @param affix HunspellAffix representing the affix rule itself - * @param prefixFlag when we already stripped a prefix, we cant simply recurse and check the - * suffix, unless both are compatible so we must check dictionary form against both to add it - * as a stem! + * @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix, + * unless both are compatible so we must check dictionary form against both to add it as a + * stem! * @param recursionDepth current recursion depth * @param prefix true if we are removing a prefix (false if it's a suffix) * @return List of stems for the word, or an empty list if none are found @@ -485,14 +485,13 @@ final class Stemmer { char[] strippedWord, int length, int affix, - int prefixFlag, + int prefixId, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException { char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG); - char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND); List stems = new ArrayList<>(); @@ -500,16 +499,15 @@ final class Stemmer { if (forms != null) { for (int i = 0; i < forms.length; i += formStep) { char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch); - if (Dictionary.hasFlag(wordFlags, flag)) { + if (Dictionary.hasFlag(wordFlags, flag) || isFlagAppendedByAffix(prefixId, flag)) { // confusing: in this one exception, we already chained the first prefix against the // second, // so it doesnt need to be checked against the word boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix; - if (!chainedPrefix - && prefixFlag >= 0 - && !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) { - // see if we can chain prefix thru the suffix continuation class (only if it has any!) - if (!dictionary.hasFlag(append, (char) prefixFlag, scratch)) { + if (!chainedPrefix && prefixId >= 0) { + char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG); + if (!Dictionary.hasFlag(wordFlags, prefixFlag) + && !isFlagAppendedByAffix(affix, prefixFlag)) { continue; } } @@ -517,8 +515,7 @@ final class Stemmer { // if circumfix was previously set by a prefix, we must check this suffix, // to ensure it has it, and vice versa if (dictionary.circumfix != -1) { - boolean suffixCircumfix = - dictionary.hasFlag(append, (char) dictionary.circumfix, scratch); + boolean suffixCircumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix); if (circumfix != suffixCircumfix) { continue; } @@ -541,14 +538,14 @@ final class Stemmer { // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we // have that flag if (dictionary.circumfix != -1 && !circumfix && prefix) { - circumfix = dictionary.hasFlag(append, (char) dictionary.circumfix, scratch); + circumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix); } if (isCrossProduct(affix) && recursionDepth <= 1) { boolean doPrefix; if (recursionDepth == 0) { if (prefix) { - prefixFlag = flag; + prefixId = affix; doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix; // we took away the first prefix. // COMPLEXPREFIXES = true: combine with a second prefix and another suffix @@ -564,7 +561,7 @@ final class Stemmer { } else { doPrefix = false; if (prefix && dictionary.complexPrefixes) { - prefixFlag = flag; + prefixId = affix; // we took away the second prefix: go look for another suffix } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) { return stems; @@ -578,7 +575,7 @@ final class Stemmer { length, affix, flag, - prefixFlag, + prefixId, recursionDepth + 1, doPrefix, true, @@ -590,6 +587,12 @@ final class Stemmer { return stems; } + private boolean isFlagAppendedByAffix(int affixId, char flag) { + if (affixId < 0) return false; + int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND); + return dictionary.hasFlag(appendId, flag, scratch); + } + private boolean isCrossProduct(int affix) { return (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1; } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java index aadcda3b6ae..e6310b6cee9 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java @@ -38,5 +38,8 @@ public class TestDependencies extends StemmerTestBase { assertStemsTo("hydration", "hydrate"); assertStemsTo("dehydrate", "hydrate"); assertStemsTo("dehydration", "hydrate"); + + assertStemsTo("calorie", "calorie", "calorie"); + assertStemsTo("calories", "calorie"); } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff index 6aff674bd1c..9750c069acf 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff @@ -17,3 +17,10 @@ PFX h 0 de . SFX A Y 1 SFX A te tion/S . + +SFX s Y 1 +SFX s 0 s . + +PFX p Y 1 +PFX p 0 0/s . + diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic index 632f70ff8f5..08c565ec25f 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic @@ -1,4 +1,5 @@ -2 +4 drink/RQ [verb] drink/S [noun] -hydrate/hA \ No newline at end of file +hydrate/hA +calorie/p \ No newline at end of file