LUCENE-9688: Hunspell: consider prefix's continuation flags when applying suffix (#2229)

2021-01-22 12:03:55 +01:00 · 2021-01-22 12:03:55 +01:00 · 0a1a3f4c40
parent d7968130c3
commit 0a1a3f4c40
4 changed files with 37 additions and 23 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -251,8 +251,8 @@ final class Stemmer {
   * @param previous previous affix that was removed (so we dont remove same one twice)
   * @param prevFlag Flag from a previous stemming step that need to be cross-checked with any
   *     affixes in this recursive step
-   * @param prefixFlag flag of the most inner removed prefix, so that when removing a suffix, it's
-   *     also checked against the word
+   * @param prefixId ID of the innermost removed prefix (negative if none), so that when a suffix
+   *     is removed, the prefix's flag is also checked against the word
   * @param recursionDepth current recursiondepth
   * @param doPrefix true if we should remove prefixes
   * @param doSuffix true if we should remove suffixes
@ -270,7 +270,7 @@ final class Stemmer {
      int length,
      int previous,
      int prevFlag,
-      int prefixFlag,
+      int prefixId,
      int recursionDepth,
      boolean doPrefix,
      boolean doSuffix,
@ -398,7 +398,7 @@ final class Stemmer {
                    strippedWord,
                    strippedWord.length,
                    suffix,
-                    prefixFlag,
+                    prefixId,
                    recursionDepth,
                    false,
                    circumfix,
@ -474,9 +474,9 @@ final class Stemmer {
   * @param strippedWord Word the affix has been removed and the strip added
   * @param length valid length of stripped word
   * @param affix HunspellAffix representing the affix rule itself
-   * @param prefixFlag when we already stripped a prefix, we cant simply recurse and check the
-   *     suffix, unless both are compatible so we must check dictionary form against both to add it
-   *     as a stem!
+   * @param prefixId when we have already stripped a prefix, we can't simply recurse and check the
+   *     suffix unless both are compatible, so we must check the dictionary form against both to
+   *     add it as a stem
   * @param recursionDepth current recursion depth
   * @param prefix true if we are removing a prefix (false if it's a suffix)
   * @return List of stems for the word, or an empty list if none are found
@ -485,14 +485,13 @@ final class Stemmer {
      char[] strippedWord,
      int length,
      int affix,
-      int prefixFlag,
+      int prefixId,
      int recursionDepth,
      boolean prefix,
      boolean circumfix,
      boolean caseVariant)
      throws IOException {
    char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);
-    char append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);

    List<CharsRef> stems = new ArrayList<>();

@ -500,16 +499,15 @@ final class Stemmer {
    if (forms != null) {
      for (int i = 0; i < forms.length; i += formStep) {
        char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
-        if (Dictionary.hasFlag(wordFlags, flag)) {
+        if (Dictionary.hasFlag(wordFlags, flag) || isFlagAppendedByAffix(prefixId, flag)) {
          // confusing: in this one exception, we already chained the first prefix against the
          // second,
          // so it doesnt need to be checked against the word
          boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
-          if (!chainedPrefix
-              && prefixFlag >= 0
-              && !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
-            // see if we can chain prefix thru the suffix continuation class (only if it has any!)
-            if (!dictionary.hasFlag(append, (char) prefixFlag, scratch)) {
+          if (!chainedPrefix && prefixId >= 0) {
+            char prefixFlag = dictionary.affixData(prefixId, Dictionary.AFFIX_FLAG);
+            if (!Dictionary.hasFlag(wordFlags, prefixFlag)
+                && !isFlagAppendedByAffix(affix, prefixFlag)) {
              continue;
            }
          }
@ -517,8 +515,7 @@ final class Stemmer {
          // if circumfix was previously set by a prefix, we must check this suffix,
          // to ensure it has it, and vice versa
          if (dictionary.circumfix != -1) {
-            boolean suffixCircumfix =
-                dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
+            boolean suffixCircumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
            if (circumfix != suffixCircumfix) {
              continue;
            }
@ -541,14 +538,14 @@ final class Stemmer {
    // if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we
    // have that flag
    if (dictionary.circumfix != -1 && !circumfix && prefix) {
-      circumfix = dictionary.hasFlag(append, (char) dictionary.circumfix, scratch);
+      circumfix = isFlagAppendedByAffix(affix, (char) dictionary.circumfix);
    }

    if (isCrossProduct(affix) && recursionDepth <= 1) {
      boolean doPrefix;
      if (recursionDepth == 0) {
        if (prefix) {
-          prefixFlag = flag;
+          prefixId = affix;
          doPrefix = dictionary.complexPrefixes && dictionary.twoStageAffix;
          // we took away the first prefix.
          // COMPLEXPREFIXES = true:  combine with a second prefix and another suffix
@ -564,7 +561,7 @@ final class Stemmer {
      } else {
        doPrefix = false;
        if (prefix && dictionary.complexPrefixes) {
-          prefixFlag = flag;
+          prefixId = affix;
          // we took away the second prefix: go look for another suffix
        } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
          return stems;
@ -578,7 +575,7 @@ final class Stemmer {
              length,
              affix,
              flag,
-              prefixFlag,
+              prefixId,
              recursionDepth + 1,
              doPrefix,
              true,
@ -590,6 +587,12 @@ final class Stemmer {
    return stems;
  }

+  private boolean isFlagAppendedByAffix(int affixId, char flag) {
+    if (affixId < 0) return false; // a negative id means there is no affix to consult
+    int appendId = dictionary.affixData(affixId, Dictionary.AFFIX_APPEND);
+    return dictionary.hasFlag(appendId, flag, scratch); // is flag among the affix's appended flags?
+  }
+
  private boolean isCrossProduct(int affix) {
    return (dictionary.affixData(affix, Dictionary.AFFIX_CONDITION) & 1) == 1;
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDependencies.java
@ -38,5 +38,8 @@ public class TestDependencies extends StemmerTestBase {
    assertStemsTo("hydration", "hydrate");
    assertStemsTo("dehydrate", "hydrate");
    assertStemsTo("dehydration", "hydrate");
+
+    assertStemsTo("calorie", "calorie", "calorie");
+    assertStemsTo("calories", "calorie");
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.aff
@ -17,3 +17,10 @@ PFX h 0 de .

 SFX A Y 1
 SFX A te tion/S .
+
+SFX s Y 1
+SFX s 0 s .
+
+PFX p Y 1
+PFX p 0 0/s .
+
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/dependencies.dic
@ -1,4 +1,5 @@
-2
+4
 drink/RQ	[verb]
 drink/S	[noun]
-hydrate/hA
+hydrate/hA
+calorie/p