LUCENE-9776: Hunspell: allow to inflect the last part of COMPOUNDRULE compound (#2397)

2021-02-19 20:03:34 +01:00 · 2021-02-19 20:03:34 +01:00 · 4b3fb1e065
parent e7c80f6445
commit 4b3fb1e065
6 changed files with 65 additions and 39 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
+import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;

 import java.util.ArrayList;
@ -397,8 +398,7 @@ public class Hunspell {
      if (forms != null) {
        words.add(forms);

-        if (dictionary.compoundRules != null
-            && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
+        if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
          if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
            return true;
          }
@ -417,13 +417,17 @@ public class Hunspell {

  private boolean checkLastCompoundPart(
      char[] wordChars, int start, int length, List<IntsRef> words) {
-    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
-    if (forms == null) return false;
+    IntsRef ref = new IntsRef(new int[1], 0, 1); // one-slot holder for the candidate form id
+    words.add(ref);

-    words.add(forms);
-    boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
+    Stemmer.RootProcessor stopOnMatching =
+        (stem, formID, morphDataId) -> {
+          ref.ints[0] = formID; // expose this candidate to the rules through `words`
+          return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
+        }; // yields false once a rule fully matches; presumably that halts doStem
+    boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
    words.remove(words.size() - 1);
-    return result;
+    return found; // true exactly when doStem returned false
  }

  private static boolean isNumber(String s) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -247,19 +247,12 @@ final class Stemmer {
        if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
          continue;
        }
-        // we can't add this form, it only belongs inside a compound word
-        if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
-          continue;
+        if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
+            && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
+          return false;
        }
-        if (context.isCompound()) {
-          if (context != WordContext.COMPOUND_END
-              && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
-            return false;
-          }
-          if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
-              && !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
-            continue;
-          }
+        if (!isRootCompatibleWithContext(context, -1, entryId)) {
+          continue;
        }
        if (!callProcessor(word, offset, length, processor, forms, i)) {
          return false;
@ -540,8 +533,8 @@ final class Stemmer {
      if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
        return false;
      }
-      WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
-      if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
+      if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
+          && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
        return false;
      }
      if (context == WordContext.COMPOUND_END
@ -550,18 +543,17 @@ final class Stemmer {
          && dictionary.hasFlag(append, dictionary.onlyincompound)) {
        return false;
      }
+    } else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
+      return false;
    }

    if (recursionDepth == 0) {
-      // check if affix is allowed in a non-compound word
-      return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
+      return true;
    }

    if (dictionary.isCrossProduct(affix)) {
      // cross check incoming continuation class (flag of previous affix) against list.
-      if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) {
-        return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
-      }
+      return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
    }

    return false;
@ -640,18 +632,10 @@ final class Stemmer {
            }
          }

-          if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+          if (!isRootCompatibleWithContext(context, affix, entryId)) {
            continue;
          }
-          if (context.isCompound()) {
-            char cFlag = context.requiredFlag(dictionary);
-            if (!dictionary.hasFlag(entryId, cFlag)
-                && !isFlagAppendedByAffix(affix, cFlag)
-                && !dictionary.hasFlag(entryId, dictionary.compoundFlag)
-                && !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
-              continue;
-            }
-          }
+
          if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
            return false;
          }
@ -704,6 +688,20 @@ final class Stemmer {
    return true;
  }

+  private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
+    if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+      return false; // an ONLYINCOMPOUND root cannot stand as a simple word
+    }
+    if (context.isCompound() && context != WordContext.COMPOUND_RULE_END) {
+      char cFlag = context.requiredFlag(dictionary); // flag tied to this compound position
+      return dictionary.hasFlag(entryId, cFlag)
+          || isFlagAppendedByAffix(lastAffix, cFlag)
+          || dictionary.hasFlag(entryId, dictionary.compoundFlag)
+          || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag); // root or last affix
+    }
+    return true; // simple words and COMPOUND_RULE_END parts need no compounding flag here
+  }
+
  private boolean callProcessor(
      char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
    CharsRef stem = new CharsRef(word, offset, length);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
@ -17,15 +17,35 @@
 package org.apache.lucene.analysis.hunspell;

 enum WordContext {
+  /** non-compound */
  SIMPLE_WORD,
+
+  /** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
  COMPOUND_BEGIN,
+
+  /** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
  COMPOUND_MIDDLE,
-  COMPOUND_END;
+
+  /** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
+  COMPOUND_END,
+
+  /**
+   * The final root in a word with COMPOUNDRULE compounding. Unlike {@link #COMPOUND_END}, this
+   * context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
+   */
+  COMPOUND_RULE_END;

  boolean isCompound() {
    return this != SIMPLE_WORD;
  }

+  boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
+    if (isPrefix) {
+      return this == WordContext.COMPOUND_BEGIN; // prefixes: first compound part only
+    } // non-prefix affix: allowed only on a final part (flag-based or COMPOUNDRULE)
+    return this == WordContext.COMPOUND_END || this == WordContext.COMPOUND_RULE_END;
+  }
+
  char requiredFlag(Dictionary dictionary) {
    switch (this) {
      case COMPOUND_BEGIN:
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
@ -5,3 +5,6 @@ ONLYINCOMPOUND c
 COMPOUNDRULE 2
 COMPOUNDRULE n*1t
 COMPOUNDRULE n*mp
+
+SFX S Y 1
+SFX S 0 s
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
@ -9,7 +9,7 @@
 7/nm
 8/nm
 9/nm
-0th/pt
+0th/ptS
 1st/p
 1th/tc
 2nd/p
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
@ -28,4 +28,5 @@
 10001st
 10011th
 1ST
-42ND
+42ND
+10ths