LUCENE-9776: Hunspell: allow to inflect the last part of COMPOUNDRULE compound (#2397)

2021-02-19 20:03:34 +01:00 · 2021-02-19 20:03:34 +01:00 · 4b3fb1e065
parent e7c80f6445
commit 4b3fb1e065
6 changed files with 65 additions and 39 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@ -20,6 +20,7 @@ import static org.apache.lucene.analysis.hunspell.Dictionary.FLAG_UNSET;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_BEGIN;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
 import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_RULE_END;
 import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
 import java.util.ArrayList;
@ -397,8 +398,7 @@ public class Hunspell {
      if (forms != null) {
        words.add(forms);
-        if (dictionary.compoundRules != null
+        if (dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
            && dictionary.compoundRules.stream().anyMatch(r -> r.mayMatch(words))) {
          if (checkLastCompoundPart(wordChars, offset + breakPos, length - breakPos, words)) {
            return true;
          }
@ -417,13 +417,17 @@ public class Hunspell {
  /**
   * Checks whether the given range of {@code wordChars} can serve as the last part of a
   * COMPOUNDRULE compound whose preceding parts are already collected in {@code words}.
   *
   * <p>In the new version of this method, a one-element placeholder is appended to {@code words};
   * the stemmer is run over the range in the {@code COMPOUND_RULE_END} context, and for every
   * root it reports, the root's form id is written into the placeholder and the compound rules
   * are tested against the whole list. The processor returns {@code false} as soon as some rule
   * fully matches, and the negated result of {@code doStem} is taken as "found" (presumably
   * {@code doStem} returns {@code false} when its processor asks to stop; confirm against
   * Stemmer). The placeholder is removed from {@code words} before returning.
   */
  private boolean checkLastCompoundPart(
      char[] wordChars, int start, int length, List<IntsRef> words) {
-    IntsRef forms = dictionary.lookupWord(wordChars, start, length);
+    IntsRef ref = new IntsRef(new int[1], 0, 1);
-    if (forms == null) return false;
+    words.add(ref);
-    words.add(forms);
+    Stemmer.RootProcessor stopOnMatching =
-    boolean result = dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words));
+        (stem, formID, morphDataId) -> {
          ref.ints[0] = formID;
          return dictionary.compoundRules.stream().noneMatch(r -> r.fullyMatches(words));
        };
    boolean found = !stemmer.doStem(wordChars, start, length, COMPOUND_RULE_END, stopOnMatching);
    words.remove(words.size() - 1);
-    return result;
+    return found;
  }
  private static boolean isNumber(String s) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -247,19 +247,12 @@ final class Stemmer {
        if (dictionary.hasFlag(entryId, dictionary.needaffix)) {
          continue;
        }
-        // we can't add this form, it only belongs inside a compound word
+        if ((context == WordContext.COMPOUND_BEGIN || context == WordContext.COMPOUND_MIDDLE)
-        if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+            && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
-          continue;
+          return false;
        }
-        if (context.isCompound()) {
+        if (!isRootCompatibleWithContext(context, -1, entryId)) {
-          if (context != WordContext.COMPOUND_END
+          continue;
              && dictionary.hasFlag(entryId, dictionary.compoundForbid)) {
            return false;
          }
          if (!dictionary.hasFlag(entryId, dictionary.compoundFlag)
              && !dictionary.hasFlag(entryId, context.requiredFlag(dictionary))) {
            continue;
          }
        }
        if (!callProcessor(word, offset, length, processor, forms, i)) {
          return false;
@ -540,8 +533,8 @@ final class Stemmer {
      if (!isPrefix && dictionary.hasFlag(append, dictionary.compoundForbid)) {
        return false;
      }
-      WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
+      if (!context.isAffixAllowedWithoutSpecialPermit(isPrefix)
-      if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
+          && !dictionary.hasFlag(append, dictionary.compoundPermit)) {
        return false;
      }
      if (context == WordContext.COMPOUND_END
@ -550,18 +543,17 @@ final class Stemmer {
          && dictionary.hasFlag(append, dictionary.onlyincompound)) {
        return false;
      }
    } else if (dictionary.hasFlag(append, dictionary.onlyincompound)) {
      return false;
    }
    if (recursionDepth == 0) {
-      // check if affix is allowed in a non-compound word
+      return true;
      return context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound);
    }
    if (dictionary.isCrossProduct(affix)) {
      // cross check incoming continuation class (flag of previous affix) against list.
-      if (context.isCompound() || !dictionary.hasFlag(append, dictionary.onlyincompound)) {
+      return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
        return previousWasPrefix || dictionary.hasFlag(append, prevFlag);
      }
    }
    return false;
@ -640,18 +632,10 @@ final class Stemmer {
            }
          }
-          if (!context.isCompound() && dictionary.hasFlag(entryId, dictionary.onlyincompound)) {
+          if (!isRootCompatibleWithContext(context, affix, entryId)) {
            continue;
          }
-          if (context.isCompound()) {
+
            char cFlag = context.requiredFlag(dictionary);
            if (!dictionary.hasFlag(entryId, cFlag)
                && !isFlagAppendedByAffix(affix, cFlag)
                && !dictionary.hasFlag(entryId, dictionary.compoundFlag)
                && !isFlagAppendedByAffix(affix, dictionary.compoundFlag)) {
              continue;
            }
          }
          if (!callProcessor(strippedWord, offset, length, processor, forms, i)) {
            return false;
          }
@ -704,6 +688,20 @@ final class Stemmer {
    return true;
  }
  /**
   * Decides whether a dictionary entry may be used as a root in the given word context.
   *
   * <ul>
   *   <li>Outside compounds, the entry is accepted unless it carries the {@code onlyincompound}
   *       flag.
   *   <li>In the {@link WordContext#COMPOUND_RULE_END} context, every entry is accepted here.
   *   <li>In any other compound context, the entry is accepted only if either the flag returned
   *       by {@link WordContext#requiredFlag} or the dictionary's {@code compoundFlag} is present
   *       on the entry itself or is reported by {@code isFlagAppendedByAffix} for {@code
   *       lastAffix}.
   * </ul>
   *
   * @param context the position of the root within the word being analyzed
   * @param lastAffix the affix handed to {@code isFlagAppendedByAffix}; one caller passes {@code
   *     -1}, presumably meaning "no affix" (confirm against that helper)
   * @param entryId the id of the dictionary entry whose flags are inspected
   * @return {@code true} if the entry is usable as a root in {@code context}
   */
  private boolean isRootCompatibleWithContext(WordContext context, int lastAffix, int entryId) {
    if (!context.isCompound()) {
      // A standalone word must not be built from a root reserved for compounds.
      return !dictionary.hasFlag(entryId, dictionary.onlyincompound);
    }
    if (context == WordContext.COMPOUND_RULE_END) {
      // No compound flag is demanded from the root in this context.
      return true;
    }

    char positionFlag = context.requiredFlag(dictionary);
    if (dictionary.hasFlag(entryId, positionFlag)
        || isFlagAppendedByAffix(lastAffix, positionFlag)) {
      return true;
    }
    return dictionary.hasFlag(entryId, dictionary.compoundFlag)
        || isFlagAppendedByAffix(lastAffix, dictionary.compoundFlag);
  }
  private boolean callProcessor(
      char[] word, int offset, int length, RootProcessor processor, IntsRef forms, int i) {
    CharsRef stem = new CharsRef(word, offset, length);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
@ -17,15 +17,35 @@
 package org.apache.lucene.analysis.hunspell;
 enum WordContext {
  /** non-compound */
  SIMPLE_WORD,
  /** The first root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
  COMPOUND_BEGIN,
  /** A middle root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
  COMPOUND_MIDDLE,
-  COMPOUND_END;
+
  /** The final root in a word with COMPOUNDFLAG/BEGIN/MIDDLE/END compounding */
  COMPOUND_END,
  /**
   * The final root in a word with COMPOUNDRULE compounding. The difference to {@link #COMPOUND_END}
   * is that this context doesn't require COMPOUNDFLAG/COMPOUNDEND flags, but allows ONLYINCOMPOUND.
   */
  COMPOUND_RULE_END;
  /** Returns {@code true} for every context except {@link #SIMPLE_WORD}. */
  boolean isCompound() {
    switch (this) {
      case SIMPLE_WORD:
        return false;
      default:
        return true;
    }
  }
  /**
   * Tells whether an affix of the given kind is acceptable in this context without any extra
   * permission: a prefix only in {@link #COMPOUND_BEGIN}, a suffix only in {@link #COMPOUND_END}
   * or {@link #COMPOUND_RULE_END}.
   *
   * @param isPrefix {@code true} to ask about a prefix, {@code false} to ask about a suffix
   * @return whether the affix kind is allowed at this position by default
   */
  boolean isAffixAllowedWithoutSpecialPermit(boolean isPrefix) {
    return isPrefix
        ? this == COMPOUND_BEGIN
        : this == COMPOUND_END || this == COMPOUND_RULE_END;
  }
  char requiredFlag(Dictionary dictionary) {
    switch (this) {
      case COMPOUND_BEGIN:
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.aff
@ -5,3 +5,6 @@ ONLYINCOMPOUND c
 COMPOUNDRULE 2
 COMPOUNDRULE n*1t
 COMPOUNDRULE n*mp
 SFX S Y 1
 SFX S 0 s
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.dic
@ -9,7 +9,7 @@
 7/nm
 8/nm
 9/nm
-0th/pt
+0th/ptS
 1st/p
 1th/tc
 2nd/p
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compoundrule4.good
@ -28,4 +28,5 @@
 10001st
 10011th
 1ST
-42ND
+42ND
 10ths
 /nm
 /nm
 /nm
-th/pt
+th/ptS
 st/p
 th/tc
 nd/p
 st
 th
 ST
 ND
+ths