LUCENE-9699: Support German-like compound words (#2248)

2021-01-27 22:31:58 +01:00 · 2021-01-27 22:31:58 +01:00 · a176308aa6
parent 38ec2602ce
commit a176308aa6
9 changed files with 329 additions and 31 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -141,8 +141,9 @@ public class Dictionary {
  char keepcase;
  char needaffix;
  char forbiddenword;
-  char onlyincompound;
-  int compoundMin = 3;
+  char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
+  boolean checkCompoundCase;
+  int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
  List<CompoundRule> compoundRules; // nullable

  // ignored characters (dictionary, affix, inputs)
@ -377,8 +378,20 @@ public class Dictionary {
        forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDMIN".equals(firstWord)) {
        compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
+      } else if ("COMPOUNDWORDMAX".equals(firstWord)) {
+        compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
      } else if ("COMPOUNDRULE".equals(firstWord)) {
        compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
+      } else if ("COMPOUNDBEGIN".equals(firstWord)) {
+        compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("COMPOUNDMIDDLE".equals(firstWord)) {
+        compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("COMPOUNDEND".equals(firstWord)) {
+        compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
+        compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
+        checkCompoundCase = true;
      }
    }

@ -1303,10 +1316,6 @@ public class Dictionary {
    }
  }

-  boolean hasCompounding() {
-    return compoundRules != null;
-  }
-
  boolean hasFlag(int entryId, char flag, BytesRef scratch) {
    return flag != FLAG_UNSET && hasFlag(decodeFlags(entryId, scratch), flag);
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -87,18 +87,54 @@ public class SpellChecker {
      return false;
    }

-    if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
+    if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
      return true;
    }

-    if (dictionary.hasCompounding()) {
-      return checkCompounds(wordChars, 0, length, new ArrayList<>());
+    if (dictionary.compoundRules != null
+        && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
+      return true;
+    }
+
+    return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
+  }
+
+  private boolean hasStems(
+      char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
+    return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
+  }
+
+  private boolean checkCompounds(
+      char[] chars, int offset, int length, boolean caseVariant, int depth) {
+    if (depth > dictionary.compoundMax - 2) return false;
+
+    int limit = length - dictionary.compoundMin + 1;
+    for (int breakPos = dictionary.compoundMin; breakPos < limit; breakPos++) {
+      WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
+      int breakOffset = offset + breakPos;
+      if (checkCompoundCase(chars, breakOffset)
+          && hasStems(chars, offset, breakPos, caseVariant, context)) {
+        int remainingLength = length - breakPos;
+        if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
+          return true;
+        }
+
+        if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
+          return true;
+        }
+      }
    }

    return false;
  }

-  private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
+  private boolean checkCompoundCase(char[] chars, int breakPos) {
+    if (!dictionary.checkCompoundCase) return true;
+    return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
+  }
+
+  private boolean checkCompoundRules(
+      char[] wordChars, int offset, int length, List<IntsRef> words) {
    if (words.size() >= 100) return false;

    int limit = length - dictionary.compoundMin + 1;
@ -113,7 +149,7 @@ public class SpellChecker {
            return true;
          }

-          if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
+          if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
            return true;
          }
        }
@ -132,8 +168,7 @@ public class SpellChecker {

    words.add(forms);
    boolean result =
-        dictionary.compoundRules != null
-            && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
+        dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
    words.remove(words.size() - 1);
    return result;
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -94,18 +94,18 @@ final class Stemmer {
    }

    WordCase wordCase = caseOf(word, length);
-    List<CharsRef> list = doStem(word, length, false);
+    List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
    if (wordCase == WordCase.UPPER) {
      caseFoldTitle(word, length);
      char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
      if (aposCase != null) {
-        list.addAll(doStem(aposCase, length, true));
+        list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
      }
-      list.addAll(doStem(titleBuffer, length, true));
+      list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
    }
    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
      caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
-      list.addAll(doStem(lowerBuffer, length, true));
+      list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
    }
    return list;
  }
@ -158,9 +158,10 @@ final class Stemmer {
    return null;
  }

-  List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
+  List<CharsRef> doStem(
+      char[] word, int offset, int length, boolean caseVariant, WordContext context) {
    List<CharsRef> stems = new ArrayList<>();
-    IntsRef forms = dictionary.lookupWord(word, 0, length);
+    IntsRef forms = dictionary.lookupWord(word, offset, length);
    if (forms != null) {
      for (int i = 0; i < forms.length; i += formStep) {
        char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
@ -172,15 +173,32 @@ final class Stemmer {
          continue;
        }
        // we can't add this form, it only belongs inside a compound word
-        if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
+        if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
          continue;
        }
-        stems.add(newStem(word, 0, length, forms, i));
+        if (context.isCompound()
+            && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
+          continue;
+        }
+        stems.add(newStem(word, offset, length, forms, i));
      }
    }
    try {
      stems.addAll(
-          stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
+          stem(
+              word,
+              offset,
+              length,
+              context,
+              -1,
+              (char) 0,
+              -1,
+              0,
+              true,
+              true,
+              false,
+              false,
+              caseVariant));
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
@ -287,6 +305,7 @@ final class Stemmer {
      char[] word,
      int offset,
      int length,
+      WordContext context,
      int previous,
      char prevFlag,
      int prefixId,
@ -328,7 +347,7 @@ final class Stemmer {
            continue;
          }

-          if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
+          if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
            char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
            if (strippedWord == null) {
              continue;
@ -340,6 +359,7 @@ final class Stemmer {
                    strippedWord,
                    pureAffix ? offset + i : 0,
                    pureAffix ? length - i : strippedWord.length,
+                    context,
                    prefix,
                    -1,
                    recursionDepth,
@ -378,7 +398,8 @@ final class Stemmer {
            continue;
          }

-          if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
+          if (isAffixCompatible(
+              suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
            char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
            if (strippedWord == null) {
              continue;
@ -390,6 +411,7 @@ final class Stemmer {
                    strippedWord,
                    pureAffix ? offset : 0,
                    pureAffix ? i : strippedWord.length,
+                    context,
                    suffix,
                    prefixId,
                    recursionDepth,
@ -442,18 +464,31 @@ final class Stemmer {
  }

  private boolean isAffixCompatible(
-      int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) {
+      int affix,
+      char prevFlag,
+      int recursionDepth,
+      boolean isPrefix,
+      boolean previousWasPrefix,
+      WordContext context) {
    int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);

+    if (context.isCompound() && dictionary.compoundPermit > 0) {
+      WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
+      if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
+        return false;
+      }
+    }
+
    if (recursionDepth == 0) {
      // check if affix is allowed in a non-compound word
-      return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
+      return context.isCompound()
+          || !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
    }

    if (isCrossProduct(affix)) {
      // cross check incoming continuation class (flag of previous affix) against list.
      char[] appendFlags = dictionary.decodeFlags(append, scratch);
-      if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
+      if (context.isCompound() || !Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
        return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag);
      }
    }
@ -491,8 +526,9 @@ final class Stemmer {
  /**
   * Applies the affix rule to the given word, producing a list of stems if any are found
   *
-   * @param strippedWord Word the affix has been removed and the strip added
-   * @param length valid length of stripped word
+   * @param strippedWord Char array containing the word with the affix removed and the strip added
+   * @param offset where the word actually starts in the array
+   * @param length the length of the stripped word
   * @param affix HunspellAffix representing the affix rule itself
   * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
   *     suffix, unless both are compatible so we must check dictionary form against both to add it
@ -505,6 +541,7 @@ final class Stemmer {
      char[] strippedWord,
      int offset,
      int length,
+      WordContext context,
      int affix,
      int prefixId,
      int recursionDepth,
@ -546,10 +583,15 @@ final class Stemmer {
          if (!acceptCase(caseVariant, wordFlags)) {
            continue;
          }
-          // we aren't decompounding (yet)
-          if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
+          if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
            continue;
          }
+          if (context.isCompound()) {
+            char cFlag = context.requiredFlag(dictionary);
+            if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
+              continue;
+            }
+          }
          stems.add(newStem(strippedWord, offset, length, forms, i));
        }
      }
@ -594,6 +636,7 @@ final class Stemmer {
              strippedWord,
              offset,
              length,
+              context,
              affix,
              flag,
              prefixId,
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+enum WordContext {
+  SIMPLE_WORD,
+  COMPOUND_BEGIN,
+  COMPOUND_MIDDLE,
+  COMPOUND_END;
+
+  boolean isCompound() {
+    return this != SIMPLE_WORD;
+  }
+
+  char requiredFlag(Dictionary dictionary) {
+    switch (this) {
+      case COMPOUND_BEGIN:
+        return dictionary.compoundBegin;
+      case COMPOUND_MIDDLE:
+        return dictionary.compoundMiddle;
+      case COMPOUND_END:
+        return dictionary.compoundEnd;
+      default:
+        return Dictionary.FLAG_UNSET;
+    }
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -80,6 +80,10 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("compoundrule8");
  }

+  public void testGermanCompounding() throws Exception {
+    doTest("germancompounding");
+  }
+
  protected void doTest(String name) throws Exception {
    InputStream affixStream =
        Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff
@ -0,0 +1,91 @@
+# German compounding
+
+# handle special casing of German sharp s
+
+CHECKSHARPS
+
+# compound flags
+
+COMPOUNDBEGIN U
+COMPOUNDMIDDLE V
+COMPOUNDEND W
+
+# Prefixes are allowed at the beginning of compounds,
+# suffixes are allowed at the end of compounds by default:
+# (prefix)?(root)+(suffix)?
+# Affixes with COMPOUNDPERMITFLAG may be inside of compounds.
+COMPOUNDPERMITFLAG P
+
+# for German linking morphemes (Fuge-elements)
+# Hint: ONLYINCOMPOUND is not required everywhere, but the
+# checking will be a little faster with it.
+
+ONLYINCOMPOUND X
+
+# forbid uppercase characters at compound word boundaries
+CHECKCOMPOUNDCASE
+
+# for handling Fuge-elements with dashes (Arbeits-):
+# the dash will be a special word
+
+COMPOUNDMIN 1
+WORDCHARS -
+
+# compound settings and linking morpheme (Fuge-element) for `Arbeit'
+
+SFX A Y 3
+SFX A 0 s/UPX .
+SFX A 0 s/VPDX .
+SFX A 0 0/WXD .
+
+SFX B Y 2
+SFX B 0 0/UPX .
+SFX B 0 0/VWXDP .
+
+# a suffix for `Computer'
+
+SFX C Y 1
+SFX C 0 n/WD .
+
+# for forbidding exceptions (*Arbeitsnehmer)
+
+FORBIDDENWORD Z
+
+# dash prefix for compounds with dash (Arbeits-Computer)
+
+PFX - Y 1
+PFX - 0 -/P .
+
+# decapitalizing prefix
+# circumfix for positioning in compounds
+
+PFX D Y 29
+PFX D A a/PX A
+PFX D Ä ä/PX Ä
+PFX D B b/PX B
+PFX D C c/PX C
+PFX D D d/PX D
+PFX D E e/PX E
+PFX D F f/PX F
+PFX D G g/PX G
+PFX D H h/PX H
+PFX D I i/PX I
+PFX D J j/PX J
+PFX D K k/PX K
+PFX D L l/PX L
+PFX D M m/PX M
+PFX D N n/PX N
+PFX D O o/PX O
+PFX D Ö ö/PX Ö
+PFX D P p/PX P
+PFX D Q q/PX Q
+PFX D R r/PX R
+PFX D S s/PX S
+PFX D T t/PX T
+PFX D U u/PX U
+PFX D Ü ü/PX Ü
+PFX D V v/PX V
+PFX D W w/PX W
+PFX D X x/PX X
+PFX D Y y/PX Y
+PFX D Z z/PX Z
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic
@ -0,0 +1,5 @@
+4
+Arbeit/A-
+Computer/BC-
+-/W
+Arbeitsnehmer/Z
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good
@ -0,0 +1,20 @@
+Computer
+Computern
+Arbeit
+Arbeits-
+Computerarbeit
+Computerarbeits-
+Arbeitscomputer
+Computercomputer
+Computercomputern
+Arbeitscomputern
+Computerarbeitscomputer
+Computerarbeitscomputern
+Arbeitscomputercomputer
+Computercomputerarbeit
+Arbeitscomputerarbeit
+Arbeitsarbeitsarbeit
+Computerarbeitsarbeit
+Computerarbeits-Computer
+Computerarbeits-Computern
+Computer-Arbeit
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong
@ -0,0 +1,50 @@
+computer
+computern
+arbeit
+Arbeits
+arbeits
+ComputerArbeit
+ComputernArbeit
+Computernarbeit
+ComputerArbeits
+Arbeitcomputer
+Arbeitcomputern
+ArbeitsComputer
+ArbeitsComputern
+Computerarbeitcomputer
+ComputerArbeitcomputer
+ComputerArbeitscomputer
+Computerarbeitcomputern
+ComputerArbeitcomputern
+ComputerArbeitscomputern
+Arbeitscomputerarbeits
+Arbeitscomputernarbeits
+Computerarbeits-computer
+Arbeitsnehmer
+computers
+computern
+computernarbeit
+computernArbeit
+computerArbeit
+computerArbeits
+arbeitcomputer
+arbeitsComputer
+computerarbeitcomputer
+computerArbeitcomputer
+computerArbeitscomputer
+arbeitscomputerarbeits
+computerarbeits-computer
+arbeitsnehmer
+computernarbeit
+computernArbeit
+arbeits-
+computerarbeit
+computerarbeits-
+arbeitscomputer
+arbeitscomputern
+computerarbeitscomputer
+computerarbeitscomputern
+computerarbeitscomputers
+arbeitscomputerarbeit
+computerarbeits-Computer
+computerarbeits-Computern