LUCENE-9699: Support German-like compound words (#2248)

2021-01-27 22:31:58 +01:00 · 2021-01-27 22:31:58 +01:00 · a176308aa6
parent 38ec2602ce
commit a176308aa6
9 changed files with 329 additions and 31 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -141,8 +141,9 @@ public class Dictionary {
  char keepcase;
  char needaffix;
  char forbiddenword;
-  char onlyincompound;
+  char onlyincompound, compoundBegin, compoundMiddle, compoundEnd, compoundPermit;
-  int compoundMin = 3;
+  boolean checkCompoundCase;
  int compoundMin = 3, compoundMax = Integer.MAX_VALUE;
  List<CompoundRule> compoundRules; // nullable
  // ignored characters (dictionary, affix, inputs)
@ -377,8 +378,20 @@ public class Dictionary {
        forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDMIN".equals(firstWord)) {
        compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
      } else if ("COMPOUNDWORDMAX".equals(firstWord)) {
        compoundMax = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
      } else if ("COMPOUNDRULE".equals(firstWord)) {
        compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
      } else if ("COMPOUNDBEGIN".equals(firstWord)) {
        compoundBegin = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDMIDDLE".equals(firstWord)) {
        compoundMiddle = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDEND".equals(firstWord)) {
        compoundEnd = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("COMPOUNDPERMITFLAG".equals(firstWord)) {
        compoundPermit = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("CHECKCOMPOUNDCASE".equals(firstWord)) {
        checkCompoundCase = true;
      }
    }
@ -1303,10 +1316,6 @@ public class Dictionary {
    }
  }
  /** Returns {@code true} when {@code compoundRules} is non-null. */
  boolean hasCompounding() {
    return compoundRules != null;
  }
  /**
   * Checks whether the flag set stored under {@code entryId} contains {@code flag}.
   *
   * @param entryId identifier passed to {@code decodeFlags} to obtain the flags to inspect
   * @param flag the flag to look for; {@code FLAG_UNSET} never matches
   * @param scratch reusable buffer handed to {@code decodeFlags}
   * @return {@code true} if {@code flag} is set and present among the decoded flags
   */
  boolean hasFlag(int entryId, char flag, BytesRef scratch) {
    if (flag == FLAG_UNSET) {
      // an unset flag matches nothing, so skip decoding altogether
      return false;
    }
    return hasFlag(decodeFlags(entryId, scratch), flag);
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -87,18 +87,54 @@ public class SpellChecker {
      return false;
    }
-    if (!stemmer.doStem(wordChars, length, caseVariant).isEmpty()) {
+    if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
      return true;
    }
-    if (dictionary.hasCompounding()) {
+    if (dictionary.compoundRules != null
-      return checkCompounds(wordChars, 0, length, new ArrayList<>());
+        && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
      return true;
    }
    return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
  }
  /**
   * Tells whether the stemmer yields at least one stem for the given slice of {@code chars}.
   *
   * @param chars buffer holding the word
   * @param offset start of the slice inside {@code chars}
   * @param length number of characters in the slice
   * @param caseVariant forwarded unchanged to {@code Stemmer.doStem}
   * @param context the position (simple word or compound part) the slice is checked for
   * @return {@code true} if {@code doStem} returned a non-empty list
   */
  private boolean hasStems(
      char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
    boolean noStems = stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
    return !noStems;
  }
  /**
   * Tries to split {@code chars[offset, offset + length)} into compound parts accepted by the
   * stemmer.
   *
   * <p>Every break position that leaves at least {@code compoundMin} characters on both sides is
   * tried in ascending order. The head part must pass {@link #checkCompoundCase} at the break and
   * have stems as {@code COMPOUND_BEGIN} (at depth 0) or {@code COMPOUND_MIDDLE} (deeper). The
   * remainder is accepted either directly as {@code COMPOUND_END} or by splitting it recursively.
   *
   * @param chars buffer holding the word
   * @param offset start of the not yet analyzed part
   * @param length length of the not yet analyzed part
   * @param caseVariant forwarded unchanged to the stem lookups
   * @param depth number of parts already split off before {@code offset}
   * @return {@code true} if a valid split exists; {@code false} otherwise, including when {@code
   *     depth} exceeds {@code compoundMax - 2}
   */
  private boolean checkCompounds(
      char[] chars, int offset, int length, boolean caseVariant, int depth) {
    // this call adds a head and at least one tail part on top of the `depth` previous ones
    if (depth > dictionary.compoundMax - 2) {
      return false;
    }

    WordContext headContext = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
    int limit = length - dictionary.compoundMin + 1;
    for (int headLength = dictionary.compoundMin; headLength < limit; headLength++) {
      int tailStart = offset + headLength;
      if (!checkCompoundCase(chars, tailStart)) {
        continue;
      }
      if (!hasStems(chars, offset, headLength, caseVariant, headContext)) {
        continue;
      }

      int tailLength = length - headLength;
      // prefer closing the compound here; otherwise keep splitting the tail
      if (hasStems(chars, tailStart, tailLength, caseVariant, WordContext.COMPOUND_END)
          || checkCompounds(chars, tailStart, tailLength, caseVariant, depth + 1)) {
        return true;
      }
    }
    return false;
  }
-  private boolean checkCompounds(char[] wordChars, int offset, int length, List<IntsRef> words) {
+  private boolean checkCompoundCase(char[] chars, int breakPos) {
    if (!dictionary.checkCompoundCase) return true;
    return Character.isUpperCase(chars[breakPos - 1]) == Character.isUpperCase(chars[breakPos]);
  }
  private boolean checkCompoundRules(
      char[] wordChars, int offset, int length, List<IntsRef> words) {
    if (words.size() >= 100) return false;
    int limit = length - dictionary.compoundMin + 1;
@ -113,7 +149,7 @@ public class SpellChecker {
            return true;
          }
-          if (checkCompounds(wordChars, offset + breakPos, length - breakPos, words)) {
+          if (checkCompoundRules(wordChars, offset + breakPos, length - breakPos, words)) {
            return true;
          }
        }
@ -132,8 +168,7 @@ public class SpellChecker {
    words.add(forms);
    boolean result =
-        dictionary.compoundRules != null
+        dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
            && dictionary.compoundRules.stream().anyMatch(r -> r.fullyMatches(words, scratch));
    words.remove(words.size() - 1);
    return result;
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -94,18 +94,18 @@ final class Stemmer {
    }
    WordCase wordCase = caseOf(word, length);
-    List<CharsRef> list = doStem(word, length, false);
+    List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
    if (wordCase == WordCase.UPPER) {
      caseFoldTitle(word, length);
      char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
      if (aposCase != null) {
-        list.addAll(doStem(aposCase, length, true));
+        list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
      }
-      list.addAll(doStem(titleBuffer, length, true));
+      list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
    }
    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
      caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
-      list.addAll(doStem(lowerBuffer, length, true));
+      list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
    }
    return list;
  }
@ -158,9 +158,10 @@ final class Stemmer {
    return null;
  }
-  List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
+  List<CharsRef> doStem(
      char[] word, int offset, int length, boolean caseVariant, WordContext context) {
    List<CharsRef> stems = new ArrayList<>();
-    IntsRef forms = dictionary.lookupWord(word, 0, length);
+    IntsRef forms = dictionary.lookupWord(word, offset, length);
    if (forms != null) {
      for (int i = 0; i < forms.length; i += formStep) {
        char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
@ -172,15 +173,32 @@ final class Stemmer {
          continue;
        }
        // we can't add this form, it only belongs inside a compound word
-        if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
+        if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
          continue;
        }
-        stems.add(newStem(word, 0, length, forms, i));
+        if (context.isCompound()
            && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
          continue;
        }
        stems.add(newStem(word, offset, length, forms, i));
      }
    }
    try {
      stems.addAll(
-          stem(word, 0, length, -1, (char) 0, -1, 0, true, true, false, false, caseVariant));
+          stem(
              word,
              offset,
              length,
              context,
              -1,
              (char) 0,
              -1,
              0,
              true,
              true,
              false,
              false,
              caseVariant));
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
@ -287,6 +305,7 @@ final class Stemmer {
      char[] word,
      int offset,
      int length,
      WordContext context,
      int previous,
      char prevFlag,
      int prefixId,
@ -328,7 +347,7 @@ final class Stemmer {
            continue;
          }
-          if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
+          if (isAffixCompatible(prefix, prevFlag, recursionDepth, true, false, context)) {
            char[] strippedWord = stripAffix(word, offset, length, i, prefix, true);
            if (strippedWord == null) {
              continue;
@ -340,6 +359,7 @@ final class Stemmer {
                    strippedWord,
                    pureAffix ? offset + i : 0,
                    pureAffix ? length - i : strippedWord.length,
                    context,
                    prefix,
                    -1,
                    recursionDepth,
@ -378,7 +398,8 @@ final class Stemmer {
            continue;
          }
-          if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
+          if (isAffixCompatible(
              suffix, prevFlag, recursionDepth, false, previousWasPrefix, context)) {
            char[] strippedWord = stripAffix(word, offset, length, length - i, suffix, false);
            if (strippedWord == null) {
              continue;
@ -390,6 +411,7 @@ final class Stemmer {
                    strippedWord,
                    pureAffix ? offset : 0,
                    pureAffix ? i : strippedWord.length,
                    context,
                    suffix,
                    prefixId,
                    recursionDepth,
@ -442,18 +464,31 @@ final class Stemmer {
  }
  private boolean isAffixCompatible(
-      int affix, char prevFlag, int recursionDepth, boolean previousWasPrefix) {
+      int affix,
      char prevFlag,
      int recursionDepth,
      boolean isPrefix,
      boolean previousWasPrefix,
      WordContext context) {
    int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
    if (context.isCompound() && dictionary.compoundPermit > 0) {
      WordContext allowed = isPrefix ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_END;
      if (context != allowed && !dictionary.hasFlag(append, dictionary.compoundPermit, scratch)) {
        return false;
      }
    }
    if (recursionDepth == 0) {
      // check if affix is allowed in a non-compound word
-      return !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
+      return context.isCompound()
          || !dictionary.hasFlag(append, dictionary.onlyincompound, scratch);
    }
    if (isCrossProduct(affix)) {
      // cross check incoming continuation class (flag of previous affix) against list.
      char[] appendFlags = dictionary.decodeFlags(append, scratch);
-      if (!Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
+      if (context.isCompound() || !Dictionary.hasFlag(appendFlags, dictionary.onlyincompound)) {
        return previousWasPrefix || Dictionary.hasFlag(appendFlags, prevFlag);
      }
    }
@ -491,8 +526,9 @@ final class Stemmer {
  /**
   * Applies the affix rule to the given word, producing a list of stems if any are found
   *
-   * @param strippedWord Word the affix has been removed and the strip added
+   * @param strippedWord Char array containing the word with the affix removed and the strip added
-   * @param length valid length of stripped word
+   * @param offset where the word actually starts in the array
   * @param length the length of the stripped word
   * @param affix HunspellAffix representing the affix rule itself
   * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
   *     suffix, unless both are compatible so we must check dictionary form against both to add it
@ -505,6 +541,7 @@ final class Stemmer {
      char[] strippedWord,
      int offset,
      int length,
      WordContext context,
      int affix,
      int prefixId,
      int recursionDepth,
@ -546,10 +583,15 @@ final class Stemmer {
          if (!acceptCase(caseVariant, wordFlags)) {
            continue;
          }
-          // we aren't decompounding (yet)
+          if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
          if (Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
            continue;
          }
          if (context.isCompound()) {
            char cFlag = context.requiredFlag(dictionary);
            if (!Dictionary.hasFlag(wordFlags, cFlag) && !isFlagAppendedByAffix(affix, cFlag)) {
              continue;
            }
          }
          stems.add(newStem(strippedWord, offset, length, forms, i));
        }
      }
@ -594,6 +636,7 @@ final class Stemmer {
              strippedWord,
              offset,
              length,
              context,
              affix,
              flag,
              prefixId,
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordContext.java
@ -0,0 +1,41 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 /**
  * The position a word is being checked for: either on its own or as a part of a compound word.
  */
 enum WordContext {
  /** A standalone word, not a compound part. */
  SIMPLE_WORD,
  /** The first part of a compound. */
  COMPOUND_BEGIN,
  /** A part of a compound that is neither first nor last. */
  COMPOUND_MIDDLE,
  /** The last part of a compound. */
  COMPOUND_END;

  /** Returns {@code true} for every constant except {@link #SIMPLE_WORD}. */
  boolean isCompound() {
    return this != SIMPLE_WORD;
  }

  /**
   * Picks the dictionary flag matching this compound position.
   *
   * @param dictionary source of the {@code compoundBegin}, {@code compoundMiddle} and {@code
   *     compoundEnd} flags; not accessed for {@link #SIMPLE_WORD}
   * @return the corresponding compound flag, or {@code Dictionary.FLAG_UNSET} for {@link
   *     #SIMPLE_WORD}
   */
  char requiredFlag(Dictionary dictionary) {
    if (this == COMPOUND_BEGIN) {
      return dictionary.compoundBegin;
    }
    if (this == COMPOUND_MIDDLE) {
      return dictionary.compoundMiddle;
    }
    if (this == COMPOUND_END) {
      return dictionary.compoundEnd;
    }
    return Dictionary.FLAG_UNSET;
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -80,6 +80,10 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("compoundrule8");
  }
  /** Runs {@link #doTest(String)} against the {@code germancompounding} test resources. */
  public void testGermanCompounding() throws Exception {
    doTest("germancompounding");
  }
  protected void doTest(String name) throws Exception {
    InputStream affixStream =
        Objects.requireNonNull(getClass().getResourceAsStream(name + ".aff"), name);
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.aff
@ -0,0 +1,91 @@
 # German compounding
 # handle special casing of German sharp s
 CHECKSHARPS
 # compound flags
 COMPOUNDBEGIN U
 COMPOUNDMIDDLE V
 COMPOUNDEND W
 # Prefixes are allowed at the beginning of compounds,
 # suffixes are allowed at the end of compounds by default:
 # (prefix)?(root)+(affix)?
 # Affixes with COMPOUNDPERMITFLAG may be inside of compounds.
 COMPOUNDPERMITFLAG P
 # for German linking elements (Fugenelemente, also called Fugenmorpheme)
 # Hint: ONLYINCOMPOUND is not required everywhere, but the
 # checking will be a little faster with it.
 ONLYINCOMPOUND X
 # forbid uppercase characters at compound word bounds
 CHECKCOMPOUNDCASE
 # for handling Fuge-elements with dashes (Arbeits-) 
 # dash will be a special word
 COMPOUNDMIN 1
 WORDCHARS -
 # compound settings and linking element (Fugenelement) for `Arbeit'
 SFX A Y 3
 SFX A 0 s/UPX .
 SFX A 0 s/VPDX .
 SFX A 0 0/WXD .
 SFX B Y 2
 SFX B 0 0/UPX .
 SFX B 0 0/VWXDP .
 # a suffix for `Computer'
 SFX C Y 1
 SFX C 0 n/WD .
 # for forbid exceptions (*Arbeitsnehmer)
 FORBIDDENWORD Z
 # dash prefix for compounds with dash (Arbeits-Computer)
 PFX - Y 1
 PFX - 0 -/P .
 # decapitalizing prefix
 # circumfix for positioning in compounds
 PFX D Y 29
 PFX D A a/PX A
 PFX D Ä ä/PX Ä
 PFX D B b/PX B
 PFX D C c/PX C
 PFX D D d/PX D
 PFX D E e/PX E
 PFX D F f/PX F
 PFX D G g/PX G
 PFX D H h/PX H
 PFX D I i/PX I
 PFX D J j/PX J
 PFX D K k/PX K
 PFX D L l/PX L
 PFX D M m/PX M
 PFX D N n/PX N
 PFX D O o/PX O
 PFX D Ö ö/PX Ö
 PFX D P p/PX P
 PFX D Q q/PX Q
 PFX D R r/PX R
 PFX D S s/PX S
 PFX D T t/PX T
 PFX D U u/PX U
 PFX D Ü ü/PX Ü
 PFX D V v/PX V
 PFX D W w/PX W
 PFX D X x/PX X
 PFX D Y y/PX Y
 PFX D Z z/PX Z
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.dic
@ -0,0 +1,5 @@
 4
 Arbeit/A-
 Computer/BC-
 -/W
 Arbeitsnehmer/Z
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.good
@ -0,0 +1,20 @@
 Computer
 Computern
 Arbeit
 Arbeits-
 Computerarbeit
 Computerarbeits-
 Arbeitscomputer
 Computercomputer
 Computercomputern
 Arbeitscomputern
 Computerarbeitscomputer
 Computerarbeitscomputern
 Arbeitscomputercomputer
 Computercomputerarbeit
 Arbeitscomputerarbeit
 Arbeitsarbeitsarbeit
 Computerarbeitsarbeit
 Computerarbeits-Computer
 Computerarbeits-Computern
 Computer-Arbeit
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germancompounding.wrong
@ -0,0 +1,50 @@
 computer
 computern
 arbeit
 Arbeits
 arbeits
 ComputerArbeit
 ComputernArbeit
 Computernarbeit
 ComputerArbeits
 Arbeitcomputer
 Arbeitcomputern
 ArbeitsComputer
 ArbeitsComputern
 Computerarbeitcomputer
 ComputerArbeitcomputer
 ComputerArbeitscomputer
 Computerarbeitcomputern
 ComputerArbeitcomputern
 ComputerArbeitscomputern
 Arbeitscomputerarbeits
 Arbeitscomputernarbeits
 Computerarbeits-computer
 Arbeitsnehmer
 computers
 computern
 computernarbeit
 computernArbeit
 computerArbeit
 computerArbeits
 arbeitcomputer
 arbeitsComputer
 computerarbeitcomputer
 computerArbeitcomputer
 computerArbeitscomputer
 arbeitscomputerarbeits
 computerarbeits-computer
 arbeitsnehmer
 computernarbeit
 computernArbeit
 arbeits-
 computerarbeit
 computerarbeits-
 arbeitscomputer
 arbeitscomputern
 computerarbeitscomputer
 computerarbeitscomputern
 computerarbeitscomputers
 arbeitscomputerarbeit
 computerarbeits-Computer
 computerarbeits-Computern