LUCENE-9704: Hunspell: support capitalization for German ß (#2260)

2021-01-29 10:03:37 +01:00 · 2021-01-29 10:03:37 +01:00 · 6635d7a5e7
parent 71705c900b
commit 6635d7a5e7
10 changed files with 211 additions and 41 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -133,6 +133,7 @@ public class Dictionary {
  boolean hasStemExceptions;

  boolean ignoreCase;
+  boolean checkSharpS;
  boolean complexPrefixes;
  // if no affixes have continuation classes, no need to do 2-level affix stripping
  boolean twoStageAffix;
@ -353,6 +354,8 @@ public class Dictionary {
        needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
      } else if ("ONLYINCOMPOUND".equals(firstWord)) {
        onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
+      } else if ("CHECKSHARPS".equals(firstWord)) {
+        checkSharpS = true;
      } else if ("IGNORE".equals(firstWord)) {
        ignore = singleArgument(reader, line).toCharArray();
        Arrays.sort(ignore);
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@ -61,7 +61,7 @@ public class SpellChecker {
      return false;
    }

-    if (checkWord(wordChars, wordChars.length, false)) {
+    if (checkWord(wordChars, wordChars.length, null)) {
      return true;
    }

@ -89,23 +89,39 @@ public class SpellChecker {
    char[] caseVariant = wordChars;
    if (wordCase == WordCase.UPPER) {
      caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
-      if (checkWord(caseVariant, wordChars.length, true)) {
+      if (checkWord(caseVariant, wordChars.length, wordCase)) {
        return true;
      }
      char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
-      if (aposCase != null && checkWord(aposCase, aposCase.length, true)) {
+      if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
+        return true;
+      }
+      for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
+        if (checkWord(variation, variation.length, null)) {
          return true;
        }
      }
-    return checkWord(stemmer.caseFoldLower(caseVariant, wordChars.length), wordChars.length, true);
+    }
+    char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
+    if (checkWord(lower, wordChars.length, wordCase)) {
+      return true;
+    }
+    if (wordCase == WordCase.UPPER) {
+      for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
+        if (checkWord(variation, variation.length, null)) {
+          return true;
+        }
+      }
+    }
+    return false;
  }

-  private boolean checkWord(char[] wordChars, int length, boolean caseVariant) {
+  private boolean checkWord(char[] wordChars, int length, WordCase originalCase) {
    if (dictionary.isForbiddenWord(wordChars, length, scratch)) {
      return false;
    }

-    if (hasStems(wordChars, 0, length, caseVariant, WordContext.SIMPLE_WORD)) {
+    if (hasStems(wordChars, 0, length, originalCase, WordContext.SIMPLE_WORD)) {
      return true;
    }

@ -114,16 +130,16 @@ public class SpellChecker {
      return true;
    }

-    return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, caseVariant, 0);
+    return dictionary.compoundBegin > 0 && checkCompounds(wordChars, 0, length, originalCase, 0);
  }

+  /**
+   * Tells whether {@code stemmer.doStem} finds at least one stem for the {@code length} chars of
+   * {@code chars} starting at {@code offset}, analyzed in the given word context.
+   */
  private boolean hasStems(
-      char[] chars, int offset, int length, boolean caseVariant, WordContext context) {
-    return !stemmer.doStem(chars, offset, length, caseVariant, context).isEmpty();
+      char[] chars, int offset, int length, WordCase originalCase, WordContext context) {
+    return !stemmer.doStem(chars, offset, length, originalCase, context).isEmpty();
  }

  private boolean checkCompounds(
-      char[] chars, int offset, int length, boolean caseVariant, int depth) {
+      char[] chars, int offset, int length, WordCase originalCase, int depth) {
    if (depth > dictionary.compoundMax - 2) return false;

    int limit = length - dictionary.compoundMin + 1;
@ -131,13 +147,13 @@ public class SpellChecker {
      WordContext context = depth == 0 ? WordContext.COMPOUND_BEGIN : WordContext.COMPOUND_MIDDLE;
      int breakOffset = offset + breakPos;
      if (checkCompoundCase(chars, breakOffset)
-          && hasStems(chars, offset, breakPos, caseVariant, context)) {
+          && hasStems(chars, offset, breakPos, originalCase, context)) {
        int remainingLength = length - breakPos;
-        if (hasStems(chars, breakOffset, remainingLength, caseVariant, WordContext.COMPOUND_END)) {
+        if (hasStems(chars, breakOffset, remainingLength, originalCase, WordContext.COMPOUND_END)) {
          return true;
        }

-        if (checkCompounds(chars, breakOffset, remainingLength, caseVariant, depth + 1)) {
+        if (checkCompounds(chars, breakOffset, remainingLength, originalCase, depth + 1)) {
          return true;
        }
      }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -20,6 +20,8 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
@ -99,20 +101,32 @@ final class Stemmer {
    }

    WordCase wordCase = caseOf(word, length);
-    List<CharsRef> list = doStem(word, 0, length, false, WordContext.SIMPLE_WORD);
+    List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
+    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
+      addCaseVariations(word, length, wordCase, list);
+    }
+    return list;
+  }
+
+  /**
+   * Appends to {@code list} the stems found for re-cased forms of {@code word}.
+   *
+   * <p>For an all-upper-case word, the capitalized-after-apostrophe form (when there is one), the
+   * title-folded form and the sharp-s variations of the title-folded form are stemmed first. Then
+   * the lower-folded form is stemmed and, again only for an all-upper-case word, its sharp-s
+   * variations. Sharp-s variations are stemmed with a {@code null} original case; the other forms
+   * are stemmed with {@code wordCase}.
+   */
+  private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
    if (wordCase == WordCase.UPPER) {
      caseFoldTitle(word, length);
      char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
      if (aposCase != null) {
-        list.addAll(doStem(aposCase, 0, length, true, WordContext.SIMPLE_WORD));
+        list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
+      }
+      list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
+      for (char[] variation : sharpSVariations(titleBuffer, length)) {
+        list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
      }
-      list.addAll(doStem(titleBuffer, 0, length, true, WordContext.SIMPLE_WORD));
    }
-    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
    caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
-      list.addAll(doStem(lowerBuffer, 0, length, true, WordContext.SIMPLE_WORD));
+    list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
+    if (wordCase == WordCase.UPPER) {
+      for (char[] variation : sharpSVariations(lowerBuffer, length)) {
+        list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
+      }
    }
-    return list;
  }

  // temporary buffers for case variants
@ -163,14 +177,52 @@ final class Stemmer {
    return null;
  }

+  /**
+   * Enumerates spellings of {@code word} in which occurrences of "ss" are replaced by "ß".
+   *
+   * <p>Pairs are located left to right without overlap and only the first six take part in the
+   * enumeration; text after them is copied unchanged. Every combination of keeping "ss" or
+   * substituting "ß" at those pairs is generated, except the one that is identical to the input.
+   *
+   * @param word buffer holding the word; only the first {@code length} chars are read
+   * @param length number of meaningful chars in {@code word}
+   * @return the alternative spellings, or an empty list when the dictionary did not enable
+   *     CHECKSHARPS or the word contains no "ss"
+   */
+  List<char[]> sharpSVariations(char[] word, int length) {
+    if (!dictionary.checkSharpS) {
+      return Collections.emptyList();
+    }
+
+    // start offsets of the "ss" pairs that take part in the enumeration
+    int[] pairStarts = new int[6];
+    int pairCount = 0;
+    int scan = 0;
+    while (pairCount < pairStarts.length && scan < length - 1) {
+      if (word[scan] == 's' && word[scan + 1] == 's') {
+        pairStarts[pairCount++] = scan;
+        scan += 2;
+      } else {
+        scan++;
+      }
+    }
+    if (pairCount == 0) {
+      return Collections.emptyList();
+    }
+
+    // grow the variants right to left: begin with the untouched tail, then prepend each pair
+    int tailStart = pairStarts[pairCount - 1] + 2;
+    Stream<String> variants = Stream.of(new String(word, tailStart, length - tailStart));
+    for (int p = pairCount - 1; p >= 0; p--) {
+      int segmentStart = p == 0 ? 0 : pairStarts[p - 1] + 2;
+      String segment = new String(word, segmentStart, pairStarts[p] - segmentStart);
+      variants = variants.flatMap(tail -> Stream.of(segment + "ss" + tail, segment + "ß" + tail));
+    }
+
+    String original = new String(word, 0, length);
+    return variants
+        .filter(candidate -> !candidate.equals(original))
+        .map(String::toCharArray)
+        .collect(Collectors.toList());
+  }
+
  List<CharsRef> doStem(
-      char[] word, int offset, int length, boolean caseVariant, WordContext context) {
+      char[] word, int offset, int length, WordCase originalCase, WordContext context) {
    List<CharsRef> stems = new ArrayList<>();
    IntsRef forms = dictionary.lookupWord(word, offset, length);
    if (forms != null) {
      for (int i = 0; i < forms.length; i += formStep) {
        char[] wordFlags = dictionary.decodeFlags(forms.ints[forms.offset + i], scratch);
-        if (!acceptCase(caseVariant, wordFlags)) {
+        if (!acceptCase(originalCase, wordFlags, word, offset, length)) {
          continue;
        }
        // we can't add this form, it's a pseudostem requiring an affix
@ -203,17 +255,35 @@ final class Stemmer {
              true,
              false,
              false,
-              caseVariant));
+              originalCase));
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
    return stems;
  }

-  private boolean acceptCase(boolean caseVariant, char[] wordFlags) {
-    return caseVariant
-        ? !Dictionary.hasFlag(wordFlags, dictionary.keepcase)
-        : !Dictionary.hasHiddenFlag(wordFlags);
+  /**
+   * Decides whether a dictionary entry with the given flags may be used for the analyzed form.
+   *
+   * <p>With a {@code null} original case the entry is accepted unless it has a hidden flag. With
+   * a non-null original case, an entry without KEEPCASE is accepted; an entry with KEEPCASE is
+   * accepted only when the dictionary enables CHECKSHARPS, the original case is TITLE and the
+   * given range of {@code word} contains 'ß'.
+   */
+  private boolean acceptCase(
+      WordCase originalCase, char[] wordFlags, char[] word, int offset, int length) {
+    boolean mustKeepCase = Dictionary.hasFlag(wordFlags, dictionary.keepcase);
+    if (originalCase == null) {
+      return !Dictionary.hasHiddenFlag(wordFlags);
+    }
+    if (!mustKeepCase) {
+      return true;
+    }
+    return dictionary.checkSharpS
+        && originalCase == WordCase.TITLE
+        && containsSharpS(word, offset, length);
+  }
+
+  /**
+   * Tells whether 'ß' occurs among the {@code length} chars of {@code word} that start at
+   * {@code offset}.
+   */
+  private boolean containsSharpS(char[] word, int offset, int length) {
+    int end = offset + length;
+    for (int pos = offset; pos < end; pos++) {
+      if (word[pos] == 'ß') {
+        return true;
+      }
+    }
+    return false;
  }

  /**
@ -302,8 +372,8 @@ final class Stemmer {
   *     (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
   * @param circumfix true if the previous prefix removal was signed as a circumfix this means inner
   *     most suffix must also contain circumfix flag.
-   * @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag
-   *     it cannot succeed.
+   * @param originalCase if non-null, represents original word case to disallow case variations of
+   *     word with KEEPCASE flags
   * @return List of stems, or empty list if no stems are found
   */
  private List<CharsRef> stem(
@ -319,7 +389,7 @@ final class Stemmer {
      boolean doSuffix,
      boolean previousWasPrefix,
      boolean circumfix,
-      boolean caseVariant)
+      WordCase originalCase)
      throws IOException {

    // TODO: allow this stuff to be reused by tokenfilter
@ -371,7 +441,7 @@ final class Stemmer {
                    recursionDepth,
                    true,
                    circumfix,
-                    caseVariant));
+                    originalCase));
          }
        }
      }
@ -424,7 +494,7 @@ final class Stemmer {
                    recursionDepth,
                    false,
                    circumfix,
-                    caseVariant));
+                    originalCase));
          }
        }
      }
@ -555,7 +625,7 @@ final class Stemmer {
      int recursionDepth,
      boolean prefix,
      boolean circumfix,
-      boolean caseVariant)
+      WordCase originalCase)
      throws IOException {
    char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);

@ -589,7 +659,7 @@ final class Stemmer {
          }

          // we are looking for a case variant, but this word does not allow it
-          if (!acceptCase(caseVariant, wordFlags)) {
+          if (!acceptCase(originalCase, wordFlags, strippedWord, offset, length)) {
            continue;
          }
          if (!context.isCompound() && Dictionary.hasFlag(wordFlags, dictionary.onlyincompound)) {
@ -654,7 +724,7 @@ final class Stemmer {
              true,
              prefix,
              circumfix,
-              caseVariant));
+              originalCase));
    }

    return stems;
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/WordCase.java
@ -28,9 +28,9 @@ enum WordCase {
    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
-      char ch = word[i];
-      seenUpper = seenUpper || Character.isUpperCase(ch);
-      seenLower = seenLower || Character.isLowerCase(ch);
+      CharCase cc = charCase(word[i]);
+      seenUpper = seenUpper || cc == CharCase.UPPER;
+      seenLower = seenLower || cc == CharCase.LOWER;
      if (seenUpper && seenLower) break;
    }

@ -43,9 +43,9 @@ enum WordCase {
    boolean seenUpper = false;
    boolean seenLower = false;
    for (int i = 1; i < length; i++) {
-      char ch = word.charAt(i);
-      seenUpper = seenUpper || Character.isUpperCase(ch);
-      seenLower = seenLower || Character.isLowerCase(ch);
+      CharCase cc = charCase(word.charAt(i));
+      seenUpper = seenUpper || cc == CharCase.UPPER;
+      seenLower = seenLower || cc == CharCase.LOWER;
      if (seenUpper && seenLower) break;
    }

@ -58,4 +58,20 @@ enum WordCase {
    }
    return seenUpper ? MIXED : LOWER;
  }
+
+  /**
+   * Classifies a single char for word-case detection. A char counts as lower case only when
+   * upper-casing it yields a different char; every other non-upper-case char is neutral.
+   */
+  private static CharCase charCase(char c) {
+    if (Character.isUpperCase(c)) {
+      return CharCase.UPPER;
+    }
+    boolean lowerWithDistinctUpper = Character.isLowerCase(c) && c != Character.toUpperCase(c);
+    return lowerWithDistinctUpper ? CharCase.LOWER : CharCase.NEUTRAL;
+  }
+
+  /** Case category of a single char, as assigned by {@code charCase}. */
+  private enum CharCase {
+    // Character.isUpperCase reports true
+    UPPER,
+    // Character.isLowerCase reports true and Character.toUpperCase maps to a different char
+    LOWER,
+    // everything else
+    NEUTRAL
+  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/SpellCheckerTest.java
@ -41,6 +41,11 @@ public class SpellCheckerTest extends StemmerTestBase {
    doTest("allcaps");
  }

+  // Runs the "checksharps" fixture; presumably doTest loads the checksharps.* resources and
+  // verifies the .good/.wrong word lists — confirm against doTest.
+  @Test
+  public void checkSharpS() throws Exception {
+    doTest("checksharps");
+  }
+
  @Test
  public void IJ() throws Exception {
    doTest("IJ");
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCheckSharpS.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCheckSharpS.java
@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import org.junit.BeforeClass;
+
+/** Stemming checks against the checksharps dictionary, whose affix file enables CHECKSHARPS. */
+public class TestCheckSharpS extends StemmerTestBase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    init("checksharps.aff", "checksharps.dic");
+  }
+
+  public void testSharpS() {
+    // "müßig" carries the KEEPCASE flag (k) in checksharps.dic
+    assertStemsTo("Müßig", "müßig");
+    assertStemsTo("MÜSSIG", "müßig");
+    // no expected stems are passed here — presumably asserts that none are found
+    assertStemsTo("Müssig");
+    assertStemsTo("PROZESSIONSSTRASSE", "Prozessionsstraße");
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.aff
@ -0,0 +1,4 @@
+# test ß - SS special capitalizing
+CHECKSHARPS
+WORDCHARS ß.
+KEEPCASE k
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.dic
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.dic
@ -0,0 +1,7 @@
+6
+müßig/k
+Ausstoß
+Abstoß.
+Außenabmessung
+Prozessionsstraße
+Außenmaße
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.good
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.good
@ -0,0 +1,13 @@
+müßig
+Müßig
+MÜSSIG
+Ausstoß
+Abstoß.
+Außenabmessung
+Prozessionsstraße
+Außenmaße
+AUSSTOSS
+ABSTOSS.
+AUSSENABMESSUNG
+PROZESSIONSSTRASSE
+AUSSENMASSE
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.wrong
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/checksharps.wrong
@ -0,0 +1,2 @@
+MÜßIG
+Müssig