LUCENE-9746: Hunspell: unify case variation logic in Stemmer and SpellChecker (#2322)

2021-02-08 21:37:32 +01:00 · 2021-02-08 21:37:32 +01:00 · 80803eb9ad
parent d0b4ef66d7
commit 80803eb9ad
2 changed files with 40 additions and 70 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -73,9 +73,13 @@ public class SpellChecker {
    }

    WordCase wc = stemmer.caseOf(wordChars, wordChars.length);
-    if ((wc == WordCase.UPPER || wc == WordCase.TITLE) && checkCaseVariants(wordChars, wc)) {
+    if ((wc == WordCase.UPPER || wc == WordCase.TITLE)) {
+      Stemmer.CaseVariationProcessor variationProcessor =
+          (variant, varLength, originalCase) -> !checkWord(variant, varLength, originalCase);
+      if (!stemmer.varyCase(wordChars, wordChars.length, wc, variationProcessor)) {
        return true;
      }
+    }

    if (dictionary.breaks.isNotEmpty() && !hasTooManyBreakOccurrences(word)) {
      return tryBreaks(word);
@@ -92,42 +96,6 @@ public class SpellChecker {
    return spellClean(word.substring(0, length)) || spellClean(word.substring(0, length + 1));
  }

-  private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
-    char[] caseVariant = wordChars;
-    if (wordCase == WordCase.UPPER) {
-      caseVariant = stemmer.caseFoldTitle(caseVariant, wordChars.length);
-      if (checkWord(caseVariant, wordChars.length, wordCase)) {
-        return true;
-      }
-      char[] aposCase = Stemmer.capitalizeAfterApostrophe(caseVariant, wordChars.length);
-      if (aposCase != null && checkWord(aposCase, aposCase.length, wordCase)) {
-        return true;
-      }
-      for (char[] variation : stemmer.sharpSVariations(caseVariant, wordChars.length)) {
-        if (checkWord(variation, variation.length, null)) {
-          return true;
-        }
-      }
-    }
-
-    if (dictionary.isDotICaseChangeDisallowed(wordChars)) {
-      return false;
-    }
-
-    char[] lower = stemmer.caseFoldLower(caseVariant, wordChars.length);
-    if (checkWord(lower, wordChars.length, wordCase)) {
-      return true;
-    }
-    if (wordCase == WordCase.UPPER) {
-      for (char[] variation : stemmer.sharpSVariations(lower, wordChars.length)) {
-        if (checkWord(variation, variation.length, null)) {
-          return true;
-        }
-      }
-    }
-    return false;
-  }
-
  boolean checkWord(String word) {
    return checkWord(word.toCharArray(), word.length(), null);
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -111,46 +111,47 @@ final class Stemmer {

    WordCase wordCase = caseOf(word, length);
    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
-      addCaseVariations(word, length, wordCase, processor);
+      CaseVariationProcessor variationProcessor =
+          (variant, varLength, originalCase) ->
+              doStem(variant, 0, varLength, originalCase, WordContext.SIMPLE_WORD, processor);
+      varyCase(word, length, wordCase, variationProcessor);
    }
    return list;
  }

-  private void addCaseVariations(
-      char[] word, int length, WordCase wordCase, RootProcessor processor) {
+  interface CaseVariationProcessor {
+    boolean process(char[] word, int length, WordCase originalCase);
+  }
+
+  boolean varyCase(char[] word, int length, WordCase wordCase, CaseVariationProcessor processor) {
    if (wordCase == WordCase.UPPER) {
      caseFoldTitle(word, length);
      char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
-      if (aposCase != null) {
-        if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
-          return;
+      if (aposCase != null && !processor.process(aposCase, length, wordCase)) {
+        return false;
      }
+      if (!processor.process(titleBuffer, length, wordCase)) {
+        return false;
      }
-      if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
-        return;
-      }
-      for (char[] variation : sharpSVariations(titleBuffer, length)) {
-        if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
-          return;
-        }
+      if (dictionary.checkSharpS && !varySharpS(titleBuffer, length, processor)) {
+        return false;
      }
    }

    if (dictionary.isDotICaseChangeDisallowed(word)) {
-      return;
+      return true;
    }

    caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
-    if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
-      return;
-    }
-    if (wordCase == WordCase.UPPER) {
-      for (char[] variation : sharpSVariations(lowerBuffer, length)) {
-        if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
-          return;
-        }
+    if (!processor.process(lowerBuffer, length, wordCase)) {
+      return false;
    }
+    if (wordCase == WordCase.UPPER
+        && dictionary.checkSharpS
+        && !varySharpS(lowerBuffer, length, processor)) {
+      return false;
    }
+    return true;
  }

  // temporary buffers for case variants
@@ -167,26 +168,24 @@ final class Stemmer {
  }

  /** folds titlecase variant of word to titleBuffer */
-  char[] caseFoldTitle(char[] word, int length) {
+  private void caseFoldTitle(char[] word, int length) {
    titleBuffer = ArrayUtil.grow(titleBuffer, length);
    System.arraycopy(word, 0, titleBuffer, 0, length);
    for (int i = 1; i < length; i++) {
      titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
    }
-    return titleBuffer;
  }

  /** folds lowercase variant of word (title cased) to lowerBuffer */
-  char[] caseFoldLower(char[] word, int length) {
+  private void caseFoldLower(char[] word, int length) {
    lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
    System.arraycopy(word, 0, lowerBuffer, 0, length);
    lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
-    return lowerBuffer;
  }

  // Special prefix handling for Catalan, French, Italian:
  // prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
-  static char[] capitalizeAfterApostrophe(char[] word, int length) {
+  private static char[] capitalizeAfterApostrophe(char[] word, int length) {
    for (int i = 1; i < length - 1; i++) {
      if (word[i] == '\'') {
        char next = word[i + 1];
@@ -201,9 +200,7 @@ final class Stemmer {
    return null;
  }

-  List<char[]> sharpSVariations(char[] word, int length) {
-    if (!dictionary.checkSharpS) return Collections.emptyList();
-
+  private boolean varySharpS(char[] word, int length, CaseVariationProcessor processor) {
    Stream<String> result =
        new Object() {
          int findSS(int start) {
@@ -233,10 +230,15 @@ final class Stemmer {
            }
          }
        }.replaceSS(0, 0);
-    if (result == null) return Collections.emptyList();
+    if (result == null) return true;

    String src = new String(word, 0, length);
-    return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
+    for (String s : result.collect(Collectors.toList())) {
+      if (!s.equals(src) && !processor.process(s.toCharArray(), s.length(), null)) {
+        return false;
+      }
+    }
+    return true;
  }

  boolean doStem(