LUCENE-9726: Hunspell: speed up spellchecking by stopping at a single stem (#2295)

2021-02-04 09:13:11 +01:00 · 2021-02-04 09:13:11 +01:00 · 04167b27f5
parent e2cf6ee74d
commit 04167b27f5
3 changed files with 170 additions and 131 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
@@ -16,7 +16,6 @@
 */
 package org.apache.lucene.analysis.hunspell;

-import java.util.List;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
@@ -58,10 +57,9 @@ class CheckCompoundPattern {
  }

  boolean prohibitsCompounding(
-      CharsRef word, int breakPos, List<CharsRef> stemsBefore, List<CharsRef> stemsAfter) {
+      CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
    if (isNonAffixedPattern(endChars)) {
-      if (stemsBefore.stream()
-          .noneMatch(stem -> charsMatch(word, breakPos - stem.length, stem.chars))) {
+      if (!charsMatch(word, breakPos - stemBefore.length, stemBefore.chars)) {
        return false;
      }
    } else if (!charsMatch(word, breakPos - endChars.length, endChars)) {
@@ -69,18 +67,18 @@ class CheckCompoundPattern {
    }

    if (isNonAffixedPattern(beginChars)) {
-      if (stemsAfter.stream().noneMatch(stem -> charsMatch(word, breakPos, stem.chars))) {
+      if (!charsMatch(word, breakPos, stemAfter.chars)) {
        return false;
      }
    } else if (!charsMatch(word, breakPos, beginChars)) {
      return false;
    }

-    if (endFlags.length > 0 && !hasStemWithFlags(stemsBefore, endFlags)) {
+    if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
      return false;
    }
    //noinspection RedundantIfStatement
-    if (beginFlags.length > 0 && !hasStemWithFlags(stemsAfter, beginFlags)) {
+    if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
      return false;
    }

@@ -91,14 +89,9 @@ class CheckCompoundPattern {
    return pattern.length == 1 && pattern[0] == '0';
  }

-  private boolean hasStemWithFlags(List<CharsRef> stems, char[] flags) {
-    for (CharsRef stem : stems) {
-      IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
-      if (forms != null && hasAllFlags(flags, forms)) {
-        return true;
-      }
-    }
-    return false;
+  private boolean stemHasFlags(CharsRef stem, char[] flags) {
+    IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
+    return forms != null && hasAllFlags(flags, forms);
  }

  private boolean hasAllFlags(char[] flags, IntsRef forms) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/SpellChecker.java
@@ -140,7 +140,7 @@ public class SpellChecker {
      return false;
    }

-    if (!stemmer.doStem(wordChars, 0, length, originalCase, SIMPLE_WORD).isEmpty()) {
+    if (findStem(wordChars, 0, length, originalCase, SIMPLE_WORD) != null) {
      return true;
    }

@@ -156,8 +156,24 @@ public class SpellChecker {
    return false;
  }

+  private CharsRef findStem(
+      char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
+    CharsRef[] result = {null};
+    stemmer.doStem(
+        wordChars,
+        offset,
+        length,
+        originalCase,
+        context,
+        (stem, forms, formID) -> {
+          result[0] = stem;
+          return false;
+        });
+    return result[0];
+  }
+
  private boolean checkCompounds(
-      CharsRef word, WordCase originalCase, int depth, Predicate<List<CharsRef>> checkPatterns) {
+      CharsRef word, WordCase originalCase, int depth, Predicate<CharsRef> checkPatterns) {
    if (depth > dictionary.compoundMax - 2) return false;

    int limit = word.length - dictionary.compoundMin + 1;
@@ -165,16 +181,15 @@ public class SpellChecker {
      WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
      int breakOffset = word.offset + breakPos;
      if (mayBreakIntoCompounds(word.chars, word.offset, word.length, breakOffset)) {
-        List<CharsRef> stems =
-            stemmer.doStem(word.chars, word.offset, breakPos, originalCase, context);
-        if (stems.isEmpty()
+        CharsRef stem = findStem(word.chars, word.offset, breakPos, originalCase, context);
+        if (stem == null
            && dictionary.simplifiedTriple
            && word.chars[breakOffset - 1] == word.chars[breakOffset]) {
-          stems = stemmer.doStem(word.chars, word.offset, breakPos + 1, originalCase, context);
+          stem = findStem(word.chars, word.offset, breakPos + 1, originalCase, context);
        }
-        if (!stems.isEmpty() && checkPatterns.test(stems)) {
-          Predicate<List<CharsRef>> nextCheck = checkNextPatterns(word, breakPos, stems);
-          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stems, nextCheck)) {
+        if (stem != null && checkPatterns.test(stem)) {
+          Predicate<CharsRef> nextCheck = checkNextPatterns(word, breakPos, stem);
+          if (checkCompoundsAfter(word, breakPos, originalCase, depth, stem, nextCheck)) {
            return true;
          }
        }
@@ -195,12 +210,11 @@ public class SpellChecker {
      if (expanded != null) {
        WordContext context = depth == 0 ? COMPOUND_BEGIN : COMPOUND_MIDDLE;
        int breakPos = pos + pattern.endLength();
-        List<CharsRef> stems =
-            stemmer.doStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
-        if (!stems.isEmpty()) {
-          Predicate<List<CharsRef>> nextCheck =
-              next -> pattern.prohibitsCompounding(expanded, breakPos, stems, next);
-          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stems, nextCheck)) {
+        CharsRef stem = findStem(expanded.chars, expanded.offset, breakPos, originalCase, context);
+        if (stem != null) {
+          Predicate<CharsRef> nextCheck =
+              next -> pattern.prohibitsCompounding(expanded, breakPos, stem, next);
+          if (checkCompoundsAfter(expanded, breakPos, originalCase, depth, stem, nextCheck)) {
            return true;
          }
        }
@@ -209,11 +223,10 @@ public class SpellChecker {
    return false;
  }

-  private Predicate<List<CharsRef>> checkNextPatterns(
-      CharsRef word, int breakPos, List<CharsRef> stems) {
-    return nextStems ->
+  private Predicate<CharsRef> checkNextPatterns(CharsRef word, int breakPos, CharsRef stems) {
+    return nextStem ->
        dictionary.checkCompoundPatterns.stream()
-            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStems));
+            .noneMatch(p -> p.prohibitsCompounding(word, breakPos, stems, nextStem));
  }

  private boolean checkCompoundsAfter(
@@ -221,16 +234,16 @@ public class SpellChecker {
      int breakPos,
      WordCase originalCase,
      int depth,
-      List<CharsRef> prevStems,
-      Predicate<List<CharsRef>> checkPatterns) {
+      CharsRef prevStem,
+      Predicate<CharsRef> checkPatterns) {
    int remainingLength = word.length - breakPos;
    int breakOffset = word.offset + breakPos;
-    List<CharsRef> tailStems =
-        stemmer.doStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
-    if (!tailStems.isEmpty()
-        && !(dictionary.checkCompoundDup && intersectIgnoreCase(prevStems, tailStems))
+    CharsRef tailStem =
+        findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
+    if (tailStem != null
+        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prevStem, tailStem))
        && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
-        && checkPatterns.test(tailStems)) {
+        && checkPatterns.test(tailStem)) {
      return true;
    }

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -100,24 +100,41 @@ final class Stemmer {
      return Collections.emptyList();
    }

+    List<CharsRef> list = new ArrayList<>();
+    RootProcessor processor =
+        (stem, forms, formID) -> {
+          list.add(newStem(stem, forms, formID));
+          return true;
+        };
+
+    if (!doStem(word, 0, length, null, WordContext.SIMPLE_WORD, processor)) {
+      return list;
+    }
+
    WordCase wordCase = caseOf(word, length);
-    List<CharsRef> list = doStem(word, 0, length, null, WordContext.SIMPLE_WORD);
    if (wordCase == WordCase.UPPER || wordCase == WordCase.TITLE) {
-      addCaseVariations(word, length, wordCase, list);
+      addCaseVariations(word, length, wordCase, processor);
    }
    return list;
  }

-  private void addCaseVariations(char[] word, int length, WordCase wordCase, List<CharsRef> list) {
+  private void addCaseVariations(
+      char[] word, int length, WordCase wordCase, RootProcessor processor) {
    if (wordCase == WordCase.UPPER) {
      caseFoldTitle(word, length);
      char[] aposCase = capitalizeAfterApostrophe(titleBuffer, length);
      if (aposCase != null) {
-        list.addAll(doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD));
+        if (!doStem(aposCase, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
+          return;
+        }
+      }
+      if (!doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
+        return;
      }
-      list.addAll(doStem(titleBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
      for (char[] variation : sharpSVariations(titleBuffer, length)) {
-        list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
+        if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
+          return;
+        }
      }
    }

@@ -126,10 +143,14 @@ final class Stemmer {
    }

    caseFoldLower(wordCase == WordCase.UPPER ? titleBuffer : word, length);
-    list.addAll(doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD));
+    if (!doStem(lowerBuffer, 0, length, wordCase, WordContext.SIMPLE_WORD, processor)) {
+      return;
+    }
    if (wordCase == WordCase.UPPER) {
      for (char[] variation : sharpSVariations(lowerBuffer, length)) {
-        list.addAll(doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD));
+        if (!doStem(variation, 0, variation.length, null, WordContext.SIMPLE_WORD, processor)) {
+          return;
+        }
      }
    }
  }
@@ -220,9 +241,13 @@ final class Stemmer {
    return result.filter(s -> !s.equals(src)).map(String::toCharArray).collect(Collectors.toList());
  }

-  List<CharsRef> doStem(
-      char[] word, int offset, int length, WordCase originalCase, WordContext context) {
-    List<CharsRef> stems = new ArrayList<>();
+  boolean doStem(
+      char[] word,
+      int offset,
+      int length,
+      WordCase originalCase,
+      WordContext context,
+      RootProcessor processor) {
    IntsRef forms = dictionary.lookupWord(word, offset, length);
    if (forms != null) {
      for (int i = 0; i < forms.length; i += formStep) {
@@ -241,36 +266,37 @@ final class Stemmer {
        if (context.isCompound()) {
          if (context != WordContext.COMPOUND_END
              && Dictionary.hasFlag(wordFlags, dictionary.compoundForbid)) {
-            return new ArrayList<>();
+            return false;
          }
          if (!Dictionary.hasFlag(wordFlags, dictionary.compoundFlag)
              && !Dictionary.hasFlag(wordFlags, context.requiredFlag(dictionary))) {
            continue;
          }
        }
-        stems.add(newStem(word, offset, length, forms, i));
+        if (!processor.processRoot(new CharsRef(word, offset, length), forms, i)) {
+          return false;
+        }
      }
    }
    try {
-      stems.addAll(
-          stem(
-              word,
-              offset,
-              length,
-              context,
-              -1,
-              Dictionary.FLAG_UNSET,
-              -1,
-              0,
-              true,
-              true,
-              false,
-              false,
-              originalCase));
+      return stem(
+          word,
+          offset,
+          length,
+          context,
+          -1,
+          Dictionary.FLAG_UNSET,
+          -1,
+          0,
+          true,
+          true,
+          false,
+          false,
+          originalCase,
+          processor);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
-    return stems;
  }

  private boolean acceptCase(
@@ -319,7 +345,12 @@ final class Stemmer {
    return deduped;
  }

-  private CharsRef newStem(char[] buffer, int offset, int length, IntsRef forms, int formID) {
+  interface RootProcessor {
+    /** @return whether the processing should be continued */
+    boolean processRoot(CharsRef stem, IntsRef forms, int formID);
+  }
+
+  private CharsRef newStem(CharsRef stem, IntsRef forms, int formID) {
    final String exception;
    if (dictionary.hasStemExceptions) {
      int exceptionID = forms.ints[forms.offset + formID + 1];
@@ -337,7 +368,7 @@ final class Stemmer {
      if (exception != null) {
        scratchSegment.append(exception);
      } else {
-        scratchSegment.append(buffer, offset, length);
+        scratchSegment.append(stem.chars, stem.offset, stem.length);
      }
      try {
        Dictionary.applyMappings(dictionary.oconv, scratchSegment);
@@ -351,7 +382,7 @@ final class Stemmer {
      if (exception != null) {
        return new CharsRef(exception);
      } else {
-        return new CharsRef(buffer, offset, length);
+        return stem;
      }
    }
  }
@@ -385,9 +416,9 @@ final class Stemmer {
   *     most suffix must also contain circumfix flag.
   * @param originalCase if non-null, represents original word case to disallow case variations of
   *     word with KEEPCASE flags
-   * @return List of stems, or empty list if no stems are found
+   * @return whether the processing should be continued
   */
-  private List<CharsRef> stem(
+  private boolean stem(
      char[] word,
      int offset,
      int length,
@@ -400,12 +431,9 @@ final class Stemmer {
      boolean doSuffix,
      boolean previousWasPrefix,
      boolean circumfix,
-      WordCase originalCase)
+      WordCase originalCase,
+      RootProcessor processor)
      throws IOException {
-
-    // TODO: allow this stuff to be reused by tokenfilter
-    List<CharsRef> stems = new ArrayList<>();
-
    if (doPrefix && dictionary.prefixes != null) {
      FST<IntsRef> fst = dictionary.prefixes;
      FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
@@ -440,19 +468,21 @@ final class Stemmer {
            }

            boolean pureAffix = strippedWord == word;
-            stems.addAll(
-                applyAffix(
-                    strippedWord,
-                    pureAffix ? offset + i : 0,
-                    pureAffix ? length - i : strippedWord.length,
-                    context,
-                    prefix,
-                    previous,
-                    -1,
-                    recursionDepth,
-                    true,
-                    circumfix,
-                    originalCase));
+            if (!applyAffix(
+                strippedWord,
+                pureAffix ? offset + i : 0,
+                pureAffix ? length - i : strippedWord.length,
+                context,
+                prefix,
+                previous,
+                -1,
+                recursionDepth,
+                true,
+                circumfix,
+                originalCase,
+                processor)) {
+              return false;
+            }
          }
        }
      }
@@ -493,25 +523,27 @@ final class Stemmer {
            }

            boolean pureAffix = strippedWord == word;
-            stems.addAll(
-                applyAffix(
-                    strippedWord,
-                    pureAffix ? offset : 0,
-                    pureAffix ? i : strippedWord.length,
-                    context,
-                    suffix,
-                    previous,
-                    prefixId,
-                    recursionDepth,
-                    false,
-                    circumfix,
-                    originalCase));
+            if (!applyAffix(
+                strippedWord,
+                pureAffix ? offset : 0,
+                pureAffix ? i : strippedWord.length,
+                context,
+                suffix,
+                previous,
+                prefixId,
+                recursionDepth,
+                false,
+                circumfix,
+                originalCase,
+                processor)) {
+              return false;
+            }
          }
        }
      }
    }

-    return stems;
+    return true;
  }

  /**
@@ -632,9 +664,9 @@ final class Stemmer {
   *     as a stem!
   * @param recursionDepth current recursion depth
   * @param prefix true if we are removing a prefix (false if it's a suffix)
-   * @return List of stems for the word, or an empty list if none are found
+   * @return whether the processing should be continued
   */
-  private List<CharsRef> applyAffix(
+  private boolean applyAffix(
      char[] strippedWord,
      int offset,
      int length,
@@ -645,12 +677,11 @@ final class Stemmer {
      int recursionDepth,
      boolean prefix,
      boolean circumfix,
-      WordCase originalCase)
+      WordCase originalCase,
+      RootProcessor processor)
      throws IOException {
    char flag = dictionary.affixData(affix, Dictionary.AFFIX_FLAG);

-    List<CharsRef> stems = new ArrayList<>();
-
    boolean skipLookup = needsAnotherAffix(affix, previousAffix, !prefix);
    IntsRef forms = skipLookup ? null : dictionary.lookupWord(strippedWord, offset, length);
    if (forms != null) {
@@ -694,7 +725,9 @@ final class Stemmer {
              continue;
            }
          }
-          stems.add(newStem(strippedWord, offset, length, forms, i));
+          if (!processor.processRoot(new CharsRef(strippedWord, offset, length), forms, i)) {
+            return false;
+          }
        }
      }
    }
@@ -720,7 +753,7 @@ final class Stemmer {
          // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
          // COMPLEXPREFIXES = false: combine with another suffix
        } else {
-          return stems;
+          return true;
        }
      } else {
        doPrefix = false;
@@ -728,29 +761,29 @@ final class Stemmer {
          prefixId = affix;
          // we took away the second prefix: go look for another suffix
        } else if (prefix || dictionary.complexPrefixes || !dictionary.twoStageAffix) {
-          return stems;
+          return true;
        }
        // we took away a prefix, then a suffix: go look for another suffix
      }

-      stems.addAll(
-          stem(
-              strippedWord,
-              offset,
-              length,
-              context,
-              affix,
-              flag,
-              prefixId,
-              recursionDepth + 1,
-              doPrefix,
-              true,
-              prefix,
-              circumfix,
-              originalCase));
+      return stem(
+          strippedWord,
+          offset,
+          length,
+          context,
+          affix,
+          flag,
+          prefixId,
+          recursionDepth + 1,
+          doPrefix,
+          true,
+          prefix,
+          circumfix,
+          originalCase,
+          processor);
    }

-    return stems;
+    return true;
  }

  private boolean needsAnotherAffix(int affix, int previousAffix, boolean isSuffix) {