LUCENE-9771: Hunspell: don't lookup word roots unnecessarily to check flags (#2369)

2021-02-15 20:21:44 +01:00 · 2021-02-15 20:21:44 +01:00 · ef920388e6
parent 1ff11dd02c
commit ef920388e6
2 changed files with 22 additions and 36 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/CheckCompoundPattern.java
@@ -17,7 +17,6 @@
 package org.apache.lucene.analysis.hunspell;

 import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.IntsRef;

 class CheckCompoundPattern {
  private final String endChars;
@@ -51,10 +50,9 @@ class CheckCompoundPattern {
    return endChars + " " + beginChars + (replacement == null ? "" : " -> " + replacement);
  }

-  boolean prohibitsCompounding(
-      CharsRef word, int breakPos, CharsRef stemBefore, CharsRef stemAfter) {
+  boolean prohibitsCompounding(CharsRef word, int breakPos, Root<?> rootBefore, Root<?> rootAfter) {
    if (isNonAffixedPattern(endChars)) {
-      if (!charsMatch(word, breakPos - stemBefore.length, stemBefore)) {
+      if (!charsMatch(word, breakPos - rootBefore.word.length(), rootBefore.word)) {
        return false;
      }
    } else if (!charsMatch(word, breakPos - endChars.length(), endChars)) {
@@ -62,18 +60,18 @@ class CheckCompoundPattern {
    }

    if (isNonAffixedPattern(beginChars)) {
-      if (!charsMatch(word, breakPos, stemAfter)) {
+      if (!charsMatch(word, breakPos, rootAfter.word)) {
        return false;
      }
    } else if (!charsMatch(word, breakPos, beginChars)) {
      return false;
    }

-    if (endFlags.length > 0 && !stemHasFlags(stemBefore, endFlags)) {
+    if (endFlags.length > 0 && !hasAllFlags(rootBefore, endFlags)) {
      return false;
    }
    //noinspection RedundantIfStatement
-    if (beginFlags.length > 0 && !stemHasFlags(stemAfter, beginFlags)) {
+    if (beginFlags.length > 0 && !hasAllFlags(rootAfter, beginFlags)) {
      return false;
    }

@@ -84,14 +82,9 @@ class CheckCompoundPattern {
    return pattern.length() == 1 && pattern.charAt(0) == '0';
  }

-  private boolean stemHasFlags(CharsRef stem, char[] flags) {
-    IntsRef forms = dictionary.lookupWord(stem.chars, stem.offset, stem.length);
-    return forms != null && hasAllFlags(flags, forms);
-  }
-
-  private boolean hasAllFlags(char[] flags, IntsRef forms) {
+  private boolean hasAllFlags(Root<?> root, char[] flags) {
    for (char flag : flags) {
-      if (!dictionary.hasFlag(forms, flag)) {
+      if (!dictionary.hasFlag(root.entryId, flag)) {
        return false;
      }
    }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -226,13 +226,13 @@ public class Hunspell {
    int breakPos = prev.length;
    int remainingLength = word.length - breakPos;
    int breakOffset = word.offset + breakPos;
-    Root<CharsRef> tailStem =
+    Root<CharsRef> lastRoot =
        findStem(word.chars, breakOffset, remainingLength, originalCase, COMPOUND_END);
-    if (tailStem != null
-        && !dictionary.hasFlag(tailStem.entryId, dictionary.forbiddenword)
-        && !(dictionary.checkCompoundDup && equalsIgnoreCase(prev.stem, tailStem.word))
-        && !hasForceUCaseProblem(word.chars, breakOffset, remainingLength, originalCase)
-        && prev.mayCompound(tailStem, remainingLength, originalCase)) {
+    if (lastRoot != null
+        && !dictionary.hasFlag(lastRoot.entryId, dictionary.forbiddenword)
+        && !(dictionary.checkCompoundDup && prev.root.equals(lastRoot))
+        && !hasForceUCaseProblem(lastRoot, originalCase)
+        && prev.mayCompound(lastRoot, remainingLength, originalCase)) {
      return true;
    }

@@ -240,17 +240,9 @@ public class Hunspell {
    return checkCompounds(tail, originalCase, prev);
  }

-  private boolean hasForceUCaseProblem(
-      char[] chars, int offset, int length, WordCase originalCase) {
-    if (dictionary.forceUCase == FLAG_UNSET) return false;
+  private boolean hasForceUCaseProblem(Root<?> root, WordCase originalCase) {
    if (originalCase == WordCase.TITLE || originalCase == WordCase.UPPER) return false;
-
-    IntsRef forms = dictionary.lookupWord(chars, offset, length);
-    return forms != null && dictionary.hasFlag(forms, dictionary.forceUCase);
-  }
-
-  private boolean equalsIgnoreCase(CharSequence cr1, CharSequence cr2) {
-    return cr1.toString().equalsIgnoreCase(cr2.toString());
+    return dictionary.hasFlag(root.entryId, dictionary.forceUCase);
  }

  /**
@@ -274,19 +266,20 @@ public class Hunspell {
  private class CompoundPart {
    final CompoundPart prev;
    final int index, length;
-    final CharsRef tail, stem;
+    final CharsRef tail;
+    final Root<CharsRef> root;
    final CheckCompoundPattern enablingPattern;

    CompoundPart(
        CompoundPart prev,
        CharsRef tail,
        int length,
-        Root<CharsRef> stem,
+        Root<CharsRef> root,
        CheckCompoundPattern enabler) {
      this.prev = prev;
      this.tail = tail;
      this.length = length;
-      this.stem = stem.word;
+      this.root = root;
      index = prev == null ? 1 : prev.index + 1;
      enablingPattern = enabler;
    }
@@ -296,12 +289,12 @@ public class Hunspell {
      return (prev == null ? "" : prev + "+") + tail.subSequence(0, length);
    }

-    boolean mayCompound(Root<CharsRef> nextStem, int nextPartLength, WordCase originalCase) {
+    boolean mayCompound(Root<CharsRef> nextRoot, int nextPartLength, WordCase originalCase) {
      boolean patternsOk =
          enablingPattern != null
-              ? enablingPattern.prohibitsCompounding(tail, length, stem, nextStem.word)
+              ? enablingPattern.prohibitsCompounding(tail, length, root, nextRoot)
              : dictionary.checkCompoundPatterns.stream()
-                  .noneMatch(p -> p.prohibitsCompounding(tail, length, stem, nextStem.word));
+                  .noneMatch(p -> p.prohibitsCompounding(tail, length, root, nextRoot));
      if (!patternsOk) {
        return false;
      }