LUCENE-9692: Hunspell: extract Stemmer.stripAffix from similar code in prefix/suffix processing (#2237)

2021-01-25 09:11:11 +01:00 · 2021-01-25 09:11:11 +01:00 · e4ec3e3919
parent f64e7cbbda
commit e4ec3e3919
1 changed files with 37 additions and 44 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -26,7 +26,6 @@ import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Outputs;
 /**
 * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word.
@ -305,11 +304,10 @@ final class Stemmer {
    if (doPrefix && dictionary.prefixes != null) {
      FST<IntsRef> fst = dictionary.prefixes;
      Outputs<IntsRef> outputs = fst.outputs;
      FST.BytesReader bytesReader = prefixReaders[recursionDepth];
      FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
      fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = outputs.getNoOutput();
+      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
      IntsRef output = NO_OUTPUT;
      int limit = dictionary.fullStrip ? length + 1 : length;
      for (int i = 0; i < limit; i++) {
@ -333,23 +331,12 @@ final class Stemmer {
          }
          if (isAffixCompatible(prefix, prevFlag, recursionDepth, false)) {
-            int deAffixedLength = length - i;
+            char[] strippedWord = stripAffix(word, length, i, prefix, true);
-
+            if (strippedWord == null) {
            int stripOrd = dictionary.affixData(prefix, Dictionary.AFFIX_STRIP_ORD);
            int stripStart = dictionary.stripOffsets[stripOrd];
            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
            int stripLength = stripEnd - stripStart;
            if (!checkCondition(
                prefix, dictionary.stripData, stripStart, stripLength, word, i, deAffixedLength)) {
              continue;
            }
-            char[] strippedWord = new char[stripLength + deAffixedLength];
+            stems.addAll(
            System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
            System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
            List<CharsRef> stemList =
                applyAffix(
                    strippedWord,
                    strippedWord.length,
@ -358,9 +345,7 @@ final class Stemmer {
                    recursionDepth,
                    true,
                    circumfix,
-                    caseVariant);
+                    caseVariant));
            stems.addAll(stemList);
          }
        }
      }
@ -368,11 +353,10 @@ final class Stemmer {
    if (doSuffix && dictionary.suffixes != null) {
      FST<IntsRef> fst = dictionary.suffixes;
      Outputs<IntsRef> outputs = fst.outputs;
      FST.BytesReader bytesReader = suffixReaders[recursionDepth];
      FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
      fst.getFirstArc(arc);
-      IntsRef NO_OUTPUT = outputs.getNoOutput();
+      IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
      IntsRef output = NO_OUTPUT;
      int limit = dictionary.fullStrip ? 0 : 1;
      for (int i = length; i >= limit; i--) {
@ -396,25 +380,12 @@ final class Stemmer {
          }
          if (isAffixCompatible(suffix, prevFlag, recursionDepth, previousWasPrefix)) {
-            int appendLength = length - i;
+            char[] strippedWord = stripAffix(word, length, length - i, suffix, false);
-            int deAffixedLength = length - appendLength;
+            if (strippedWord == null) {
            int stripOrd = dictionary.affixData(suffix, Dictionary.AFFIX_STRIP_ORD);
            int stripStart = dictionary.stripOffsets[stripOrd];
            int stripEnd = dictionary.stripOffsets[stripOrd + 1];
            int stripLength = stripEnd - stripStart;
            if (!checkCondition(
                suffix, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
              continue;
            }
-            char[] strippedWord = new char[stripLength + deAffixedLength];
+            stems.addAll(
            System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
            System.arraycopy(
                dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
            List<CharsRef> stemList =
                applyAffix(
                    strippedWord,
                    strippedWord.length,
@ -423,9 +394,7 @@ final class Stemmer {
                    recursionDepth,
                    false,
                    circumfix,
-                    caseVariant);
+                    caseVariant));
            stems.addAll(stemList);
          }
        }
      }
@ -434,6 +403,30 @@ final class Stemmer {
    return stems;
  }
  /**
   * Removes an affix from {@code word} and puts the affix's strip characters in its place.
   *
   * <p>The strip characters are the slice of {@code dictionary.stripData} delimited by two
   * consecutive entries of {@code dictionary.stripOffsets}, selected through the affix's {@code
   * AFFIX_STRIP_ORD} datum. For a prefix they are written in front of the remaining characters of
   * the word; for a suffix they are written after them.
   *
   * @param word buffer holding the word being processed
   * @param length number of characters of {@code word} to consider
   * @param affixLen number of characters of {@code word} taken up by the affix
   * @param affix affix identifier, as accepted by {@code dictionary.affixData}
   * @param isPrefix {@code true} to strip from the start of the word, {@code false} to strip from
   *     its end
   * @return a newly allocated array containing the word with the affix replaced by the strip
   *     characters, or {@code null} if {@code checkCondition} rejects this affix for the word
   */
  private char[] stripAffix(char[] word, int length, int affixLen, int affix, boolean isPrefix) {
    int remainderLen = length - affixLen;

    // Locate this affix's strip characters inside the shared strip buffer.
    int ord = dictionary.affixData(affix, Dictionary.AFFIX_STRIP_ORD);
    int stripFrom = dictionary.stripOffsets[ord];
    int stripTo = dictionary.stripOffsets[ord + 1];
    int stripSize = stripTo - stripFrom;
    char[] strip = dictionary.stripData;

    // The condition is evaluated over the two pieces in the order they appear in the result:
    // strip characters then remainder for a prefix, remainder then strip characters for a suffix.
    boolean accepted;
    if (isPrefix) {
      accepted = checkCondition(affix, strip, stripFrom, stripSize, word, affixLen, remainderLen);
    } else {
      accepted = checkCondition(affix, word, 0, remainderLen, strip, stripFrom, stripSize);
    }
    if (!accepted) {
      return null;
    }

    char[] result = new char[stripSize + remainderLen];
    if (isPrefix) {
      // Result layout: [strip characters][word without its leading affix]
      System.arraycopy(word, affixLen, result, stripSize, remainderLen);
      System.arraycopy(strip, stripFrom, result, 0, stripSize);
    } else {
      // Result layout: [word without its trailing affix][strip characters]
      System.arraycopy(word, 0, result, 0, remainderLen);
      System.arraycopy(strip, stripFrom, result, remainderLen, stripSize);
    }
    return result;
  }
  private boolean isAffixCompatible(
      int affix, int prevFlag, int recursionDepth, boolean previousWasPrefix) {
    int append = dictionary.affixData(affix, Dictionary.AFFIX_APPEND);
@ -495,9 +488,9 @@ final class Stemmer {
   * @param strippedWord Word the affix has been removed and the strip added
   * @param length valid length of stripped word
   * @param affix HunspellAffix representing the affix rule itself
-   * @param prefixId when we already stripped a prefix, we cant simply recurse and check the suffix,
+   * @param prefixId when we already stripped a prefix, we can't simply recurse and check the
-   *     unless both are compatible so we must check dictionary form against both to add it as a
+   *     suffix, unless both are compatible so we must check dictionary form against both to add it
-   *     stem!
+   *     as a stem!
   * @param recursionDepth current recursion depth
   * @param prefix true if we are removing a prefix (false if it's a suffix)
   * @return List of stems for the word, or an empty list if none are found