LUCENE-9806: Hunspell: speed up affix condition checking (#2423)

* LUCENE-9806: Hunspell: speed up affix condition checking: check only the stem beginning/end without strip/condition, not the whole candidate; avoid regexp if possible * hunspell: simplify AffixCondition, add more tests * add a license to the test
2025-02-28 21:39:25 +00:00 · 2021-02-24 17:45:35 +01:00 · 2021-02-24 17:45:35 +01:00 · 3a99e2aa82
commit 3a99e2aa82
parent e1ff4c1354
6 changed files with 300 additions and 101 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
+import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
+
+import java.util.regex.PatternSyntaxException;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+import org.apache.lucene.util.automaton.RegExp;
+
+/**
+ * Checks the "condition" part of affix definition, as in
+ *
+ * <pre>PFX flag stripping prefix [condition [morphological_fields...]]</pre>
+ */
+interface AffixCondition {
+  String ALWAYS_TRUE_KEY = ".*"; // deduplication key shared by all conditions needing no check
+  AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;
+  AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;
+
+  default boolean acceptsStem(String stem) {
+    return acceptsStem(stem.toCharArray(), 0, stem.length());
+  }
+
+  /**
+   * @return whether {@code length} chars of {@code word} starting at {@code offset} match this
+   *     condition as a stem with both "strip" and "affix" removed
+   */
+  boolean acceptsStem(char[] word, int offset, int length);
+
+  /**
+   * @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
+   *     that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
+   */
+  static String uniqueKey(AffixKind kind, String strip, String condition) {
+    if (".".equals(condition)
+        || kind == PREFIX && strip.startsWith(condition)
+        || kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition)) {
+      return ALWAYS_TRUE_KEY;
+    }
+    return condition + " " + kind + " " + strip;
+  }
+
+  /**
+   * Analyzes the given affix kind, strip and condition and returns an object able to efficiently
+   * check that condition on a stem with the strip removed; {@code line} is only used in errors.
+   */
+  static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
+    if (!isRegexp(condition)) {
+      if (kind == SUFFIX && condition.endsWith(strip)) {
+        return substringCondition(
+            kind, condition.substring(0, condition.length() - strip.length()));
+      }
+      if (kind == PREFIX && condition.startsWith(strip)) {
+        return substringCondition(kind, condition.substring(strip.length()));
+      }
+      return ALWAYS_FALSE;
+    }
+
+    int lastBracket = condition.lastIndexOf('[');
+    if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
+      // unclosed [ is tolerated by Hunspell and occurs in some dictionaries
+      condition = condition + "]";
+    }
+
+    try {
+      int conditionChars = countCharPatterns(condition);
+      if (conditionChars <= strip.length()) {
+        // condition lies within strip: prefixes are anchored at its start, suffixes at its end
+        String regex = kind == PREFIX ? condition + ".*" : ".*" + condition;
+        return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
+      }
+      if (kind == PREFIX) {
+        int split = skipCharPatterns(condition, strip.length());
+        if (!strip.matches(condition.substring(0, split))) {
+          return ALWAYS_FALSE;
+        }
+        return regexpCondition(kind, condition.substring(split), conditionChars - strip.length());
+      }
+
+      int split = skipCharPatterns(condition, conditionChars - strip.length());
+      if (!strip.matches(condition.substring(split))) {
+        return ALWAYS_FALSE;
+      }
+      return regexpCondition(kind, condition.substring(0, split), conditionChars - strip.length());
+    } catch (PatternSyntaxException e) {
+      return ALWAYS_FALSE;
+    } catch (Throwable e) {
+      throw new IllegalArgumentException("On line: " + line, e);
+    }
+  }
+
+  private static int skipCharPatterns(String condition, int count) {
+    int pos = 0;
+    for (int i = 0; i < count; i++) pos = skipCharPattern(condition, pos);
+    return pos;
+  }
+
+  private static int countCharPatterns(String condition) {
+    int conditionChars = 0;
+    for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) conditionChars++;
+    return conditionChars;
+  }
+
+  private static int skipCharPattern(String condition, int pos) {
+    if (condition.charAt(pos) == '[') { // a bracketed group counts as a single char pattern
+      pos = condition.indexOf(']', pos + 1);
+      if (pos < 0) {
+        throw new AssertionError("Malformed condition " + condition);
+      }
+    }
+    return pos + 1;
+  }
+
+  private static boolean isRegexp(String condition) {
+    return condition.contains("[") || condition.contains(".") || condition.contains("-");
+  }
+
+  private static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
+    boolean forSuffix = kind == AffixKind.SUFFIX;
+    int condLength = stemCondition.length();
+    return (word, offset, length) -> {
+      if (length < condLength) {
+        return false;
+      }
+      int matchStart = forSuffix ? offset + length - condLength : offset; // stem end or start
+      for (int i = 0; i < condLength; i++) {
+        if (stemCondition.charAt(i) != word[matchStart + i]) {
+          return false;
+        }
+      }
+      return true;
+    };
+  }
+
+  private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
+    boolean forSuffix = kind == AffixKind.SUFFIX;
+    CharacterRunAutomaton automaton =
+        new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
+    return (word, offset, length) ->
+        length >= charCount
+            && automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
+  }
+
+  // "dash hasn't got special meaning" (we must escape it)
+  private static String escapeDash(String re) {
+    if (!re.contains("-")) return re;
+
+    // we have to be careful, even though dash doesn't have a special meaning,
+    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
+    StringBuilder escaped = new StringBuilder();
+    for (int i = 0; i < re.length(); i++) {
+      char c = re.charAt(i);
+      if (c == '-') {
+        escaped.append("\\-");
+      } else {
+        escaped.append(c);
+        if (c == '\\' && i + 1 < re.length()) {
+          escaped.append(re.charAt(i + 1));
+          i++;
+        }
+      }
+    }
+    return escaped.toString();
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+enum AffixKind {
+  PREFIX, // used for "PFX" rules; conditions are checked at the beginning of the stem
+  SUFFIX // used for "SFX" rules; conditions are checked at the end of the stem
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -16,6 +16,8 @@
 */
 package org.apache.lucene.analysis.hunspell;

+import static org.apache.lucene.analysis.hunspell.AffixKind.*;
+
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
@ -59,8 +61,6 @@ import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -89,7 +89,7 @@ public class Dictionary {
   * All condition checks used by prefixes and suffixes. these are typically re-used across many
   * affix stripping rules. so these are deduplicated, to save RAM.
   */
-  ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
+  ArrayList<AffixCondition> patterns = new ArrayList<>();

  /**
   * The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
@ -338,7 +338,7 @@ public class Dictionary {
    Map<String, Integer> seenPatterns = new HashMap<>();

    // zero condition -> 0 ord
-    seenPatterns.put(".*", 0);
+    seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0);
    patterns.add(null);

    // zero strip -> 0 ord
@ -362,9 +362,11 @@ public class Dictionary {
      } else if ("AM".equals(firstWord)) {
        parseMorphAlias(line);
      } else if ("PFX".equals(firstWord)) {
-        parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags);
+        parseAffix(
+            prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags);
      } else if ("SFX".equals(firstWord)) {
-        parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags);
+        parseAffix(
+            suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags);
      } else if (line.equals("COMPLEXPREFIXES")) {
        complexPrefixes =
            true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@ -655,25 +657,6 @@ public class Dictionary {
    return fstCompiler.compile();
  }

-  static String escapeDash(String re) {
-    // we have to be careful, even though dash doesn't have a special meaning,
-    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
-    StringBuilder escaped = new StringBuilder();
-    for (int i = 0; i < re.length(); i++) {
-      char c = re.charAt(i);
-      if (c == '-') {
-        escaped.append("\\-");
-      } else {
-        escaped.append(c);
-        if (c == '\\' && i + 1 < re.length()) {
-          escaped.append(re.charAt(i + 1));
-          i++;
-        }
-      }
-    }
-    return escaped.toString();
-  }
-
  /**
   * Parses a specific affix rule putting the result into the provided affix map
   *
@ -688,7 +671,7 @@ public class Dictionary {
      Set<Character> secondStageFlags,
      String header,
      LineNumberReader reader,
-      boolean isSuffix,
+      AffixKind kind,
      Map<String, Integer> seenPatterns,
      Map<String, Integer> seenStrips,
      FlagEnumerator flags)
@ -738,41 +721,18 @@ public class Dictionary {
      }

      String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
-      // at least the gascon affix file has this issue
-      if (condition.startsWith("[") && condition.indexOf(']') == -1) {
-        condition = condition + "]";
-      }
-      // "dash hasn't got special meaning" (we must escape it)
-      if (condition.indexOf('-') >= 0) {
-        condition = escapeDash(condition);
-      }
-
-      final String regex;
-      if (".".equals(condition)) {
-        regex = ".*"; // Zero condition is indicated by dot
-      } else if (condition.equals(strip)) {
-        regex = ".*"; // TODO: optimize this better:
-        // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
-        // but this is complicated...
-      } else {
-        // TODO: really for suffixes we should reverse the automaton and run them backwards
-        regex = isSuffix ? ".*" + condition : condition + ".*";
-      }
+      String key = AffixCondition.uniqueKey(kind, strip, condition);

      // deduplicate patterns
-      Integer patternIndex = seenPatterns.get(regex);
+      Integer patternIndex = seenPatterns.get(key);
      if (patternIndex == null) {
        patternIndex = patterns.size();
        if (patternIndex > Short.MAX_VALUE) {
          throw new UnsupportedOperationException(
              "Too many patterns, please report this to dev@lucene.apache.org");
        }
-        seenPatterns.put(regex, patternIndex);
-        try {
-          patterns.add(new CharacterRunAutomaton(conditionRegexp(regex).toAutomaton()));
-        } catch (IllegalArgumentException e) {
-          throw new IllegalArgumentException("On line " + reader.getLineNumber() + ": " + line, e);
-        }
+        seenPatterns.put(key, patternIndex);
+        patterns.add(AffixCondition.compile(kind, strip, condition, line));
      }

      Integer stripOrd = seenStrips.get(strip);
@ -811,7 +771,7 @@ public class Dictionary {
        affixArg = cleanInput(affixArg, sb).toString();
      }

-      if (isSuffix) {
+      if (kind == SUFFIX) {
        affixArg = new StringBuilder(affixArg).reverse().toString();
      }

@ -820,17 +780,6 @@ public class Dictionary {
    }
  }

-  private static RegExp conditionRegexp(String regex) {
-    try {
-      return new RegExp(regex, RegExp.NONE);
-    } catch (IllegalArgumentException e) {
-      if (e.getMessage().contains("expected ']'")) {
-        return conditionRegexp(regex + "]");
-      }
-      throw e;
-    }
-  }
-
  char affixData(int affixIndex, int offset) {
    return affixData[affixIndex * 4 + offset];
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -269,7 +269,7 @@ class GeneratingSuggester {

  private boolean checkAffixCondition(int suffixId, String stem) {
    int condition = dictionary.getAffixCondition(suffixId);
-    return condition == 0 || dictionary.patterns.get(condition).run(stem);
+    return condition == 0 || dictionary.patterns.get(condition).acceptsStem(stem);
  }

  private int affixStripLength(int affixId) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.fst.FST;

 /**
@ -486,13 +485,12 @@ final class Stemmer {
    int stripLen = stripEnd - stripStart;

    char[] stripData = dictionary.stripData;
-    boolean condition =
-        isPrefix
-            ? checkCondition(
-                affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen)
-            : checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen);
-    if (!condition) {
-      return null;
+    int condition = dictionary.getAffixCondition(affix);
+    if (condition != 0) {
+      int deAffixedOffset = isPrefix ? offset + affixLen : offset;
+      if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
+        return null;
+      }
    }

    if (stripLen == 0) return word;
@ -547,33 +545,6 @@ final class Stemmer {
    return false;
  }

-  /** checks condition of the concatenation of two strings */
-  // note: this is pretty stupid, we really should subtract strip from the condition up front and
-  // just check the stem
-  // but this is a little bit more complicated.
-  private boolean checkCondition(
-      int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
-    int condition = dictionary.getAffixCondition(affix);
-    if (condition != 0) {
-      CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
-      int state = 0;
-      for (int i = c1off; i < c1off + c1len; i++) {
-        state = pattern.step(state, c1[i]);
-        if (state == -1) {
-          return false;
-        }
-      }
-      for (int i = c2off; i < c2off + c2len; i++) {
-        state = pattern.step(state, c2[i]);
-        if (state == -1) {
-          return false;
-        }
-      }
-      return pattern.isAccept(state);
-    }
-    return true;
-  }
-
  /**
   * Applies the affix rule to the given word, producing a list of stems if any are found
   *
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java
@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.hunspell;
+
+import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_FALSE;
+import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_TRUE_KEY;
+import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
+import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
+
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestAffixCondition extends LuceneTestCase {
+
+  public void testPlainSuffixMatching() {
+    AffixCondition condition = AffixCondition.compile(SUFFIX, "b", "ab", "");
+    assertTrue(condition.acceptsStem("a")); // the stem must end with "a" (condition minus strip)
+    assertFalse(condition.acceptsStem("b"));
+    assertFalse(condition.acceptsStem("ab"));
+  }
+
+  public void testPlainPrefixMatching() {
+    AffixCondition condition = AffixCondition.compile(PREFIX, "a", "ab", "");
+    assertFalse(condition.acceptsStem("ab"));
+    assertTrue(condition.acceptsStem("b")); // the stem must start with "b" (condition minus strip)
+    assertFalse(condition.acceptsStem("a"));
+  }
+
+  public void testDotMatching() {
+    AffixCondition condition = AffixCondition.compile(PREFIX, "", "wr.", "");
+    assertTrue(condition.acceptsStem("wry"));
+    assertTrue(condition.acceptsStem("wrong")); // only the first three chars are checked
+    assertFalse(condition.acceptsStem("white"));
+  }
+
+  public void testUniqueKey() {
+    assertNotEquals(
+        AffixCondition.uniqueKey(PREFIX, "", "x"), AffixCondition.uniqueKey(SUFFIX, "", "x"));
+    assertNotEquals(
+        AffixCondition.uniqueKey(SUFFIX, "y", "x"), AffixCondition.uniqueKey(SUFFIX, "", "x"));
+    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "", ".")); // zero condition
+    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "abc"));
+
+    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "bc")); // strip end
+    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "abc", "ab")); // strip start
+  }
+
+  public void testConditionHasBracketsIntersectingWithStrip() { // incl. an unclosed "["
+    assertTrue(AffixCondition.compile(SUFFIX, "oj", "[io]j", "").acceptsStem("whatever"));
+    assertTrue(AffixCondition.compile(SUFFIX, "oj", "o[ioj", "").acceptsStem("whatever"));
+  }
+
+  public void testImpossibleCondition() { // a word ending with strip "a" can't end with "b"
+    assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "a", "b", ""));
+  }
+
+  public void testNonHunspellPatternCharacters() { // such conditions are expected never to match
+    assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^ax)", ""));
+    assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^.x)", ""));
+    assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "[z](^ax)", ""));
+    assertEquals(ALWAYS_FALSE, AffixCondition.compile(SUFFIX, "x", "(^ax)[z]", ""));
+  }
+}