LUCENE-9806: Hunspell: speed up affix condition checking (#2423)

* LUCENE-9806: Hunspell: speed up affix condition checking: check only the stem beginning/end without strip/condition, not the whole candidate; avoid regexp if possible * hunspell: simplify AffixCondition, add more tests * add a license to the test
2021-02-24 17:45:35 +01:00 · 2021-02-24 17:45:35 +01:00 · 3a99e2aa82
parent e1ff4c1354
commit 3a99e2aa82
6 changed files with 300 additions and 101 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixCondition.java
@ -0,0 +1,181 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
 import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
 import java.util.regex.PatternSyntaxException;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 /**
 * Checks the "condition" part of affix definition, as in
 *
 * <pre>PFX flag stripping prefix [condition [morphological_fields...]]</pre>
 *
 * <p>Implementations only look at the stem, i.e. the word with both the affix and the strip
 * removed; the part of the condition that overlaps the strip is resolved once in {@link
 * #compile}.
 */
 @FunctionalInterface
 interface AffixCondition {
  /** The deduplication key returned by {@link #uniqueKey} for conditions that need no check. */
  String ALWAYS_TRUE_KEY = ".*";

  /** A condition accepting every stem. */
  AffixCondition ALWAYS_TRUE = (word, offset, length) -> true;

  /** A condition rejecting every stem. */
  AffixCondition ALWAYS_FALSE = (word, offset, length) -> false;

  /**
   * Convenience overload checking a whole string.
   *
   * @return the result of {@link #acceptsStem(char[], int, int)} applied to all chars of {@code
   *     stem}
   */
  default boolean acceptsStem(String stem) {
    return acceptsStem(stem.toCharArray(), 0, stem.length());
  }

  /**
   * @return whether the given word matches this condition as a stem with both "strip" and "affix"
   *     removed
   */
  boolean acceptsStem(char[] word, int offset, int length);

  /**
   * @return a key used to deduplicate same condition+strip+kind triples. For trivial conditions
   *     that need no check, {@link #ALWAYS_TRUE_KEY} is returned.
   */
  static String uniqueKey(AffixKind kind, String strip, String condition) {
    // parentheses only spell out the existing precedence: the regexp guard applies to suffixes
    if (".".equals(condition)
        || (kind == PREFIX && strip.startsWith(condition))
        || (kind == SUFFIX && strip.endsWith(condition) && !isRegexp(condition))) {
      return ALWAYS_TRUE_KEY;
    }
    return condition + " " + kind + " " + strip;
  }

  /**
   * Analyzes the given affix kind, strip and condition and returns an object able to efficiently
   * check that condition.
   *
   * @param kind whether the condition is anchored at the start (prefix) or the end (suffix)
   * @param strip the characters re-added to the stem when the affix is removed
   * @param condition the condition text from the affix rule
   * @param line the affix line, used only in the exception message
   * @return a condition to be run against the stem without strip; {@link #ALWAYS_TRUE} or {@link
   *     #ALWAYS_FALSE} when the outcome is fully determined by the strip
   * @throws IllegalArgumentException if the condition cannot be analyzed for a reason other than
   *     a {@link PatternSyntaxException} (which yields {@link #ALWAYS_FALSE})
   */
  static AffixCondition compile(AffixKind kind, String strip, String condition, String line) {
    if (!isRegexp(condition)) {
      if (kind == SUFFIX && condition.endsWith(strip)) {
        return substringCondition(
            kind, condition.substring(0, condition.length() - strip.length()));
      }
      if (kind == PREFIX && condition.startsWith(strip)) {
        return substringCondition(kind, condition.substring(strip.length()));
      }
      // the strip fully covers the condition, so every stem satisfies it
      // (the same case for which uniqueKey returns ALWAYS_TRUE_KEY)
      if ((kind == SUFFIX && strip.endsWith(condition))
          || (kind == PREFIX && strip.startsWith(condition))) {
        return ALWAYS_TRUE;
      }
      return ALWAYS_FALSE;
    }

    int lastBracket = condition.lastIndexOf('[');
    if (lastBracket >= 0 && condition.indexOf(']', lastBracket + 1) < 0) {
      // unclosed [ is tolerated by Hunspell and occurs in some dictionaries
      condition = condition + "]";
    }

    try {
      int conditionChars = countCharPatterns(condition);
      if (conditionChars <= strip.length()) {
        // The whole condition falls inside the strip. A prefix condition is anchored at the
        // start of the strip and a suffix condition at its end.
        String regex = kind == PREFIX ? condition + ".*" : ".*" + condition;
        return strip.matches(regex) ? ALWAYS_TRUE : ALWAYS_FALSE;
      }

      // number of condition char patterns that remain to be checked against the stem
      int stemChars = conditionChars - strip.length();

      if (kind == PREFIX) {
        // the leading patterns must match the strip, the rest is checked on the stem start
        int split = skipCharPatterns(condition, strip.length());
        if (!strip.matches(condition.substring(0, split))) {
          return ALWAYS_FALSE;
        }
        return regexpCondition(kind, condition.substring(split), stemChars);
      }

      // the trailing patterns must match the strip, the rest is checked on the stem end
      int split = skipCharPatterns(condition, stemChars);
      if (!strip.matches(condition.substring(split))) {
        return ALWAYS_FALSE;
      }
      return regexpCondition(kind, condition.substring(0, split), stemChars);
    } catch (PatternSyntaxException e) {
      return ALWAYS_FALSE;
    } catch (Throwable e) {
      // deliberately broad: this also wraps the AssertionError thrown by skipCharPattern
      throw new IllegalArgumentException("On line: " + line, e);
    }
  }

  /** @return the index in {@code condition} just after the first {@code count} char patterns */
  private static int skipCharPatterns(String condition, int count) {
    int pos = 0;
    for (int i = 0; i < count; i++) {
      pos = skipCharPattern(condition, pos);
    }
    return pos;
  }

  /** @return the number of char patterns (single chars or bracket groups) in the condition */
  private static int countCharPatterns(String condition) {
    int conditionChars = 0;
    for (int i = 0; i < condition.length(); i = skipCharPattern(condition, i)) {
      conditionChars++;
    }
    return conditionChars;
  }

  /**
   * @return the index just after the char pattern starting at {@code pos}: the next index for a
   *     single char, or the index after the closing bracket for a {@code [...]} group
   */
  private static int skipCharPattern(String condition, int pos) {
    if (condition.charAt(pos) == '[') {
      pos = condition.indexOf(']', pos + 1);
      if (pos < 0) {
        throw new AssertionError("Malformed condition " + condition);
      }
    }
    return pos + 1;
  }

  /** @return whether the condition contains any of the chars {@code [}, {@code .} or {@code -} */
  private static boolean isRegexp(String condition) {
    return condition.contains("[") || condition.contains(".") || condition.contains("-");
  }

  /**
   * Creates a condition comparing the stem end (for suffixes) or the stem start (otherwise) with
   * the given literal text.
   */
  private static AffixCondition substringCondition(AffixKind kind, String stemCondition) {
    boolean forSuffix = kind == AffixKind.SUFFIX;
    int condLength = stemCondition.length();
    return (word, offset, length) -> {
      if (length < condLength) {
        return false;
      }
      int matchStart = forSuffix ? offset + length - condLength : offset;
      for (int i = 0; i < condLength; i++) {
        if (stemCondition.charAt(i) != word[matchStart + i]) {
          return false;
        }
      }
      return true;
    };
  }

  /**
   * Creates a condition running an automaton built from {@code condition} over exactly {@code
   * charCount} chars at the stem end (for suffixes) or the stem start (otherwise).
   */
  private static AffixCondition regexpCondition(AffixKind kind, String condition, int charCount) {
    boolean forSuffix = kind == AffixKind.SUFFIX;
    CharacterRunAutomaton automaton =
        new CharacterRunAutomaton(new RegExp(escapeDash(condition), RegExp.NONE).toAutomaton());
    return (word, offset, length) ->
        length >= charCount
            && automaton.run(word, forSuffix ? offset + length - charCount : offset, charCount);
  }

  // "dash hasn't got special meaning" (we must escape it)
  private static String escapeDash(String re) {
    if (!re.contains("-")) {
      return re;
    }
    // we have to be careful, even though dash doesn't have a special meaning,
    // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it
    StringBuilder escaped = new StringBuilder();
    for (int i = 0; i < re.length(); i++) {
      char c = re.charAt(i);
      if (c == '-') {
        escaped.append("\\-");
      } else {
        escaped.append(c);
        if (c == '\\' && i + 1 < re.length()) {
          // copy an already escaped char verbatim so that it isn't escaped twice
          escaped.append(re.charAt(i + 1));
          i++;
        }
      }
    }
    return escaped.toString();
  }
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/AffixKind.java
@ -0,0 +1,22 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 /** The two kinds of affix rules distinguished when parsing and checking affix conditions. */
 enum AffixKind {
  /** Used for rules declared with {@code PFX}. */
  PREFIX,

  /** Used for rules declared with {@code SFX}. */
  SUFFIX;
 }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -16,6 +16,8 @@
 */
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.AffixKind.*;
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
 import java.io.ByteArrayInputStream;
@ -59,8 +61,6 @@ import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.OfflineSorter;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
 import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.automaton.RegExp;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.FSTCompiler;
 import org.apache.lucene.util.fst.IntSequenceOutputs;
@ -89,7 +89,7 @@ public class Dictionary {
   * All condition checks used by prefixes and suffixes. these are typically re-used across many
   * affix stripping rules. so these are deduplicated, to save RAM.
   */
-  ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
+  ArrayList<AffixCondition> patterns = new ArrayList<>();
  /**
   * The entries in the .dic file, mapping to their set of flags. the fst output is the ordinal list
@ -338,7 +338,7 @@ public class Dictionary {
    Map<String, Integer> seenPatterns = new HashMap<>();
    // zero condition -> 0 ord
-    seenPatterns.put(".*", 0);
+    seenPatterns.put(AffixCondition.ALWAYS_TRUE_KEY, 0);
    patterns.add(null);
    // zero strip -> 0 ord
@ -362,9 +362,11 @@ public class Dictionary {
      } else if ("AM".equals(firstWord)) {
        parseMorphAlias(line);
      } else if ("PFX".equals(firstWord)) {
-        parseAffix(prefixes, prefixContFlags, line, reader, false, seenPatterns, seenStrips, flags);
+        parseAffix(
            prefixes, prefixContFlags, line, reader, PREFIX, seenPatterns, seenStrips, flags);
      } else if ("SFX".equals(firstWord)) {
-        parseAffix(suffixes, suffixContFlags, line, reader, true, seenPatterns, seenStrips, flags);
+        parseAffix(
            suffixes, suffixContFlags, line, reader, SUFFIX, seenPatterns, seenStrips, flags);
      } else if (line.equals("COMPLEXPREFIXES")) {
        complexPrefixes =
            true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@ -655,25 +657,6 @@ public class Dictionary {
    return fstCompiler.compile();
  }
  /**
   * Returns a copy of the given expression where every unescaped dash is preceded by a backslash.
   * A backslash followed by another char is copied together with that char, so an already escaped
   * dash is left as is.
   */
  static String escapeDash(String re) {
    // even though dash doesn't have a special meaning, some dictionaries already
    // escape it (e.g. pt_PT), and such escapes must not be doubled
    StringBuilder result = new StringBuilder();
    int next = 0;
    while (next < re.length()) {
      char current = re.charAt(next++);
      if (current == '-') {
        result.append("\\-");
        continue;
      }
      result.append(current);
      boolean startsEscape = current == '\\' && next < re.length();
      if (startsEscape) {
        // take the escaped char verbatim and move past it
        result.append(re.charAt(next++));
      }
    }
    return result.toString();
  }
  /**
   * Parses a specific affix rule putting the result into the provided affix map
   *
@ -688,7 +671,7 @@ public class Dictionary {
      Set<Character> secondStageFlags,
      String header,
      LineNumberReader reader,
-      boolean isSuffix,
+      AffixKind kind,
      Map<String, Integer> seenPatterns,
      Map<String, Integer> seenStrips,
      FlagEnumerator flags)
@ -738,41 +721,18 @@ public class Dictionary {
      }
      String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
-      // at least the gascon affix file has this issue
+      String key = AffixCondition.uniqueKey(kind, strip, condition);
      if (condition.startsWith("[") && condition.indexOf(']') == -1) {
        condition = condition + "]";
      }
      // "dash hasn't got special meaning" (we must escape it)
      if (condition.indexOf('-') >= 0) {
        condition = escapeDash(condition);
      }
      final String regex;
      if (".".equals(condition)) {
        regex = ".*"; // Zero condition is indicated by dot
      } else if (condition.equals(strip)) {
        regex = ".*"; // TODO: optimize this better:
        // if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
        // but this is complicated...
      } else {
        // TODO: really for suffixes we should reverse the automaton and run them backwards
        regex = isSuffix ? ".*" + condition : condition + ".*";
      }
      // deduplicate patterns
-      Integer patternIndex = seenPatterns.get(regex);
+      Integer patternIndex = seenPatterns.get(key);
      if (patternIndex == null) {
        patternIndex = patterns.size();
        if (patternIndex > Short.MAX_VALUE) {
          throw new UnsupportedOperationException(
              "Too many patterns, please report this to dev@lucene.apache.org");
        }
-        seenPatterns.put(regex, patternIndex);
+        seenPatterns.put(key, patternIndex);
-        try {
+        patterns.add(AffixCondition.compile(kind, strip, condition, line));
          patterns.add(new CharacterRunAutomaton(conditionRegexp(regex).toAutomaton()));
        } catch (IllegalArgumentException e) {
          throw new IllegalArgumentException("On line " + reader.getLineNumber() + ": " + line, e);
        }
      }
      Integer stripOrd = seenStrips.get(strip);
@ -811,7 +771,7 @@ public class Dictionary {
        affixArg = cleanInput(affixArg, sb).toString();
      }
-      if (isSuffix) {
+      if (kind == SUFFIX) {
        affixArg = new StringBuilder(affixArg).reverse().toString();
      }
@ -820,17 +780,6 @@ public class Dictionary {
    }
  }
  /**
   * Parses the given expression, appending a closing bracket and retrying for as long as parsing
   * fails with a message containing "expected ']'". Any other failure is rethrown unchanged.
   */
  private static RegExp conditionRegexp(String regex) {
    String candidate = regex;
    while (true) {
      try {
        return new RegExp(candidate, RegExp.NONE);
      } catch (IllegalArgumentException e) {
        boolean missingBracket = e.getMessage().contains("expected ']'");
        if (!missingBracket) {
          throw e;
        }
        candidate = candidate + "]";
      }
    }
  }
  /**
   * Reads one value from the {@code affixData} array.
   *
   * @param affixIndex index of the affix; multiplied by 4 to locate its first slot
   * @param offset position added to that first slot
   * @return the char stored at {@code affixIndex * 4 + offset}
   */
  char affixData(int affixIndex, int offset) {
    int position = affixIndex * 4 + offset;
    return affixData[position];
  }
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/GeneratingSuggester.java
@ -269,7 +269,7 @@ class GeneratingSuggester {
  private boolean checkAffixCondition(int suffixId, String stem) {
    int condition = dictionary.getAffixCondition(suffixId);
-    return condition == 0 || dictionary.patterns.get(condition).run(stem);
+    return condition == 0 || dictionary.patterns.get(condition).acceptsStem(stem);
  }
  private int affixStripLength(int affixId) {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@ -24,7 +24,6 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.apache.lucene.util.fst.FST;
 /**
@ -486,13 +485,12 @@ final class Stemmer {
    int stripLen = stripEnd - stripStart;
    char[] stripData = dictionary.stripData;
-    boolean condition =
+    int condition = dictionary.getAffixCondition(affix);
-        isPrefix
+    if (condition != 0) {
-            ? checkCondition(
+      int deAffixedOffset = isPrefix ? offset + affixLen : offset;
-                affix, stripData, stripStart, stripLen, word, offset + affixLen, deAffixedLen)
+      if (!dictionary.patterns.get(condition).acceptsStem(word, deAffixedOffset, deAffixedLen)) {
-            : checkCondition(affix, word, offset, deAffixedLen, stripData, stripStart, stripLen);
+        return null;
-    if (!condition) {
+      }
      return null;
    }
    if (stripLen == 0) return word;
@ -547,33 +545,6 @@ final class Stemmer {
    return false;
  }
  /**
   * Checks the condition of the given affix against the concatenation of two char ranges: the
   * chars of {@code c1} are fed to the condition automaton first, then the chars of {@code c2}.
   *
   * @return {@code true} if the affix condition ordinal is 0 or the automaton accepts the
   *     concatenated chars
   */
  // note: this is wasteful, the strip should really be subtracted from the condition up front
  // so that only the stem needs checking, but that is a little bit more complicated.
  private boolean checkCondition(
      int affix, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
    int condition = dictionary.getAffixCondition(affix);
    if (condition == 0) {
      // nothing to check for this affix
      return true;
    }

    CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
    int state = 0;

    int c1end = c1off + c1len;
    for (int pos = c1off; pos < c1end; pos++) {
      state = pattern.step(state, c1[pos]);
      if (state == -1) {
        return false;
      }
    }

    int c2end = c2off + c2len;
    for (int pos = c2off; pos < c2end; pos++) {
      state = pattern.step(state, c2[pos]);
      if (state == -1) {
        return false;
      }
    }

    return pattern.isAccept(state);
  }
  /**
   * Applies the affix rule to the given word, producing a list of stems if any are found
   *
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAffixCondition.java
@ -0,0 +1,76 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.hunspell;
 import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_FALSE;
 import static org.apache.lucene.analysis.hunspell.AffixCondition.ALWAYS_TRUE_KEY;
 import static org.apache.lucene.analysis.hunspell.AffixKind.PREFIX;
 import static org.apache.lucene.analysis.hunspell.AffixKind.SUFFIX;
 import org.apache.lucene.util.LuceneTestCase;
 /** Tests compilation, matching and key deduplication of {@link AffixCondition}. */
 public class TestAffixCondition extends LuceneTestCase {

  /** Compiles a suffix condition with an empty source line. */
  private static AffixCondition suffix(String strip, String condition) {
    return AffixCondition.compile(SUFFIX, strip, condition, "");
  }

  /** Compiles a prefix condition with an empty source line. */
  private static AffixCondition prefix(String strip, String condition) {
    return AffixCondition.compile(PREFIX, strip, condition, "");
  }

  public void testPlainSuffixMatching() {
    // strip "b" covers the tail of "ab", so only "a" is left to check on the stem
    AffixCondition stemEndsWithA = suffix("b", "ab");
    assertTrue(stemEndsWithA.acceptsStem("a"));
    assertFalse(stemEndsWithA.acceptsStem("b"));
    assertFalse(stemEndsWithA.acceptsStem("ab"));
  }

  public void testPlainPrefixMatching() {
    // strip "a" covers the head of "ab", so only "b" is left to check on the stem
    AffixCondition stemStartsWithB = prefix("a", "ab");
    assertFalse(stemStartsWithB.acceptsStem("ab"));
    assertTrue(stemStartsWithB.acceptsStem("b"));
    assertFalse(stemStartsWithB.acceptsStem("a"));
  }

  public void testDotMatching() {
    AffixCondition wrAndAnyChar = prefix("", "wr.");
    assertTrue(wrAndAnyChar.acceptsStem("wry"));
    assertTrue(wrAndAnyChar.acceptsStem("wrong"));
    assertFalse(wrAndAnyChar.acceptsStem("white"));
  }

  public void testUniqueKey() {
    // keys differ by kind and by strip
    String suffixKey = AffixCondition.uniqueKey(SUFFIX, "", "x");
    assertNotEquals(AffixCondition.uniqueKey(PREFIX, "", "x"), suffixKey);
    assertNotEquals(AffixCondition.uniqueKey(SUFFIX, "y", "x"), suffixKey);

    // trivial conditions share the always-true key
    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "", "."));
    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "abc"));
    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(SUFFIX, "abc", "bc"));
    assertEquals(ALWAYS_TRUE_KEY, AffixCondition.uniqueKey(PREFIX, "abc", "ab"));
  }

  public void testConditionHasBracketsIntersectingWithStrip() {
    assertTrue(suffix("oj", "[io]j").acceptsStem("whatever"));
    // the second condition has an unclosed bracket
    assertTrue(suffix("oj", "o[ioj").acceptsStem("whatever"));
  }

  public void testImpossibleCondition() {
    assertEquals(ALWAYS_FALSE, suffix("a", "b"));
  }

  public void testNonHunspellPatternCharacters() {
    assertEquals(ALWAYS_FALSE, suffix("x", "(^ax)"));
    assertEquals(ALWAYS_FALSE, suffix("x", "(^.x)"));
    assertEquals(ALWAYS_FALSE, suffix("x", "[z](^ax)"));
    assertEquals(ALWAYS_FALSE, suffix("x", "(^ax)[z]"));
  }
 }