LUCENE-5518: minor hunspell optimizations

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1576738 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-03-12 13:44:48 +00:00
parent 00718a2704
commit 0e2966b6c0
4 changed files with 121 additions and 46 deletions

View File

@ -27,6 +27,8 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
@ -54,6 +56,7 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -83,17 +86,16 @@ public class Dictionary {
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
FST<IntsRef> prefixes;
FST<IntsRef> suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// all condition checks used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
// TODO: maybe don't use Pattern for the condition check...
// TODO: when we cut over Affix to FST, just store integer index to this.
ArrayList<Pattern> patterns = new ArrayList<>();
ArrayList<CharacterRunAutomaton> patterns = new ArrayList<>();
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal list for flagLookup
@ -103,7 +105,8 @@ public class Dictionary {
BytesRefHash flagLookup = new BytesRefHash();
// the list of unique strip affixes.
BytesRefHash stripLookup = new BytesRefHash();
char[] stripData;
int[] stripOffsets;
// 8 bytes per affix
byte[] affixData = new byte[64];
@ -118,6 +121,7 @@ public class Dictionary {
boolean ignoreCase;
boolean complexPrefixes;
boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
int circumfix = -1; // circumfix flag, or -1 if one is not defined
@ -160,7 +164,6 @@ public class Dictionary {
this.needsInputCleaning = ignoreCase;
this.needsOutputCleaning = false; // set if we have an OCONV
flagLookup.add(new BytesRef()); // no flags -> ord 0
stripLookup.add(new BytesRef()); // no strip -> ord 0
File aff = File.createTempFile("affix", "aff", tempDir);
OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
@ -272,6 +275,14 @@ public class Dictionary {
TreeMap<String, List<Character>> prefixes = new TreeMap<>();
TreeMap<String, List<Character>> suffixes = new TreeMap<>();
Map<String,Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord
seenPatterns.put(".*", 0);
patterns.add(null);
// zero strip -> 0 ord
Map<String,Integer> seenStrips = new LinkedHashMap<>();
seenStrips.put("", 0);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
@ -283,9 +294,9 @@ public class Dictionary {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
@ -326,6 +337,22 @@ public class Dictionary {
this.prefixes = affixFST(prefixes);
this.suffixes = affixFST(suffixes);
int totalChars = 0;
for (String strip : seenStrips.keySet()) {
totalChars += strip.length();
}
stripData = new char[totalChars];
stripOffsets = new int[seenStrips.size()+1];
int currentOffset = 0;
int currentIndex = 0;
for (String strip : seenStrips.keySet()) {
stripOffsets[currentIndex++] = currentOffset;
strip.getChars(0, strip.length(), stripData, currentOffset);
currentOffset += strip.length();
}
assert currentIndex == seenStrips.size();
stripOffsets[currentIndex] = currentOffset;
}
private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
@ -360,7 +387,8 @@ public class Dictionary {
String header,
LineNumberReader reader,
String conditionPattern,
Map<String,Integer> seenPatterns) throws IOException, ParseException {
Map<String,Integer> seenPatterns,
Map<String,Integer> seenStrips) throws IOException, ParseException {
BytesRef scratch = new BytesRef();
StringBuilder sb = new StringBuilder();
@ -399,7 +427,10 @@ public class Dictionary {
appendFlags = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
twoStageAffix = true;
}
// TODO: add test and fix zero-affix handling!
String condition = ruleArgs.length > 4 ? ruleArgs[4] : ".";
// at least the gascon affix file has this issue
@ -411,7 +442,16 @@ public class Dictionary {
condition = condition.replace("-", "\\-");
}
String regex = String.format(Locale.ROOT, conditionPattern, condition);
final String regex;
if (".".equals(condition)) {
regex = ".*"; // Zero condition is indicated by dot
} else if (condition.equals(strip)) {
regex = ".*"; // TODO: optimize this better:
// if we remove 'strip' from condition, we don't have to append 'strip' to check it...!
// but this is complicated...
} else {
regex = String.format(Locale.ROOT, conditionPattern, condition);
}
// deduplicate patterns
Integer patternIndex = seenPatterns.get(regex);
@ -421,17 +461,17 @@ public class Dictionary {
throw new UnsupportedOperationException("Too many patterns, please report this to dev@lucene.apache.org");
}
seenPatterns.put(regex, patternIndex);
Pattern pattern = Pattern.compile(regex);
CharacterRunAutomaton pattern = new CharacterRunAutomaton(new RegExp(regex, RegExp.NONE).toAutomaton());
patterns.add(pattern);
}
scratch.copyChars(strip);
int stripOrd = stripLookup.add(scratch);
if (stripOrd < 0) {
// already exists in our hash
stripOrd = (-stripOrd)-1;
} else if (stripOrd > Character.MAX_VALUE) {
throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
Integer stripOrd = seenStrips.get(strip);
if (stripOrd == null) {
stripOrd = seenStrips.size();
seenStrips.put(strip, stripOrd);
if (stripOrd > Character.MAX_VALUE) {
throw new UnsupportedOperationException("Too many unique strips, please report this to dev@lucene.apache.org");
}
}
if (appendFlags == null) {
@ -449,7 +489,7 @@ public class Dictionary {
}
affixWriter.writeShort((short)flag);
affixWriter.writeShort((short)stripOrd);
affixWriter.writeShort((short)stripOrd.intValue());
// encode crossProduct into patternIndex
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
affixWriter.writeShort((short)patternOrd);
@ -765,6 +805,9 @@ public class Dictionary {
}
static char[] decodeFlags(BytesRef b) {
if (b.length == 0) {
return CharsRef.EMPTY_CHARS;
}
int len = b.length >>> 1;
char flags[] = new char[len];
int upto = 0;

View File

@ -22,7 +22,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
@ -31,6 +30,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
@ -160,7 +160,7 @@ final class Stemmer {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<CharsRef>();
if (doPrefix) {
if (doPrefix && dictionary.prefixes != null) {
for (int i = length - 1; i >= 0; i--) {
IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
@ -197,12 +197,19 @@ final class Stemmer {
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(scratch.utf8ToString())
.append(word, deAffixedStart, deAffixedLength)
.toString();
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd+1];
int stripLength = stripEnd - stripStart;
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
stems.addAll(stemList);
}
@ -210,7 +217,7 @@ final class Stemmer {
}
}
if (doSuffix) {
if (doSuffix && dictionary.suffixes != null) {
for (int i = 0; i < length; i++) {
IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
@ -246,11 +253,20 @@ final class Stemmer {
if (compatible) {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
// TODO: can we do this in-place?
dictionary.stripLookup.get(stripOrd, scratch);
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd+1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
stems.addAll(stemList);
}
@ -260,6 +276,30 @@ final class Stemmer {
return stems;
}
/** checks condition of the concatenation of two strings */
// note: this is pretty stupid, we really should subtract strip from the condition up front and just check the stem
// but this is a little bit more complicated.
private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
// condition ordinal 0 is the reserved "no condition" entry (".*" — always matches),
// and patterns.get(0) is null, so only walk the automaton for a real condition.
if (condition != 0) {
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
int state = pattern.getInitialState();
// feed the first string slice [c1off, c1off+c1len) through the DFA;
// step() returning -1 means no transition exists for that char → reject early.
for (int i = c1off; i < c1off + c1len; i++) {
state = pattern.step(state, c1[i]);
if (state == -1) {
return false;
}
}
// continue from the same state over the second slice, so the automaton
// effectively sees the concatenation c1-slice + c2-slice without building a new String.
for (int i = c2off; i < c2off + c2len; i++) {
state = pattern.step(state, c2[i]);
if (state == -1) {
return false;
}
}
// the condition holds only if the state reached after the full concatenation is accepting
return pattern.isAccept(state);
}
return true;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
@ -273,10 +313,7 @@ final class Stemmer {
* @param prefix true if we are removing a prefix (false if its a suffix)
* @return List of stems for the word, or an empty list if none are found
*/
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
// TODO: just pass this in from before, no need to decode it twice
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
@ -285,11 +322,6 @@ final class Stemmer {
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
Pattern pattern = dictionary.patterns.get(condition);
if (!pattern.matcher(segment).matches()) {
return Collections.emptyList();
}
List<CharsRef> stems = new ArrayList<CharsRef>();
@ -338,9 +370,9 @@ final class Stemmer {
if (prefix) {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
} else if (!dictionary.complexPrefixes) {
// COMPLEXPREFIXES = false: combine with a suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
@ -350,7 +382,7 @@ final class Stemmer {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
} else if (prefix == false && dictionary.complexPrefixes == false) {
} else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
}

View File

@ -169,7 +169,7 @@ public class TestAllDictionaries extends LuceneTestCase {
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +

View File

@ -185,7 +185,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripData) + ", " +
"conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
"affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +