LUCENE-5468: convert affixes to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68
2014-02-27 17:53:30 +00:00 · 2014-02-27 17:53:30 +00:00 · b2b86dd8ad
parent cdec14902b
commit b2b86dd8ad
4 changed files with 83 additions and 19 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;

 import java.io.*;
 import java.nio.charset.Charset;
@ -46,6 +48,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.regex.Pattern;

 /**
@ -68,8 +71,8 @@ public class Dictionary {
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

-  public CharArrayMap<List<Character>> prefixes;
-  public CharArrayMap<List<Character>> suffixes;
+  public FST<IntsRef> prefixes;
+  public FST<IntsRef> suffixes;
  
  // all Patterns used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.
@ -137,7 +140,7 @@ public class Dictionary {
      ord = lookupOrd(word, offset, length);
    } catch (IOException ex) { /* bogus */ }
    if (ord == null) {
-      return null;
+      return null;  
    }
    return decodeFlags(flagLookup.get(ord, scratch));
  }
@ -175,8 +178,8 @@ public class Dictionary {
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
   */
-  public List<Character> lookupPrefix(char word[], int offset, int length) {
-    return prefixes.get(word, offset, length);
+  IntsRef lookupPrefix(char word[], int offset, int length) {
+    return lookupAffix(prefixes, word, offset, length);
  }

  /**
@ -187,8 +190,42 @@ public class Dictionary {
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
   */
-  List<Character> lookupSuffix(char word[], int offset, int length) {
-    return suffixes.get(word, offset, length);
+  IntsRef lookupSuffix(char word[], int offset, int length) {
+    return lookupAffix(suffixes, word, offset, length);
+  }
+  
+  // TODO: this is pretty stupid, considering how the stemming algorithm works
+  // we can speed it up to be significantly faster!
+  IntsRef lookupAffix(FST<IntsRef> fst, char word[], int offset, int length) {
+    if (fst == null) {
+      return null;
+    }
+    final FST.BytesReader bytesReader = fst.getBytesReader();
+    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
+    // Accumulate output as we go
+    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
+    IntsRef output = NO_OUTPUT;
+    
+    int l = offset + length;
+    try {
+      for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
+        cp = Character.codePointAt(word, i, l);
+        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
+          return null;
+        } else if (arc.output != NO_OUTPUT) {
+          output = fst.outputs.add(output, arc.output);
+        }
+      }
+      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        return fst.outputs.add(output, arc.output);
+      } else {
+        return output;
+      }
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
  }

  /**
@ -199,8 +236,8 @@ public class Dictionary {
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    TreeMap<String, List<Character>> prefixes = new TreeMap<>();
+    TreeMap<String, List<Character>> suffixes = new TreeMap<>();
    Map<String,Integer> seenPatterns = new HashMap<>();

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@ -218,6 +255,27 @@ public class Dictionary {
        flagParsingStrategy = getFlagParsingStrategy(line);
      }
    }
+    
+    this.prefixes = affixFST(prefixes);
+    this.suffixes = affixFST(suffixes);
+  }
+  
+  private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
+    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
+    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+    
+    IntsRef scratch = new IntsRef();
+    for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+      Util.toUTF32(entry.getKey(), scratch);
+      List<Character> entries = entry.getValue();
+      IntsRef output = new IntsRef(entries.size());
+      int upto = 0;
+      for (Character c : entries) {
+        output.ints[output.length++] = c;
+      }
+      builder.add(scratch, output);
+    }
+    return builder.finish();
  }

  /**
@ -231,7 +289,7 @@ public class Dictionary {
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   */
-  private void parseAffix(CharArrayMap<List<Character>> affixes,
+  private void parseAffix(TreeMap<String,List<Character>> affixes,
                          String header,
                          LineNumberReader reader,
                          String conditionPattern,
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;

 /**
@ -125,12 +126,13 @@ final class Stemmer {
    List<CharsRef> stems = new ArrayList<CharsRef>();

    for (int i = 0; i < length; i++) {
-      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
      if (suffixes == null) {
        continue;
      }

-      for (Character suffix : suffixes) {
+      for (int j = 0; j < suffixes.length; j++) {
+        int suffix = suffixes.ints[suffixes.offset + j];
        affixReader.setPosition(8 * suffix);
        char flag = (char) (affixReader.readShort() & 0xffff);
        if (hasCrossCheckedFlag(flag, flags)) {
@ -149,12 +151,13 @@ final class Stemmer {
    }

    for (int i = length - 1; i >= 0; i--) {
-      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
+      IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
      if (prefixes == null) {
        continue;
      }

-      for (Character prefix : prefixes) {
+      for (int j = 0; j < prefixes.length; j++) {
+        int prefix = prefixes.ints[prefixes.offset + j];
        affixReader.setPosition(8 * prefix);
        char flag = (char) (affixReader.readShort() & 0xffff);
        if (hasCrossCheckedFlag(flag, flags)) {
@ -185,7 +188,7 @@ final class Stemmer {
   * @param recursionDepth Level of recursion this stemming step is at
   * @return List of stems for the word, or an empty list if none are found
   */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
    segment.setLength(0);
    segment.append(strippedWord, 0, length);
    
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase {
          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                             "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                             "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
+                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                             "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
+                             "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                             "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
                             "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
        }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase {
    InputStream dictStream = getClass().getResourceAsStream("simple.dic");

    Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
    char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
    assertNotNull(flags);
    assertEquals(1, flags.length);
@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase {
    InputStream dictStream = getClass().getResourceAsStream("compressed.dic");

    Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
    
    affixStream.close();