From b2b86dd8add14c8e16d9b794707a3948834a6e68 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 17:53:30 +0000
Subject: [PATCH] LUCENE-5468: convert affixes to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Dictionary.java | 78 ++++++++++++++++---
 .../lucene/analysis/hunspell2/Stemmer.java    | 13 ++--
 .../hunspell2/TestAllDictionaries.java        |  3 +
 .../analysis/hunspell2/TestDictionary.java    |  8 +-
 4 files changed, 83 insertions(+), 19 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index b30bdaa1e92..b9f9c82c2f5 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil;
 import org.apache.lucene.util.Version;
 import org.apache.lucene.util.fst.Builder;
 import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.IntSequenceOutputs;
 import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
 
 import java.io.*;
 import java.nio.charset.Charset;
@@ -46,6 +48,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.regex.Pattern;
 
 /**
@@ -68,8 +71,8 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
-  public CharArrayMap<List<Character>> prefixes;
-  public CharArrayMap<List<Character>> suffixes;
+  public FST<IntsRef> prefixes;
+  public FST<IntsRef> suffixes;
   
   // all Patterns used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
@@ -137,7 +140,7 @@ public class Dictionary {
       ord = lookupOrd(word, offset, length);
     } catch (IOException ex) { /* bogus */ }
     if (ord == null) {
-      return null;
+      return null;  
     }
     return decodeFlags(flagLookup.get(ord, scratch));
   }
@@ -175,8 +178,8 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
    */
-  public List<Character> lookupPrefix(char word[], int offset, int length) {
-    return prefixes.get(word, offset, length);
+  IntsRef lookupPrefix(char word[], int offset, int length) {
+    return lookupAffix(prefixes, word, offset, length);
   }
 
   /**
@@ -187,8 +190,42 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
    */
-  List<Character> lookupSuffix(char word[], int offset, int length) {
-    return suffixes.get(word, offset, length);
+  IntsRef lookupSuffix(char word[], int offset, int length) {
+    return lookupAffix(suffixes, word, offset, length);
+  }
+  
+  // TODO: this is pretty stupid, considering how the stemming algorithm works
+  // we can speed it up to be significantly faster!
+  IntsRef lookupAffix(FST<IntsRef> fst, char word[], int offset, int length) {
+    if (fst == null) {
+      return null;
+    }
+    final FST.BytesReader bytesReader = fst.getBytesReader();
+    final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
+    // Accumulate output as we go
+    final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
+    IntsRef output = NO_OUTPUT;
+    
+    int l = offset + length;
+    try {
+      for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
+        cp = Character.codePointAt(word, i, l);
+        if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
+          return null;
+        } else if (arc.output != NO_OUTPUT) {
+          output = fst.outputs.add(output, arc.output);
+        }
+      }
+      if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        return fst.outputs.add(output, arc.output);
+      } else {
+        return output;
+      }
+    } catch (IOException bogus) {
+      throw new RuntimeException(bogus);
+    }
   }
 
   /**
@@ -199,8 +236,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    TreeMap<String, List<Character>> prefixes = new TreeMap<>();
+    TreeMap<String, List<Character>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@@ -218,6 +255,27 @@ public class Dictionary {
         flagParsingStrategy = getFlagParsingStrategy(line);
       }
     }
+    
+    this.prefixes = affixFST(prefixes);
+    this.suffixes = affixFST(suffixes);
+  }
+  
+  private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
+    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
+    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
+    
+    IntsRef scratch = new IntsRef();
+    for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+      Util.toUTF32(entry.getKey(), scratch);
+      List<Character> entries = entry.getValue();
+      IntsRef output = new IntsRef(entries.size());
+      int upto = 0;
+      for (Character c : entries) {
+        output.ints[output.length++] = c;
+      }
+      builder.add(scratch, output);
+    }
+    return builder.finish();
   }
 
   /**
@@ -231,7 +289,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(CharArrayMap<List<Character>> affixes,
+  private void parseAffix(TreeMap<String,List<Character>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 4eaff6a9e95..d6b0133830a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.Version;
 
 /**
@@ -125,12 +126,13 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     for (int i = 0; i < length; i++) {
-      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i);
       if (suffixes == null) {
         continue;
       }
 
-      for (Character suffix : suffixes) {
+      for (int j = 0; j < suffixes.length; j++) {
+        int suffix = suffixes.ints[suffixes.offset + j];
         affixReader.setPosition(8 * suffix);
         char flag = (char) (affixReader.readShort() & 0xffff);
         if (hasCrossCheckedFlag(flag, flags)) {
@@ -149,12 +151,13 @@ final class Stemmer {
     }
 
     for (int i = length - 1; i >= 0; i--) {
-      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
+      IntsRef prefixes = dictionary.lookupPrefix(word, 0, i);
       if (prefixes == null) {
         continue;
       }
 
-      for (Character prefix : prefixes) {
+      for (int j = 0; j < prefixes.length; j++) {
+        int prefix = prefixes.ints[prefixes.offset + j];
         affixReader.setPosition(8 * prefix);
         char flag = (char) (affixReader.readShort() & 0xffff);
         if (hasCrossCheckedFlag(flag, flags)) {
@@ -185,7 +188,7 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
index 9f9bce98236..d00fc634944 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase {
           System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
                              "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
                              "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
+                             "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
+                             "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " +
+                             "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " +
                              "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
                              "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
         }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
index 14c6e8967d0..e8e0fd0d030 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
@@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase {
     InputStream dictStream = getClass().getResourceAsStream("simple.dic");
 
     Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
     char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
     assertNotNull(flags);
     assertEquals(1, flags.length);
@@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase {
     InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
 
     Dictionary dictionary = new Dictionary(affixStream, dictStream);
-    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
-    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
     assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
     
     affixStream.close();