From cdec14902bc86e7826c3194199dceaa40991c153 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 27 Feb 2014 17:19:15 +0000
Subject: [PATCH] LUCENE-5468: encode affix data as 8 bytes per affix, before
 cutting over to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Affix.java      | 113 ------------------
 .../lucene/analysis/hunspell2/Dictionary.java |  64 ++++++----
 .../lucene/analysis/hunspell2/Stemmer.java    |  43 +++++--
 3 files changed, 71 insertions(+), 149 deletions(-)
 delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
deleted file mode 100644
index eb67f60e763..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ /dev/null
@@ -1,113 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Wrapper class representing a hunspell affix
- */
-final class Affix {
-
-  private int appendFlags; // continuation class flags
-  private int condition; // check condition
-  private boolean crossProduct;
-  private char flag;
-  private int strip;
-
-  /**
-   * Returns the flags defined for the affix append
-   *
-   * @return Flags defined for the affix append
-   */
-  public int getAppendFlags() {
-    return appendFlags;
-  }
-
-  /**
-   * Sets the flags defined for the affix append
-   *
-   * @param appendFlags Flags defined for the affix append
-   */
-  public void setAppendFlags(int appendFlags) {
-    this.appendFlags = appendFlags;
-  }
-
-  /**
-   * Returns the stripping characters defined for the affix
-   *
-   * @return Stripping characters defined for the affix
-   */
-  public int getStrip() {
-    return strip;
-  }
-
-  /**
-   * Sets the stripping characters defined for the affix
-   *
-   * @param strip Stripping characters defined for the affix
-   */
-  public void setStrip(int strip) {
-    this.strip = strip;
-  }
-
-  /**
-   * Sets the condition that must be met before the affix can be applied
-   */
-  public void setCondition(int condition) {
-    this.condition = condition;
-  }
-  
-  public int getCondition() {
-    return condition;
-  }
-
-  /**
-   * Returns the affix flag
-   *
-   * @return Affix flag
-   */
-  public char getFlag() {
-    return flag;
-  }
-
-  /**
-   * Sets the affix flag
-   *
-   * @param flag Affix flag
-   */
-  public void setFlag(char flag) {
-    this.flag = flag;
-  }
-
-  /**
-   * Returns whether the affix is defined as cross product
-   *
-   * @return {@code true} if the affix is cross product, {@code false} otherwise
-   */
-  public boolean isCrossProduct() {
-    return crossProduct;
-  }
-
-  /**
-   * Sets whether the affix is defined as cross product
-   *
-   * @param crossProduct Whether the affix is defined as cross product
-   */
-  public void setCrossProduct(boolean crossProduct) {
-    this.crossProduct = crossProduct;
-  }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index 35c7aee6081..b30bdaa1e92 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2;
  */
 
 import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.IOUtils;
@@ -66,8 +68,8 @@ public class Dictionary {
   private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
   private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
 
-  public CharArrayMap<List<Affix>> prefixes;
-  public CharArrayMap<List<Affix>> suffixes;
+  public CharArrayMap<List<Character>> prefixes;
+  public CharArrayMap<List<Character>> suffixes;
   
   // all Patterns used by prefixes and suffixes. these are typically re-used across
   // many affix stripping rules. so these are deduplicated, to save RAM.
@@ -84,6 +86,10 @@ public class Dictionary {
   
   // the list of unique strip affixes.
   public BytesRefHash stripLookup = new BytesRefHash();
+  
+  // 8 bytes per affix
+  public byte[] affixData = new byte[64];
+  private int currentAffix = 0;
 
   private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
 
@@ -169,7 +175,7 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
    */
-  public List<Affix> lookupPrefix(char word[], int offset, int length) {
+  public List<Character> lookupPrefix(char word[], int offset, int length) {
     return prefixes.get(word, offset, length);
   }
 
@@ -181,7 +187,7 @@ public class Dictionary {
    * @param length Length from the offset that the String is
    * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
    */
-  List<Affix> lookupSuffix(char word[], int offset, int length) {
+  List<Character> lookupSuffix(char word[], int offset, int length) {
     return suffixes.get(word, offset, length);
   }
 
@@ -193,8 +199,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
     Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@@ -225,7 +231,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(CharArrayMap<List<Affix>> affixes,
+  private void parseAffix(CharArrayMap<List<Character>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
@@ -237,14 +243,20 @@ public class Dictionary {
     boolean crossProduct = args[2].equals("Y");
     
     int numLines = Integer.parseInt(args[3]);
+    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
+    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
+    
     for (int i = 0; i < numLines; i++) {
+      if (currentAffix > Short.MAX_VALUE) {
+        throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
+      }
+      assert affixWriter.getPosition() == currentAffix << 3;
       String line = reader.readLine();
       String ruleArgs[] = line.split("\\s+");
 
       if (ruleArgs.length < 5) {
           throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
       }
-
       
       char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
       String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
@@ -285,36 +297,42 @@ public class Dictionary {
         patterns.add(pattern);
       }
       
-      Affix affix = new Affix();
       scratch.copyChars(strip);
-      int ord = stripLookup.add(scratch);
-      if (ord < 0) {
+      int stripOrd = stripLookup.add(scratch);
+      if (stripOrd < 0) {
         // already exists in our hash
-        ord = (-ord)-1;
+        stripOrd = (-stripOrd)-1;
       }
-      affix.setStrip(ord);
-      affix.setFlag(flag);
-      affix.setCondition(patternIndex);
-      affix.setCrossProduct(crossProduct);
+
       if (appendFlags == null) {
         appendFlags = NOFLAGS;
       }
       
       final int hashCode = encodeFlagsWithHash(scratch, appendFlags);
-      ord = flagLookup.add(scratch, hashCode);
-      if (ord < 0) {
+      int appendFlagsOrd = flagLookup.add(scratch, hashCode);
+      if (appendFlagsOrd < 0) {
         // already exists in our hash
-        ord = (-ord)-1;
+        appendFlagsOrd = (-appendFlagsOrd)-1;
+      } else if (appendFlagsOrd > Short.MAX_VALUE) {
+        // this limit is probably flexible, but its a good sanity check too
+        throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
       }
-      affix.setAppendFlags(ord);
       
-      List<Affix> list = affixes.get(affixArg);
+      affixWriter.writeShort((short)flag);
+      affixWriter.writeShort((short)stripOrd);
+      // encode crossProduct into patternIndex
+      int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
+      affixWriter.writeShort((short)patternOrd);
+      affixWriter.writeShort((short)appendFlagsOrd);
+      
+      List<Character> list = affixes.get(affixArg);
       if (list == null) {
-        list = new ArrayList<Affix>();
+        list = new ArrayList<Character>();
         affixes.put(affixArg, list);
       }
       
-      list.add(affix);
+      list.add((char)currentAffix);
+      currentAffix++;
     }
   }
 
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
index 54dce381b1a..4eaff6a9e95 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@@ -24,6 +24,7 @@ import java.util.List;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
@@ -37,6 +38,7 @@ final class Stemmer {
   private final Dictionary dictionary;
   private BytesRef scratch = new BytesRef();
   private final StringBuilder segment = new StringBuilder();
+  private final ByteArrayDataInput affixReader;
 
   /**
    * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the 
@@ -56,6 +58,7 @@ final class Stemmer {
    */
   public Stemmer(Dictionary dictionary, int recursionCap) {
     this.dictionary = dictionary;
+    this.affixReader = new ByteArrayDataInput(dictionary.affixData);
     this.recursionCap = recursionCap;
   } 
   
@@ -122,17 +125,20 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     for (int i = 0; i < length; i++) {
-      List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
       if (suffixes == null) {
         continue;
       }
 
-      for (Affix suffix : suffixes) {
-        if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
+      for (Character suffix : suffixes) {
+        affixReader.setPosition(8 * suffix);
+        char flag = (char) (affixReader.readShort() & 0xffff);
+        if (hasCrossCheckedFlag(flag, flags)) {
           int appendLength = length - i;
           int deAffixedLength = length - appendLength;
           // TODO: can we do this in-place?
-          dictionary.stripLookup.get(suffix.getStrip(), scratch);
+          char stripOrd = (char) (affixReader.readShort() & 0xffff);
+          dictionary.stripLookup.get(stripOrd, scratch);
           String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
 
           List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
@@ -143,17 +149,20 @@ final class Stemmer {
     }
 
     for (int i = length - 1; i >= 0; i--) {
-      List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
+      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
       if (prefixes == null) {
         continue;
       }
 
-      for (Affix prefix : prefixes) {
-        if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
+      for (Character prefix : prefixes) {
+        affixReader.setPosition(8 * prefix);
+        char flag = (char) (affixReader.readShort() & 0xffff);
+        if (hasCrossCheckedFlag(flag, flags)) {
           int deAffixedStart = i;
           int deAffixedLength = length - deAffixedStart;
+          char stripOrd = (char) (affixReader.readShort() & 0xffff);
 
-          dictionary.stripLookup.get(prefix.getStrip(), scratch);
+          dictionary.stripLookup.get(stripOrd, scratch);
           String strippedWord = new StringBuilder().append(scratch.utf8ToString())
               .append(word, deAffixedStart, deAffixedLength)
               .toString();
@@ -176,11 +185,19 @@ final class Stemmer {
    * @param recursionDepth Level of recursion this stemming step is at
    * @return List of stems for the word, or an empty list if none are found
    */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
     segment.setLength(0);
     segment.append(strippedWord, 0, length);
     
-    Pattern pattern = dictionary.patterns.get(affix.getCondition());
+    affixReader.setPosition(8 * affix);
+    char flag = (char) (affixReader.readShort() & 0xffff);
+    affixReader.skipBytes(2); // strip
+    int condition = (char) (affixReader.readShort() & 0xffff);
+    boolean crossProduct = (condition & 1) == 1;
+    condition >>>= 1;
+    char append = (char) (affixReader.readShort() & 0xffff);
+
+    Pattern pattern = dictionary.patterns.get(condition);
     if (!pattern.matcher(segment).matches()) {
       return Collections.emptyList();
     }
@@ -188,12 +205,12 @@ final class Stemmer {
     List<CharsRef> stems = new ArrayList<CharsRef>();
 
     char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
-    if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
+    if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) {
       stems.add(new CharsRef(strippedWord, 0, length));
     }
 
-    if (affix.isCrossProduct() && recursionDepth < recursionCap) {
-      dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
+    if (crossProduct && recursionDepth < recursionCap) {
+      dictionary.flagLookup.get(append, scratch);
       char appendFlags[] = Dictionary.decodeFlags(scratch);
       stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
     }