LUCENE-5468: encode affix data as 8 bytes per affix, before cutting over to FST

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68
2014-02-27 17:19:15 +00:00 · 2014-02-27 17:19:15 +00:00 · cdec14902b
parent 9896e610d3
commit cdec14902b
3 changed files with 71 additions and 149 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@ -1,113 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Wrapper class representing a hunspell affix
- */
-final class Affix {
-
-  private int appendFlags; // continuation class flags
-  private int condition; // check condition
-  private boolean crossProduct;
-  private char flag;
-  private int strip;
-
-  /**
-   * Returns the flags defined for the affix append
-   *
-   * @return Flags defined for the affix append
-   */
-  public int getAppendFlags() {
-    return appendFlags;
-  }
-
-  /**
-   * Sets the flags defined for the affix append
-   *
-   * @param appendFlags Flags defined for the affix append
-   */
-  public void setAppendFlags(int appendFlags) {
-    this.appendFlags = appendFlags;
-  }
-
-  /**
-   * Returns the stripping characters defined for the affix
-   *
-   * @return Stripping characters defined for the affix
-   */
-  public int getStrip() {
-    return strip;
-  }
-
-  /**
-   * Sets the stripping characters defined for the affix
-   *
-   * @param strip Stripping characters defined for the affix
-   */
-  public void setStrip(int strip) {
-    this.strip = strip;
-  }
-
-  /**
-   * Sets the condition that must be met before the affix can be applied
-   */
-  public void setCondition(int condition) {
-    this.condition = condition;
-  }
-  
-  public int getCondition() {
-    return condition;
-  }
-
-  /**
-   * Returns the affix flag
-   *
-   * @return Affix flag
-   */
-  public char getFlag() {
-    return flag;
-  }
-
-  /**
-   * Sets the affix flag
-   *
-   * @param flag Affix flag
-   */
-  public void setFlag(char flag) {
-    this.flag = flag;
-  }
-
-  /**
-   * Returns whether the affix is defined as cross product
-   *
-   * @return {@code true} if the affix is cross product, {@code false} otherwise
-   */
-  public boolean isCrossProduct() {
-    return crossProduct;
-  }
-
-  /**
-   * Sets whether the affix is defined as cross product
-   *
-   * @param crossProduct Whether the affix is defined as cross product
-   */
-  public void setCrossProduct(boolean crossProduct) {
-    this.crossProduct = crossProduct;
-  }
-}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2;
 */

 import org.apache.lucene.analysis.util.CharArrayMap;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.IOUtils;
@ -66,8 +68,8 @@ public class Dictionary {
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

-  public CharArrayMap<List<Affix>> prefixes;
-  public CharArrayMap<List<Affix>> suffixes;
+  public CharArrayMap<List<Character>> prefixes;
+  public CharArrayMap<List<Character>> suffixes;
  
  // all Patterns used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.
@ -85,6 +87,10 @@ public class Dictionary {
  // the list of unique strip affixes.
  public BytesRefHash stripLookup = new BytesRefHash();
  
+  // 8 bytes per affix: four 16-bit fields (flag, strip ord, condition ord << 1 | crossProduct, append-flags ord)
+  public byte[] affixData = new byte[64];
+  private int currentAffix = 0;
+
  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy

  private String[] aliases;
@ -169,7 +175,7 @@ public class Dictionary {
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
   */
-  public List<Affix> lookupPrefix(char word[], int offset, int length) {
+  public List<Character> lookupPrefix(char word[], int offset, int length) {
    return prefixes.get(word, offset, length);
  }

@ -181,7 +187,7 @@ public class Dictionary {
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
   */
-  List<Affix> lookupSuffix(char word[], int offset, int length) {
+  List<Character> lookupSuffix(char word[], int offset, int length) {
    return suffixes.get(word, offset, length);
  }

@ -193,8 +199,8 @@ public class Dictionary {
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
-    suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    prefixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
+    suffixes = new CharArrayMap<List<Character>>(Version.LUCENE_CURRENT, 8, false);
    Map<String,Integer> seenPatterns = new HashMap<>();

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
@ -225,7 +231,7 @@ public class Dictionary {
   * @param seenPatterns map from condition -> index of patterns, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   */
-  private void parseAffix(CharArrayMap<List<Affix>> affixes,
+  private void parseAffix(CharArrayMap<List<Character>> affixes,
                          String header,
                          LineNumberReader reader,
                          String conditionPattern,
@ -237,7 +243,14 @@ public class Dictionary {
    boolean crossProduct = args[2].equals("Y");
    
    int numLines = Integer.parseInt(args[3]);
+    affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
+    ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3);
+    
    for (int i = 0; i < numLines; i++) {
+      if (currentAffix > Short.MAX_VALUE) {
+        throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org");
+      }
+      assert affixWriter.getPosition() == currentAffix << 3;
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

@ -245,7 +258,6 @@ public class Dictionary {
          throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
      }
      
-      
      char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
      String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
      String affixArg = ruleArgs[3];
@ -285,36 +297,42 @@ public class Dictionary {
        patterns.add(pattern);
      }
      
-      Affix affix = new Affix();
      scratch.copyChars(strip);
-      int ord = stripLookup.add(scratch);
-      if (ord < 0) {
+      int stripOrd = stripLookup.add(scratch);
+      if (stripOrd < 0) {
        // already exists in our hash
-        ord = (-ord)-1;
+        stripOrd = (-stripOrd)-1;
      }
-      affix.setStrip(ord);
-      affix.setFlag(flag);
-      affix.setCondition(patternIndex);
-      affix.setCrossProduct(crossProduct);
+
      if (appendFlags == null) {
        appendFlags = NOFLAGS;
      }
      
      final int hashCode = encodeFlagsWithHash(scratch, appendFlags);
-      ord = flagLookup.add(scratch, hashCode);
-      if (ord < 0) {
+      int appendFlagsOrd = flagLookup.add(scratch, hashCode);
+      if (appendFlagsOrd < 0) {
        // already exists in our hash
-        ord = (-ord)-1;
+        appendFlagsOrd = (-appendFlagsOrd)-1;
+      } else if (appendFlagsOrd > Short.MAX_VALUE) {
+        // this limit is probably flexible, but it's a good sanity check too
+        throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org");
      }
-      affix.setAppendFlags(ord);
      
-      List<Affix> list = affixes.get(affixArg);
+      affixWriter.writeShort((short)flag);
+      affixWriter.writeShort((short)stripOrd);
+      // encode crossProduct as the low bit; the pattern index occupies the remaining upper bits
+      int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
+      affixWriter.writeShort((short)patternOrd);
+      affixWriter.writeShort((short)appendFlagsOrd);
+      
+      List<Character> list = affixes.get(affixArg);
      if (list == null) {
-        list = new ArrayList<Affix>();
+        list = new ArrayList<Character>();
        affixes.put(affixArg, list);
      }
      
-      list.add(affix);
+      list.add((char)currentAffix);
+      currentAffix++;
    }
  }

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
@ -24,6 +24,7 @@ import java.util.List;
 import java.util.regex.Pattern;

 import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.Version;
@ -37,6 +38,7 @@ final class Stemmer {
  private final Dictionary dictionary;
  private BytesRef scratch = new BytesRef();
  private final StringBuilder segment = new StringBuilder();
+  private final ByteArrayDataInput affixReader;

  /**
   * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the 
@ -56,6 +58,7 @@ final class Stemmer {
   */
  public Stemmer(Dictionary dictionary, int recursionCap) {
    this.dictionary = dictionary;
+    this.affixReader = new ByteArrayDataInput(dictionary.affixData);
    this.recursionCap = recursionCap;
  } 
  
@ -122,17 +125,20 @@ final class Stemmer {
    List<CharsRef> stems = new ArrayList<CharsRef>();

    for (int i = 0; i < length; i++) {
-      List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
+      List<Character> suffixes = dictionary.lookupSuffix(word, i, length - i);
      if (suffixes == null) {
        continue;
      }

-      for (Affix suffix : suffixes) {
-        if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
+      for (Character suffix : suffixes) {
+        affixReader.setPosition(8 * suffix);
+        char flag = (char) (affixReader.readShort() & 0xffff);
+        if (hasCrossCheckedFlag(flag, flags)) {
          int appendLength = length - i;
          int deAffixedLength = length - appendLength;
          // TODO: can we do this in-place?
-          dictionary.stripLookup.get(suffix.getStrip(), scratch);
+          char stripOrd = (char) (affixReader.readShort() & 0xffff);
+          dictionary.stripLookup.get(stripOrd, scratch);
          String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();

          List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
@ -143,17 +149,20 @@ final class Stemmer {
    }

    for (int i = length - 1; i >= 0; i--) {
-      List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
+      List<Character> prefixes = dictionary.lookupPrefix(word, 0, i);
      if (prefixes == null) {
        continue;
      }

-      for (Affix prefix : prefixes) {
-        if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
+      for (Character prefix : prefixes) {
+        affixReader.setPosition(8 * prefix);
+        char flag = (char) (affixReader.readShort() & 0xffff);
+        if (hasCrossCheckedFlag(flag, flags)) {
          int deAffixedStart = i;
          int deAffixedLength = length - deAffixedStart;
+          char stripOrd = (char) (affixReader.readShort() & 0xffff);

-          dictionary.stripLookup.get(prefix.getStrip(), scratch);
+          dictionary.stripLookup.get(stripOrd, scratch);
          String strippedWord = new StringBuilder().append(scratch.utf8ToString())
              .append(word, deAffixedStart, deAffixedLength)
              .toString();
@ -176,11 +185,19 @@ final class Stemmer {
   * @param recursionDepth Level of recursion this stemming step is at
   * @return List of stems for the word, or an empty list if none are found
   */
-  public List<CharsRef> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
+  public List<CharsRef> applyAffix(char strippedWord[], int length, char affix, int recursionDepth) {
    segment.setLength(0);
    segment.append(strippedWord, 0, length);
    
-    Pattern pattern = dictionary.patterns.get(affix.getCondition());
+    affixReader.setPosition(8 * affix);
+    char flag = (char) (affixReader.readShort() & 0xffff);
+    affixReader.skipBytes(2); // strip
+    int condition = (char) (affixReader.readShort() & 0xffff);
+    boolean crossProduct = (condition & 1) == 1;
+    condition >>>= 1;
+    char append = (char) (affixReader.readShort() & 0xffff);
+
+    Pattern pattern = dictionary.patterns.get(condition);
    if (!pattern.matcher(segment).matches()) {
      return Collections.emptyList();
    }
@ -188,12 +205,12 @@ final class Stemmer {
    List<CharsRef> stems = new ArrayList<CharsRef>();

    char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
-    if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
+    if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) {
      stems.add(new CharsRef(strippedWord, 0, length));
    }

-    if (affix.isCrossProduct() && recursionDepth < recursionCap) {
-      dictionary.flagLookup.get(affix.getAppendFlags(), scratch);
+    if (crossProduct && recursionDepth < recursionCap) {
+      dictionary.flagLookup.get(append, scratch);
      char appendFlags[] = Dictionary.decodeFlags(scratch);
      stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth));
    }