LUCENE-5468: deduplicate patterns used by affix condition check

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68
2014-02-25 19:18:09 +00:00 · 2014-02-25 19:18:09 +00:00 · 10f548d205
parent 803226ece4
commit 10f548d205
3 changed files with 32 additions and 21 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@ -28,7 +28,6 @@ final class Affix {
  private char appendFlags[]; // continuation class flags
  private String strip;
  
-  private String condition;
  private Pattern conditionPattern;
  
  private char flag;
@ -99,24 +98,13 @@ final class Affix {
    this.strip = strip;
  }

-  /**
-   * Returns the condition that must be met before the affix can be applied
-   *
-   * @return Condition that must be met before the affix can be applied
-   */
-  public String getCondition() {
-    return condition;
-  }
-
  /**
   * Sets the condition that must be met before the affix can be applied
   *
-   * @param condition Condition to be met before affix application
   * @param pattern Condition as a regular expression pattern
   */
-  public void setCondition(String condition, String pattern) {
-    this.condition = condition;
-    this.conditionPattern = Pattern.compile(pattern);
+  public void setCondition(Pattern pattern) {
+    this.conditionPattern = pattern;
  }

  /**
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@ -44,6 +44,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.regex.Pattern;

 /**
 * In-memory structure for the dictionary (.dic) and affix (.aff)
@ -68,6 +69,12 @@ public class Dictionary {
  public CharArrayMap<List<Affix>> prefixes;
  public CharArrayMap<List<Affix>> suffixes;
  
+  // all Patterns used by prefixes and suffixes. these are typically re-used across
+  // many affix stripping rules. so these are deduplicated, to save RAM.
+  // TODO: maybe don't use Pattern for the condition check...
+  // TODO: when we cut over Affix to FST, just store integer index to this.
+  public ArrayList<Pattern> patterns = new ArrayList<>();
+  
  // the entries in the .dic file, mapping to their set of flags.
  // the fst output is the ordinal for flagLookup
  public FST<Long> words;
@ -184,6 +191,7 @@ public class Dictionary {
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
    prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
    suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    Map<String,Integer> seenPatterns = new HashMap<>();

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
@ -191,9 +199,9 @@ public class Dictionary {
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
      } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
@ -210,12 +218,14 @@ public class Dictionary {
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
+   * @param seenPatterns map from the formatted condition regex to its index in {@code patterns}, for deduplication.
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(CharArrayMap<List<Affix>> affixes,
                          String header,
                          LineNumberReader reader,
-                          String conditionPattern) throws IOException, ParseException {
+                          String conditionPattern,
+                          Map<String,Integer> seenPatterns) throws IOException, ParseException {
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
@ -261,7 +271,16 @@ public class Dictionary {
      if (condition.indexOf('-') >= 0) {
        condition = condition.replace("-", "\\-");
      }
-      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
+      // deduplicate patterns
+      String regex = String.format(Locale.ROOT, conditionPattern, condition);
+      Integer patternIndex = seenPatterns.get(regex);
+      if (patternIndex == null) {
+        patternIndex = patterns.size();
+        seenPatterns.put(regex, patternIndex);
+        Pattern pattern = Pattern.compile(regex);
+        patterns.add(pattern);
+      }
+      affix.setCondition(patterns.get(patternIndex));
      affix.setCrossProduct(crossProduct);
      
      List<Affix> list = affixes.get(affix.getAppend());
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@ -33,12 +33,12 @@ import org.junit.Ignore;
 * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
 * Note some of the files differ only in case. This may be a problem on your operating system!
 */
-@Ignore("enable manually")
+//@Ignore("enable manually")
 public class TestAllDictionaries extends LuceneTestCase {
  
  // set this to the location of where you downloaded all the files
  static final File DICTIONARY_HOME = 
-      new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
+      new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
  
  final String tests[] = {
    /* zip file */               /* dictionary */       /* affix */
@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase {
        try (InputStream dictionary = zip.getInputStream(dicEntry);
             InputStream affix = zip.getInputStream(affEntry)) {
          Dictionary dic = new Dictionary(affix, dictionary);
-          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
+          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
+                             "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
+                             "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
+                             "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
+                             "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
        }
      }
    }