From 10f548d205e9443872c919f7af0ac1b01c735ed3 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Tue, 25 Feb 2014 19:18:09 +0000
Subject: [PATCH] LUCENE-5468: deduplicate patterns used by affix condition
 check

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene/analysis/hunspell2/Affix.java      | 16 ++---------
 .../lucene/analysis/hunspell2/Dictionary.java | 27 ++++++++++++++++---
 .../hunspell2/TestAllDictionaries.java        | 10 ++++---
 3 files changed, 32 insertions(+), 21 deletions(-)
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
index 41c3553fb77..443c006c97d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java
@@ -28,7 +28,6 @@ final class Affix {
   private char appendFlags[]; // continuation class flags
   private String strip;
   
-  private String condition;
   private Pattern conditionPattern;
   
   private char flag;
@@ -99,24 +98,13 @@ final class Affix {
     this.strip = strip;
   }
 
-  /**
-   * Returns the condition that must be met before the affix can be applied
-   *
-   * @return Condition that must be met before the affix can be applied
-   */
-  public String getCondition() {
-    return condition;
-  }
-
   /**
    * Sets the condition that must be met before the affix can be applied
    *
-   * @param condition Condition to be met before affix application
    * @param pattern Condition as a regular expression pattern
    */
-  public void setCondition(String condition, String pattern) {
-    this.condition = condition;
-    this.conditionPattern = Pattern.compile(pattern);
+  public void setCondition(Pattern pattern) {
+    this.conditionPattern = pattern;
   }
 
   /**
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
index 10baa403413..0456d9946d3 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
@@ -44,6 +44,7 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.regex.Pattern;
 
 /**
  * In-memory structure for the dictionary (.dic) and affix (.aff)
@@ -68,6 +69,12 @@ public class Dictionary {
   public CharArrayMap<List<Affix>> prefixes;
   public CharArrayMap<List<Affix>> suffixes;
   
+  // All Patterns used by prefixes and suffixes. These are typically reused across
+  // many affix stripping rules, so they are deduplicated to save RAM.
+  // TODO: maybe don't use Pattern for the condition check...
+  // TODO: when we cut over Affix to FST, just store integer index to this.
+  public ArrayList<Pattern> patterns = new ArrayList<>();
+  
   // the entries in the .dic file, mapping to their set of flags.
   // the fst output is the ordinal for flagLookup
   public FST<Long> words;
@@ -184,6 +191,7 @@ public class Dictionary {
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
     prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
     suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
+    Map<String,Integer> seenPatterns = new HashMap<>();
 
     LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
     String line = null;
@@ -191,9 +199,9 @@ public class Dictionary {
       if (line.startsWith(ALIAS_KEY)) {
         parseAlias(line);
       } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
       } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
       } else if (line.startsWith(FLAG_KEY)) {
         // Assume that the FLAG line comes before any prefix or suffixes
         // Store the strategy so it can be used when parsing the dic file
@@ -210,12 +218,14 @@ public class Dictionary {
    * @param reader BufferedReader to read the content of the rule from
    * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
    *                         pattern
+   * @param seenPatterns map from generated condition regex -> index into {@code patterns}, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
   private void parseAffix(CharArrayMap<List<Affix>> affixes,
                           String header,
                           LineNumberReader reader,
-                          String conditionPattern) throws IOException, ParseException {
+                          String conditionPattern,
+                          Map<String,Integer> seenPatterns) throws IOException, ParseException {
     String args[] = header.split("\\s+");
 
     boolean crossProduct = args[2].equals("Y");
@@ -261,7 +271,16 @@ public class Dictionary {
       if (condition.indexOf('-') >= 0) {
         condition = condition.replace("-", "\\-");
       }
-      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
+      // deduplicate patterns
+      String regex = String.format(Locale.ROOT, conditionPattern, condition);
+      Integer patternIndex = seenPatterns.get(regex);
+      if (patternIndex == null) {
+        patternIndex = patterns.size();
+        seenPatterns.put(regex, patternIndex);
+        Pattern pattern = Pattern.compile(regex);
+        patterns.add(pattern);
+      }
+      affix.setCondition(patterns.get(patternIndex));
       affix.setCrossProduct(crossProduct);
       
       List<Affix> list = affixes.get(affix.getAppend());
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
index ecb21b97a7c..9f9bce98236 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
@@ -33,12 +33,12 @@ import org.junit.Ignore;
  * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
  * Note some of the files differ only in case. This may be a problem on your operating system!
  */
-@Ignore("enable manually")
+//@Ignore("enable manually")
 public class TestAllDictionaries extends LuceneTestCase {
   
   // set this to the location of where you downloaded all the files
   static final File DICTIONARY_HOME = 
-      new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
+      new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
   
   final String tests[] = {
     /* zip file */               /* dictionary */       /* affix */
@@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase {
         try (InputStream dictionary = zip.getInputStream(dicEntry);
              InputStream affix = zip.getInputStream(affEntry)) {
           Dictionary dic = new Dictionary(affix, dictionary);
-          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
+          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
+                             "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
+                             "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
+                             "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
+                             "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
         }
       }
     }