From d179a26647949ed030ad5bb257eb83054e7a0456 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Sun, 9 Mar 2014 17:27:49 +0000
Subject: [PATCH] LUCENE-5507: fix hunspell affix loading for certain
 dictionaries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1575729 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                            |  3 ++
 .../lucene/analysis/hunspell/Dictionary.java  | 54 ++++++++++++++-----
 .../hunspell/TestAllDictionaries2.java        | 14 ++---
 .../analysis/hunspell/TestDictionary.java     | 34 ++++++++++++
 .../hunspell/compressed-before-set.aff        | 29 ++++++++++
 .../hunspell/compressed-empty-alias.aff       | 30 +++++++++++
 .../lucene/analysis/hunspell/compressed.aff   | 10 ++--
 7 files changed, 148 insertions(+), 26 deletions(-)
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index f6800af51d7..43ef7cbe458 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -153,6 +153,9 @@ Bug fixes
 * LUCENE-5505: HunspellStemFilter ignores BOM markers in dictionaries and handles varying
   types of whitespace in SET/FLAG commands. (Robert Muir)
 
+* LUCENE-5507: Fix HunspellStemFilter loading of dictionaries with large amounts of aliases
+  etc before the encoding declaration.  (Robert Muir)
+
 Test Framework
 
 * LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index 68a4b457749..0c8ad44cf16 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -35,12 +35,16 @@ import org.apache.lucene.util.fst.Outputs;
 import org.apache.lucene.util.fst.Util;
 
 import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
 import java.io.BufferedReader;
 import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.LineNumberReader;
+import java.io.OutputStream;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
@@ -155,21 +159,41 @@ public class Dictionary {
     this.ignoreCase = ignoreCase;
     this.needsInputCleaning = ignoreCase;
     this.needsOutputCleaning = false; // set if we have an OCONV
-    // TODO: we really need to probably buffer this on disk since so many newer dictionaries
-    // (en_GB, hu_HU, etc) now have tons of AM lines (morph metadata) etc before they finally declare 
-    // their encoding... but for now this large buffer is a workaround
-    BufferedInputStream buffered = new BufferedInputStream(affix, 65536);
-    buffered.mark(65536);
-    String encoding = getDictionaryEncoding(buffered);
-    buffered.reset();
-    CharsetDecoder decoder = getJavaEncoding(encoding);
-    readAffixFile(buffered, decoder);
     flagLookup.add(new BytesRef()); // no flags -> ord 0
     stripLookup.add(new BytesRef()); // no strip -> ord 0
-    IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
-    Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
-    readDictionaryFiles(dictionaries, decoder, b);
-    words = b.finish();
+
+    File aff = File.createTempFile("affix", "aff", tempDir);
+    OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
+    InputStream aff1 = null;
+    InputStream aff2 = null;
+    try {
+      // copy contents of affix stream to temp file
+      final byte [] buffer = new byte [1024 * 8];
+      int len;
+      while ((len = affix.read(buffer)) > 0) {
+        out.write(buffer, 0, len);
+      }
+      out.close();
+      
+      // pass 1: get encoding
+      aff1 = new BufferedInputStream(new FileInputStream(aff));
+      String encoding = getDictionaryEncoding(aff1);
+      
+      // pass 2: parse affixes
+      CharsetDecoder decoder = getJavaEncoding(encoding);
+      aff2 = new BufferedInputStream(new FileInputStream(aff));
+      readAffixFile(aff2, decoder);
+      
+      // read dictionary entries
+      IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
+      Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
+      readDictionaryFiles(dictionaries, decoder, b);
+      words = b.finish();
+      aliases = null; // no longer needed
+    } finally {
+      IOUtils.closeWhileHandlingException(out, aff1, aff2);
+      aff.delete();
+    }
   }
 
   /**
@@ -772,7 +796,9 @@ public class Dictionary {
       final int count = Integer.parseInt(ruleArgs[1]);
       aliases = new String[count];
     } else {
-      aliases[aliasCount++] = ruleArgs[1];
+      // an alias can map to no flags
+      String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1];
+      aliases[aliasCount++] = aliasValue;
     }
   }
   
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
index 5e7935a66d8..21d7ad62c36 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java
@@ -47,7 +47,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
     "afrikaans_spell_checker-20110323-fx+tb+fn+sm.xpi",                               "dictionaries/af-ZA.dic",             "dictionaries/af-ZA.aff",
     "albanisches_worterbuch-1.6.9-fx+tb+sm+fn.xpi",                                   "dictionaries/sq.dic",                "dictionaries/sq.aff",
     "amharic_spell_checker-0.4-fx+fn+tb+sm.xpi",                                      "dictionaries/am_ET.dic",             "dictionaries/am_ET.aff",
-//BUG! "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi",                        "dictionaries/ar.dic",                "dictionaries/ar.aff",
+    "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi",                        "dictionaries/ar.dic",                "dictionaries/ar.aff",
     "armenian_spell_checker_dictionary-0.32-fx+tb+sm.xpi",                            "dictionaries/hy_AM.dic",             "dictionaries/hy_AM.aff",
     "azerbaijani_spell_checker-0.3-fx+tb+fn+sm+sb.xpi",                               "dictionaries/az-Latn-AZ.dic",        "dictionaries/az-Latn-AZ.aff",
     "belarusian_classic_dictionary-0.1.2-tb+fx+sm.xpi",                               "dictionaries/be-classic.dic",        "dictionaries/be-classic.aff",
@@ -101,8 +101,8 @@ public class TestAllDictionaries2 extends LuceneTestCase {
     "hausa_spelling_dictionary-0.2-tb+fx.xpi",                                        "dictionaries/ha-GH.dic",             "dictionaries/ha-GH.aff",
     "hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi",              "dictionaries/he.dic",                "dictionaries/he.aff",
     "hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi",                                     "dictionaries/hi_IN.dic",             "dictionaries/hi_IN.aff",
-//BUG! "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi",                                   "dictionaries/hu.dic",                "dictionaries/hu.aff",
-//BUG! "icelandic_dictionary-1.3-fx+tb+sm.xpi",                                          "dictionaries/is.dic",                "dictionaries/is.aff",
+    "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi",                                   "dictionaries/hu.dic",                "dictionaries/hu.aff",
+//BUG: has no encoding declaration "icelandic_dictionary-1.3-fx+tb+sm.xpi",                                          "dictionaries/is.dic",                "dictionaries/is.aff",
     "kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi",                            "dictionaries/id.dic",                "dictionaries/id.aff",
     "kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi",                                 "dictionaries/kn.dic",                "dictionaries/kn.aff",
     "kashubian_spell_checker_poland-0.9-sm+tb+fx.xpi",                                "dictionaries/Kaszebsczi.dic",        "dictionaries/Kaszebsczi.aff",
@@ -146,11 +146,11 @@ public class TestAllDictionaries2 extends LuceneTestCase {
     "telugu_spell_checker-0.3-tb+fx+sm.xpi",                                          "dictionaries/te_IN.dic",             "dictionaries/te_IN.aff",
     "te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi",                                    "dictionaries/mi-x-Tai Tokerau.dic",  "dictionaries/mi-x-Tai Tokerau.aff",
     "te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi",                                    "dictionaries/mi.dic",                "dictionaries/mi.aff",
-//BUG! "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi",                           "dictionaries/ta_IN.dic",             "dictionaries/ta_IN.aff",
+//BUG: broken file (hunspell refuses to load, too)    "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi",                           "dictionaries/ta_IN.dic",             "dictionaries/ta_IN.aff",
     "tsonga_spell_checker-20110323-tb+sm+fx+fn.xpi",                                  "dictionaries/ts-ZA.dic",             "dictionaries/ts-ZA.aff",
     "tswana_spell_checker-20110323-tb+sm+fx+fn.xpi",                                  "dictionaries/tn-ZA.dic",             "dictionaries/tn-ZA.aff",
     "turkce_yazm_denetimi-3.5-sm+tb+fx.xpi",                                          "dictionaries/tr.dic",                "dictionaries/tr.aff",
-//BUG! "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi",                            "dictionaries/tk_TM.dic",             "dictionaries/tk_TM.aff",
+    "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi",                            "dictionaries/tk_TM.dic",             "dictionaries/tk_TM.aff",
     "ukrainian_dictionary-1.7.0-sm+an+fx+fn+tb.xpi",                                  "dictionaries/uk-UA.dic",             "dictionaries/uk-UA.aff",
     "united_states_english_spellchecker-7.0.1-sm+tb+fx+an.xpi",                       "dictionaries/en-US.dic",             "dictionaries/en-US.aff",
     "upper_sorbian_spelling_dictionary-0.0.20060327.3-tb+fx+sm.xpi",                  "dictionaries/hsb.dic",               "dictionaries/hsb.aff",
@@ -196,7 +196,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
   }
   
   public void testOneDictionary() throws Exception {
-    String toTest = "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi";
+    String toTest = "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi";
     for (int i = 0; i < tests.length; i++) {
       if (tests[i].equals(toTest)) {
         File f = new File(DICTIONARY_HOME, tests[i]);
@@ -210,7 +210,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
         
           try (InputStream dictionary = zip.getInputStream(dicEntry);
                InputStream affix = zip.getInputStream(affEntry)) {
-              new Dictionary(affix, dictionary);
+            new Dictionary(affix, dictionary);
           }
         }
       }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index bf7da6bf7d1..a653ddb36a2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -79,6 +79,40 @@ public class TestDictionary extends LuceneTestCase {
     affixStream.close();
     dictStream.close();
   }
+  
+  public void testCompressedBeforeSetDictionary() throws Exception {
+    InputStream affixStream = getClass().getResourceAsStream("compressed-before-set.aff");
+    InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
+
+    Dictionary dictionary = new Dictionary(affixStream, dictStream);
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
+    IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3);
+    BytesRef ref = new BytesRef();
+    dictionary.flagLookup.get(ordList.ints[0], ref);
+    char flags[] = Dictionary.decodeFlags(ref);
+    assertEquals(1, flags.length);
+    
+    affixStream.close();
+    dictStream.close();
+  }
+  
+  public void testCompressedEmptyAliasDictionary() throws Exception {
+    InputStream affixStream = getClass().getResourceAsStream("compressed-empty-alias.aff");
+    InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
+
+    Dictionary dictionary = new Dictionary(affixStream, dictStream);
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
+    IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3);
+    BytesRef ref = new BytesRef();
+    dictionary.flagLookup.get(ordList.ints[0], ref);
+    char flags[] = Dictionary.decodeFlags(ref);
+    assertEquals(1, flags.length);
+    
+    affixStream.close();
+    dictStream.close();
+  }
 
   // malformed rule causes ParseException
   public void testInvalidData() throws Exception {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff
new file mode 100644
index 00000000000..e4a1b37300f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff
@@ -0,0 +1,29 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+FLAG long
+
+AF 5
+AF AA
+AF BB
+AF CC
+AF DD
+AF EE
+
+SFX AA Y 3
+SFX AA   0     e         n
+SFX AA   0     e         t
+SFX AA   0     e         h
+
+SFX CC Y 2
+SFX CC   0     d/3       c
+SFX CC   0     c         b
+
+SFX DD Y 1
+SFX DD   0     s         o
+
+SFX EE Y 1
+SFX EE   0     d         o
+
+PFX BB Y 1
+PFX BB   0     s         o
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff
new file mode 100644
index 00000000000..a27273fc714
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff
@@ -0,0 +1,30 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+FLAG long
+
+AF 6
+AF AA
+AF BB
+AF CC
+AF DD
+AF EE
+AF  
+
+SFX AA Y 3
+SFX AA   0     e         n
+SFX AA   0     e         t
+SFX AA   0     e         h
+
+SFX CC Y 2
+SFX CC   0     d/3       c
+SFX CC   0     c         b
+
+SFX DD Y 1
+SFX DD   0     s         o
+
+SFX EE Y 1
+SFX EE   0     d         o
+
+PFX BB Y 1
+PFX BB   0     s         o
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
index e4a1b37300f..c747c27ef80 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
@@ -1,8 +1,3 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-FLAG long
-
 AF 5
 AF AA
 AF BB
@@ -10,6 +5,11 @@ AF CC
 AF DD
 AF EE
 
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+FLAG long
+
 SFX AA Y 3
 SFX AA   0     e         n
 SFX AA   0     e         t