From d179a26647949ed030ad5bb257eb83054e7a0456 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sun, 9 Mar 2014 17:27:49 +0000 Subject: [PATCH] LUCENE-5507: fix hunspell affix loading for certain dictionaries git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1575729 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 ++ .../lucene/analysis/hunspell/Dictionary.java | 54 ++++++++++++++----- .../hunspell/TestAllDictionaries2.java | 14 ++--- .../analysis/hunspell/TestDictionary.java | 34 ++++++++++++ .../hunspell/compressed-before-set.aff | 29 ++++++++++ .../hunspell/compressed-empty-alias.aff | 30 +++++++++++ .../lucene/analysis/hunspell/compressed.aff | 10 ++-- 7 files changed, 148 insertions(+), 26 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f6800af51d7..43ef7cbe458 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -153,6 +153,9 @@ Bug fixes * LUCENE-5505: HunspellStemFilter ignores BOM markers in dictionaries and handles varying types of whitespace in SET/FLAG commands. (Robert Muir) +* LUCENE-5507: Fix HunspellStemFilter loading of dictionaries with large amounts of aliases + etc before the encoding declaration. (Robert Muir) + Test Framework * LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 68a4b457749..0c8ad44cf16 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -35,12 +35,16 @@ import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.Util; import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; +import java.io.OutputStream; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; @@ -155,21 +159,41 @@ public class Dictionary { this.ignoreCase = ignoreCase; this.needsInputCleaning = ignoreCase; this.needsOutputCleaning = false; // set if we have an OCONV - // TODO: we really need to probably buffer this on disk since so many newer dictionaries - // (en_GB, hu_HU, etc) now have tons of AM lines (morph metadata) etc before they finally declare - // their encoding... but for now this large buffer is a workaround - BufferedInputStream buffered = new BufferedInputStream(affix, 65536); - buffered.mark(65536); - String encoding = getDictionaryEncoding(buffered); - buffered.reset(); - CharsetDecoder decoder = getJavaEncoding(encoding); - readAffixFile(buffered, decoder); flagLookup.add(new BytesRef()); // no flags -> ord 0 stripLookup.add(new BytesRef()); // no strip -> ord 0 - IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); - Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); - readDictionaryFiles(dictionaries, decoder, b); - words = b.finish(); + + File aff = File.createTempFile("affix", "aff", tempDir); + OutputStream out = new BufferedOutputStream(new FileOutputStream(aff)); + InputStream aff1 = null; + InputStream aff2 = null; + try { + // copy contents of affix stream to temp file + final byte [] buffer = new byte [1024 * 8]; + int len; + while ((len = affix.read(buffer)) > 0) { + out.write(buffer, 0, len); + } + out.close(); + + // pass 1: get encoding + aff1 = new BufferedInputStream(new FileInputStream(aff)); + String encoding = getDictionaryEncoding(aff1); + + // pass 2: parse affixes + CharsetDecoder decoder = getJavaEncoding(encoding); + aff2 = new BufferedInputStream(new FileInputStream(aff)); + readAffixFile(aff2, decoder); + + // read dictionary entries + IntSequenceOutputs o = IntSequenceOutputs.getSingleton(); + Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); + readDictionaryFiles(dictionaries, decoder, b); + words = b.finish(); + aliases = null; // no longer needed + } finally { + IOUtils.closeWhileHandlingException(out, aff1, aff2); + aff.delete(); + } } /** @@ -772,7 +796,9 @@ public class Dictionary { final int count = Integer.parseInt(ruleArgs[1]); aliases = new String[count]; } else { - aliases[aliasCount++] = ruleArgs[1]; + // an alias can map to no flags + String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1]; + aliases[aliasCount++] = aliasValue; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java index 5e7935a66d8..21d7ad62c36 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries2.java @@ -47,7 +47,7 @@ public class TestAllDictionaries2 extends LuceneTestCase { "afrikaans_spell_checker-20110323-fx+tb+fn+sm.xpi", "dictionaries/af-ZA.dic", "dictionaries/af-ZA.aff", "albanisches_worterbuch-1.6.9-fx+tb+sm+fn.xpi", "dictionaries/sq.dic", "dictionaries/sq.aff", "amharic_spell_checker-0.4-fx+fn+tb+sm.xpi", "dictionaries/am_ET.dic", "dictionaries/am_ET.aff", -//BUG! "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic", "dictionaries/ar.aff", + "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic", "dictionaries/ar.aff", "armenian_spell_checker_dictionary-0.32-fx+tb+sm.xpi", "dictionaries/hy_AM.dic", "dictionaries/hy_AM.aff", "azerbaijani_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/az-Latn-AZ.dic", "dictionaries/az-Latn-AZ.aff", "belarusian_classic_dictionary-0.1.2-tb+fx+sm.xpi", "dictionaries/be-classic.dic", "dictionaries/be-classic.aff", @@ -101,8 +101,8 @@ public class TestAllDictionaries2 extends LuceneTestCase { "hausa_spelling_dictionary-0.2-tb+fx.xpi", "dictionaries/ha-GH.dic", "dictionaries/ha-GH.aff", "hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi", "dictionaries/he.dic", "dictionaries/he.aff", "hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi", "dictionaries/hi_IN.dic", "dictionaries/hi_IN.aff", -//BUG! "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff", -//BUG! "icelandic_dictionary-1.3-fx+tb+sm.xpi", "dictionaries/is.dic", "dictionaries/is.aff", + "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff", +//BUG: has no encoding declaration "icelandic_dictionary-1.3-fx+tb+sm.xpi", "dictionaries/is.dic", "dictionaries/is.aff", "kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi", "dictionaries/id.dic", "dictionaries/id.aff", "kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff", "kashubian_spell_checker_poland-0.9-sm+tb+fx.xpi", "dictionaries/Kaszebsczi.dic", "dictionaries/Kaszebsczi.aff", @@ -146,11 +146,11 @@ public class TestAllDictionaries2 extends LuceneTestCase { "telugu_spell_checker-0.3-tb+fx+sm.xpi", "dictionaries/te_IN.dic", "dictionaries/te_IN.aff", "te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi-x-Tai Tokerau.dic", "dictionaries/mi-x-Tai Tokerau.aff", "te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi.dic", "dictionaries/mi.aff", -//BUG! "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi", "dictionaries/ta_IN.dic", "dictionaries/ta_IN.aff", +//BUG: broken file (hunspell refuses to load, too) "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi", "dictionaries/ta_IN.dic", "dictionaries/ta_IN.aff", "tsonga_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/ts-ZA.dic", "dictionaries/ts-ZA.aff", "tswana_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/tn-ZA.dic", "dictionaries/tn-ZA.aff", "turkce_yazm_denetimi-3.5-sm+tb+fx.xpi", "dictionaries/tr.dic", "dictionaries/tr.aff", -//BUG! "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi", "dictionaries/tk_TM.dic", "dictionaries/tk_TM.aff", + "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi", "dictionaries/tk_TM.dic", "dictionaries/tk_TM.aff", "ukrainian_dictionary-1.7.0-sm+an+fx+fn+tb.xpi", "dictionaries/uk-UA.dic", "dictionaries/uk-UA.aff", "united_states_english_spellchecker-7.0.1-sm+tb+fx+an.xpi", "dictionaries/en-US.dic", "dictionaries/en-US.aff", "upper_sorbian_spelling_dictionary-0.0.20060327.3-tb+fx+sm.xpi", "dictionaries/hsb.dic", "dictionaries/hsb.aff", @@ -196,7 +196,7 @@ public class TestAllDictionaries2 extends LuceneTestCase { } public void testOneDictionary() throws Exception { - String toTest = "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi"; + String toTest = "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi"; for (int i = 0; i < tests.length; i++) { if (tests[i].equals(toTest)) { File f = new File(DICTIONARY_HOME, tests[i]); @@ -210,7 +210,7 @@ public class TestAllDictionaries2 extends LuceneTestCase { try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { - new Dictionary(affix, dictionary); + new Dictionary(affix, dictionary); } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index bf7da6bf7d1..a653ddb36a2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -79,6 +79,40 @@ public class TestDictionary extends LuceneTestCase { affixStream.close(); dictStream.close(); } + + public void testCompressedBeforeSetDictionary() throws Exception { + InputStream affixStream = getClass().getResourceAsStream("compressed-before-set.aff"); + InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); + + Dictionary dictionary = new Dictionary(affixStream, dictStream); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); + IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3); + BytesRef ref = new BytesRef(); + dictionary.flagLookup.get(ordList.ints[0], ref); + char flags[] = Dictionary.decodeFlags(ref); + assertEquals(1, flags.length); + + affixStream.close(); + dictStream.close(); + } + + public void testCompressedEmptyAliasDictionary() throws Exception { + InputStream affixStream = getClass().getResourceAsStream("compressed-empty-alias.aff"); + InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); + + Dictionary dictionary = new Dictionary(affixStream, dictStream); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); + IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3); + BytesRef ref = new BytesRef(); + dictionary.flagLookup.get(ordList.ints[0], ref); + char flags[] = Dictionary.decodeFlags(ref); + assertEquals(1, flags.length); + + affixStream.close(); + dictStream.close(); + } // malformed rule causes ParseException public void testInvalidData() throws Exception { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff new file mode 100644 index 00000000000..e4a1b37300f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-before-set.aff @@ -0,0 +1,29 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +FLAG long + +AF 5 +AF AA +AF BB +AF CC +AF DD +AF EE + +SFX AA Y 3 +SFX AA 0 e n +SFX AA 0 e t +SFX AA 0 e h + +SFX CC Y 2 +SFX CC 0 d/3 c +SFX CC 0 c b + +SFX DD Y 1 +SFX DD 0 s o + +SFX EE Y 1 +SFX EE 0 d o + +PFX BB Y 1 +PFX BB 0 s o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff new file mode 100644 index 00000000000..a27273fc714 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed-empty-alias.aff @@ -0,0 +1,30 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +FLAG long + +AF 6 +AF AA +AF BB +AF CC +AF DD +AF EE +AF + +SFX AA Y 3 +SFX AA 0 e n +SFX AA 0 e t +SFX AA 0 e h + +SFX CC Y 2 +SFX CC 0 d/3 c +SFX CC 0 c b + +SFX DD Y 1 +SFX DD 0 s o + +SFX EE Y 1 +SFX EE 0 d o + +PFX BB Y 1 +PFX BB 0 s o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff index e4a1b37300f..c747c27ef80 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff @@ -1,8 +1,3 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -FLAG long - AF 5 AF AA AF BB @@ -10,6 +5,11 @@ AF CC AF DD AF EE +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +FLAG long + SFX AA Y 3 SFX AA 0 e n SFX AA 0 e t