mirror of https://github.com/apache/lucene.git
LUCENE-5507: fix hunspell affix loading for certain dictionaries
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1575729 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8c4068413f
commit
d179a26647
|
@ -153,6 +153,9 @@ Bug fixes
|
||||||
* LUCENE-5505: HunspellStemFilter ignores BOM markers in dictionaries and handles varying
|
* LUCENE-5505: HunspellStemFilter ignores BOM markers in dictionaries and handles varying
|
||||||
types of whitespace in SET/FLAG commands. (Robert Muir)
|
types of whitespace in SET/FLAG commands. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-5507: Fix HunspellStemFilter loading of dictionaries with large amounts of aliases
|
||||||
|
etc before the encoding declaration. (Robert Muir)
|
||||||
|
|
||||||
Test Framework
|
Test Framework
|
||||||
|
|
||||||
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
|
* LUCENE-5449: Rename _TestUtil and _TestHelper to remove the leading _.
|
||||||
|
|
|
@ -35,12 +35,16 @@ import org.apache.lucene.util.fst.Outputs;
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.io.LineNumberReader;
|
import java.io.LineNumberReader;
|
||||||
|
import java.io.OutputStream;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
|
@ -155,21 +159,41 @@ public class Dictionary {
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
this.needsInputCleaning = ignoreCase;
|
this.needsInputCleaning = ignoreCase;
|
||||||
this.needsOutputCleaning = false; // set if we have an OCONV
|
this.needsOutputCleaning = false; // set if we have an OCONV
|
||||||
// TODO: we really need to probably buffer this on disk since so many newer dictionaries
|
|
||||||
// (en_GB, hu_HU, etc) now have tons of AM lines (morph metadata) etc before they finally declare
|
|
||||||
// their encoding... but for now this large buffer is a workaround
|
|
||||||
BufferedInputStream buffered = new BufferedInputStream(affix, 65536);
|
|
||||||
buffered.mark(65536);
|
|
||||||
String encoding = getDictionaryEncoding(buffered);
|
|
||||||
buffered.reset();
|
|
||||||
CharsetDecoder decoder = getJavaEncoding(encoding);
|
|
||||||
readAffixFile(buffered, decoder);
|
|
||||||
flagLookup.add(new BytesRef()); // no flags -> ord 0
|
flagLookup.add(new BytesRef()); // no flags -> ord 0
|
||||||
stripLookup.add(new BytesRef()); // no strip -> ord 0
|
stripLookup.add(new BytesRef()); // no strip -> ord 0
|
||||||
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
|
|
||||||
Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
|
File aff = File.createTempFile("affix", "aff", tempDir);
|
||||||
readDictionaryFiles(dictionaries, decoder, b);
|
OutputStream out = new BufferedOutputStream(new FileOutputStream(aff));
|
||||||
words = b.finish();
|
InputStream aff1 = null;
|
||||||
|
InputStream aff2 = null;
|
||||||
|
try {
|
||||||
|
// copy contents of affix stream to temp file
|
||||||
|
final byte [] buffer = new byte [1024 * 8];
|
||||||
|
int len;
|
||||||
|
while ((len = affix.read(buffer)) > 0) {
|
||||||
|
out.write(buffer, 0, len);
|
||||||
|
}
|
||||||
|
out.close();
|
||||||
|
|
||||||
|
// pass 1: get encoding
|
||||||
|
aff1 = new BufferedInputStream(new FileInputStream(aff));
|
||||||
|
String encoding = getDictionaryEncoding(aff1);
|
||||||
|
|
||||||
|
// pass 2: parse affixes
|
||||||
|
CharsetDecoder decoder = getJavaEncoding(encoding);
|
||||||
|
aff2 = new BufferedInputStream(new FileInputStream(aff));
|
||||||
|
readAffixFile(aff2, decoder);
|
||||||
|
|
||||||
|
// read dictionary entries
|
||||||
|
IntSequenceOutputs o = IntSequenceOutputs.getSingleton();
|
||||||
|
Builder<IntsRef> b = new Builder<IntsRef>(FST.INPUT_TYPE.BYTE4, o);
|
||||||
|
readDictionaryFiles(dictionaries, decoder, b);
|
||||||
|
words = b.finish();
|
||||||
|
aliases = null; // no longer needed
|
||||||
|
} finally {
|
||||||
|
IOUtils.closeWhileHandlingException(out, aff1, aff2);
|
||||||
|
aff.delete();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -772,7 +796,9 @@ public class Dictionary {
|
||||||
final int count = Integer.parseInt(ruleArgs[1]);
|
final int count = Integer.parseInt(ruleArgs[1]);
|
||||||
aliases = new String[count];
|
aliases = new String[count];
|
||||||
} else {
|
} else {
|
||||||
aliases[aliasCount++] = ruleArgs[1];
|
// an alias can map to no flags
|
||||||
|
String aliasValue = ruleArgs.length == 1 ? "" : ruleArgs[1];
|
||||||
|
aliases[aliasCount++] = aliasValue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
||||||
"afrikaans_spell_checker-20110323-fx+tb+fn+sm.xpi", "dictionaries/af-ZA.dic", "dictionaries/af-ZA.aff",
|
"afrikaans_spell_checker-20110323-fx+tb+fn+sm.xpi", "dictionaries/af-ZA.dic", "dictionaries/af-ZA.aff",
|
||||||
"albanisches_worterbuch-1.6.9-fx+tb+sm+fn.xpi", "dictionaries/sq.dic", "dictionaries/sq.aff",
|
"albanisches_worterbuch-1.6.9-fx+tb+sm+fn.xpi", "dictionaries/sq.dic", "dictionaries/sq.aff",
|
||||||
"amharic_spell_checker-0.4-fx+fn+tb+sm.xpi", "dictionaries/am_ET.dic", "dictionaries/am_ET.aff",
|
"amharic_spell_checker-0.4-fx+fn+tb+sm.xpi", "dictionaries/am_ET.dic", "dictionaries/am_ET.aff",
|
||||||
//BUG! "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic", "dictionaries/ar.aff",
|
"arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic", "dictionaries/ar.aff",
|
||||||
"armenian_spell_checker_dictionary-0.32-fx+tb+sm.xpi", "dictionaries/hy_AM.dic", "dictionaries/hy_AM.aff",
|
"armenian_spell_checker_dictionary-0.32-fx+tb+sm.xpi", "dictionaries/hy_AM.dic", "dictionaries/hy_AM.aff",
|
||||||
"azerbaijani_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/az-Latn-AZ.dic", "dictionaries/az-Latn-AZ.aff",
|
"azerbaijani_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/az-Latn-AZ.dic", "dictionaries/az-Latn-AZ.aff",
|
||||||
"belarusian_classic_dictionary-0.1.2-tb+fx+sm.xpi", "dictionaries/be-classic.dic", "dictionaries/be-classic.aff",
|
"belarusian_classic_dictionary-0.1.2-tb+fx+sm.xpi", "dictionaries/be-classic.dic", "dictionaries/be-classic.aff",
|
||||||
|
@ -101,8 +101,8 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
||||||
"hausa_spelling_dictionary-0.2-tb+fx.xpi", "dictionaries/ha-GH.dic", "dictionaries/ha-GH.aff",
|
"hausa_spelling_dictionary-0.2-tb+fx.xpi", "dictionaries/ha-GH.dic", "dictionaries/ha-GH.aff",
|
||||||
"hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi", "dictionaries/he.dic", "dictionaries/he.aff",
|
"hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi", "dictionaries/he.dic", "dictionaries/he.aff",
|
||||||
"hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi", "dictionaries/hi_IN.dic", "dictionaries/hi_IN.aff",
|
"hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi", "dictionaries/hi_IN.dic", "dictionaries/hi_IN.aff",
|
||||||
//BUG! "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff",
|
"hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff",
|
||||||
//BUG! "icelandic_dictionary-1.3-fx+tb+sm.xpi", "dictionaries/is.dic", "dictionaries/is.aff",
|
//BUG: has no encoding declaration "icelandic_dictionary-1.3-fx+tb+sm.xpi", "dictionaries/is.dic", "dictionaries/is.aff",
|
||||||
"kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi", "dictionaries/id.dic", "dictionaries/id.aff",
|
"kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi", "dictionaries/id.dic", "dictionaries/id.aff",
|
||||||
"kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff",
|
"kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff",
|
||||||
"kashubian_spell_checker_poland-0.9-sm+tb+fx.xpi", "dictionaries/Kaszebsczi.dic", "dictionaries/Kaszebsczi.aff",
|
"kashubian_spell_checker_poland-0.9-sm+tb+fx.xpi", "dictionaries/Kaszebsczi.dic", "dictionaries/Kaszebsczi.aff",
|
||||||
|
@ -146,11 +146,11 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
||||||
"telugu_spell_checker-0.3-tb+fx+sm.xpi", "dictionaries/te_IN.dic", "dictionaries/te_IN.aff",
|
"telugu_spell_checker-0.3-tb+fx+sm.xpi", "dictionaries/te_IN.dic", "dictionaries/te_IN.aff",
|
||||||
"te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi-x-Tai Tokerau.dic", "dictionaries/mi-x-Tai Tokerau.aff",
|
"te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi-x-Tai Tokerau.dic", "dictionaries/mi-x-Tai Tokerau.aff",
|
||||||
"te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi.dic", "dictionaries/mi.aff",
|
"te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi.dic", "dictionaries/mi.aff",
|
||||||
//BUG! "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi", "dictionaries/ta_IN.dic", "dictionaries/ta_IN.aff",
|
//BUG: broken file (hunspell refuses to load, too) "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi", "dictionaries/ta_IN.dic", "dictionaries/ta_IN.aff",
|
||||||
"tsonga_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/ts-ZA.dic", "dictionaries/ts-ZA.aff",
|
"tsonga_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/ts-ZA.dic", "dictionaries/ts-ZA.aff",
|
||||||
"tswana_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/tn-ZA.dic", "dictionaries/tn-ZA.aff",
|
"tswana_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/tn-ZA.dic", "dictionaries/tn-ZA.aff",
|
||||||
"turkce_yazm_denetimi-3.5-sm+tb+fx.xpi", "dictionaries/tr.dic", "dictionaries/tr.aff",
|
"turkce_yazm_denetimi-3.5-sm+tb+fx.xpi", "dictionaries/tr.dic", "dictionaries/tr.aff",
|
||||||
//BUG! "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi", "dictionaries/tk_TM.dic", "dictionaries/tk_TM.aff",
|
"turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi", "dictionaries/tk_TM.dic", "dictionaries/tk_TM.aff",
|
||||||
"ukrainian_dictionary-1.7.0-sm+an+fx+fn+tb.xpi", "dictionaries/uk-UA.dic", "dictionaries/uk-UA.aff",
|
"ukrainian_dictionary-1.7.0-sm+an+fx+fn+tb.xpi", "dictionaries/uk-UA.dic", "dictionaries/uk-UA.aff",
|
||||||
"united_states_english_spellchecker-7.0.1-sm+tb+fx+an.xpi", "dictionaries/en-US.dic", "dictionaries/en-US.aff",
|
"united_states_english_spellchecker-7.0.1-sm+tb+fx+an.xpi", "dictionaries/en-US.dic", "dictionaries/en-US.aff",
|
||||||
"upper_sorbian_spelling_dictionary-0.0.20060327.3-tb+fx+sm.xpi", "dictionaries/hsb.dic", "dictionaries/hsb.aff",
|
"upper_sorbian_spelling_dictionary-0.0.20060327.3-tb+fx+sm.xpi", "dictionaries/hsb.dic", "dictionaries/hsb.aff",
|
||||||
|
@ -196,7 +196,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testOneDictionary() throws Exception {
|
public void testOneDictionary() throws Exception {
|
||||||
String toTest = "turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi";
|
String toTest = "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi";
|
||||||
for (int i = 0; i < tests.length; i++) {
|
for (int i = 0; i < tests.length; i++) {
|
||||||
if (tests[i].equals(toTest)) {
|
if (tests[i].equals(toTest)) {
|
||||||
File f = new File(DICTIONARY_HOME, tests[i]);
|
File f = new File(DICTIONARY_HOME, tests[i]);
|
||||||
|
@ -210,7 +210,7 @@ public class TestAllDictionaries2 extends LuceneTestCase {
|
||||||
|
|
||||||
try (InputStream dictionary = zip.getInputStream(dicEntry);
|
try (InputStream dictionary = zip.getInputStream(dicEntry);
|
||||||
InputStream affix = zip.getInputStream(affEntry)) {
|
InputStream affix = zip.getInputStream(affEntry)) {
|
||||||
new Dictionary(affix, dictionary);
|
new Dictionary(affix, dictionary);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -79,6 +79,40 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
affixStream.close();
|
affixStream.close();
|
||||||
dictStream.close();
|
dictStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCompressedBeforeSetDictionary() throws Exception {
|
||||||
|
InputStream affixStream = getClass().getResourceAsStream("compressed-before-set.aff");
|
||||||
|
InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
|
||||||
|
|
||||||
|
Dictionary dictionary = new Dictionary(affixStream, dictStream);
|
||||||
|
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
|
||||||
|
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
|
||||||
|
IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3);
|
||||||
|
BytesRef ref = new BytesRef();
|
||||||
|
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||||
|
char flags[] = Dictionary.decodeFlags(ref);
|
||||||
|
assertEquals(1, flags.length);
|
||||||
|
|
||||||
|
affixStream.close();
|
||||||
|
dictStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testCompressedEmptyAliasDictionary() throws Exception {
|
||||||
|
InputStream affixStream = getClass().getResourceAsStream("compressed-empty-alias.aff");
|
||||||
|
InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
|
||||||
|
|
||||||
|
Dictionary dictionary = new Dictionary(affixStream, dictStream);
|
||||||
|
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length);
|
||||||
|
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length);
|
||||||
|
IntsRef ordList = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3);
|
||||||
|
BytesRef ref = new BytesRef();
|
||||||
|
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||||
|
char flags[] = Dictionary.decodeFlags(ref);
|
||||||
|
assertEquals(1, flags.length);
|
||||||
|
|
||||||
|
affixStream.close();
|
||||||
|
dictStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
// malformed rule causes ParseException
|
// malformed rule causes ParseException
|
||||||
public void testInvalidData() throws Exception {
|
public void testInvalidData() throws Exception {
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
SET UTF-8
|
||||||
|
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
|
||||||
|
FLAG long
|
||||||
|
|
||||||
|
AF 5
|
||||||
|
AF AA
|
||||||
|
AF BB
|
||||||
|
AF CC
|
||||||
|
AF DD
|
||||||
|
AF EE
|
||||||
|
|
||||||
|
SFX AA Y 3
|
||||||
|
SFX AA 0 e n
|
||||||
|
SFX AA 0 e t
|
||||||
|
SFX AA 0 e h
|
||||||
|
|
||||||
|
SFX CC Y 2
|
||||||
|
SFX CC 0 d/3 c
|
||||||
|
SFX CC 0 c b
|
||||||
|
|
||||||
|
SFX DD Y 1
|
||||||
|
SFX DD 0 s o
|
||||||
|
|
||||||
|
SFX EE Y 1
|
||||||
|
SFX EE 0 d o
|
||||||
|
|
||||||
|
PFX BB Y 1
|
||||||
|
PFX BB 0 s o
|
|
@ -0,0 +1,30 @@
|
||||||
|
SET UTF-8
|
||||||
|
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
|
||||||
|
FLAG long
|
||||||
|
|
||||||
|
AF 6
|
||||||
|
AF AA
|
||||||
|
AF BB
|
||||||
|
AF CC
|
||||||
|
AF DD
|
||||||
|
AF EE
|
||||||
|
AF
|
||||||
|
|
||||||
|
SFX AA Y 3
|
||||||
|
SFX AA 0 e n
|
||||||
|
SFX AA 0 e t
|
||||||
|
SFX AA 0 e h
|
||||||
|
|
||||||
|
SFX CC Y 2
|
||||||
|
SFX CC 0 d/3 c
|
||||||
|
SFX CC 0 c b
|
||||||
|
|
||||||
|
SFX DD Y 1
|
||||||
|
SFX DD 0 s o
|
||||||
|
|
||||||
|
SFX EE Y 1
|
||||||
|
SFX EE 0 d o
|
||||||
|
|
||||||
|
PFX BB Y 1
|
||||||
|
PFX BB 0 s o
|
|
@ -1,8 +1,3 @@
|
||||||
SET UTF-8
|
|
||||||
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
|
||||||
|
|
||||||
FLAG long
|
|
||||||
|
|
||||||
AF 5
|
AF 5
|
||||||
AF AA
|
AF AA
|
||||||
AF BB
|
AF BB
|
||||||
|
@ -10,6 +5,11 @@ AF CC
|
||||||
AF DD
|
AF DD
|
||||||
AF EE
|
AF EE
|
||||||
|
|
||||||
|
SET UTF-8
|
||||||
|
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
|
||||||
|
FLAG long
|
||||||
|
|
||||||
SFX AA Y 3
|
SFX AA Y 3
|
||||||
SFX AA 0 e n
|
SFX AA 0 e n
|
||||||
SFX AA 0 e t
|
SFX AA 0 e t
|
||||||
|
|
Loading…
Reference in New Issue