LUCENE-9723: Hunspell: update sanity tests that load all dictionaries (#2290)

This commit is contained in:
Peter Gromov 2021-02-03 10:45:35 +01:00 committed by GitHub
parent d0ae2bd2b9
commit 84aa683b6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 69 additions and 541 deletions

View File

@ -93,6 +93,7 @@ grant {
// Some Hunspell tests may read from external files specified in system properties // Some Hunspell tests may read from external files specified in system properties
permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read"; permission java.io.FilePermission "${hunspell.repo.path}${/}-", "read";
permission java.io.FilePermission "${hunspell.dictionaries}${/}-", "read";
}; };
// Permissions to support ant build // Permissions to support ant build

View File

@ -16,11 +16,14 @@
*/ */
/** /**
* Stemming TokenFilter using a Java implementation of the <a * A Java implementation of <a href="http://hunspell.github.io/">Hunspell</a> stemming and
* href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">Hunspell stemming * spell-checking algorithms, and a stemming TokenFilter based on it.
* algorithm.</a>
* *
* <p>Dictionaries can be found on <a * <p>For dictionaries, see e.g. <a href="https://github.com/LibreOffice/dictionaries">LibreOffice
* href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice's wiki</a> * repository</a> or <a href="https://github.com/wooorm/dictionaries">Titus Wormer's collection
* (UTF)</a>
*
* @see org.apache.lucene.analysis.hunspell.HunspellStemFilter
* @see org.apache.lucene.analysis.hunspell.SpellChecker
*/ */
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;

View File

@ -16,224 +16,87 @@
*/ */
package org.apache.lucene.analysis.hunspell; package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.text.ParseException;
import org.apache.lucene.store.Directory; import java.util.List;
import org.apache.lucene.util.IOUtils; import java.util.stream.Collectors;
import org.apache.lucene.store.BaseDirectoryWrapper;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks; import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.RamUsageTester; import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil; import org.junit.Assume;
import org.junit.Ignore; import org.junit.Ignore;
/** /**
* Can be retrieved via: wget --mirror -np * Loads all dictionaries from the directory specified in {@code -Dhunspell.dictionaries=...} and
* http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ Note some * prints their memory usage. All *.aff files are traversed directly inside the given directory or
* of the files differ only in case. This may be a problem on your operating system! * in its immediate subdirectories. Each *.aff file must have a same-named sibling *.dic file. For
* examples of such directories, refer to the {@link org.apache.lucene.analysis.hunspell package
* documentation}
*/ */
@Ignore("enable manually") @Ignore("enable manually")
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary") @SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries extends LuceneTestCase { public class TestAllDictionaries extends LuceneTestCase {
// set this to the location of where you downloaded all the files private static List<Path> findAllAffixFiles() throws IOException {
static final Path DICTIONARY_HOME = String dicDir = System.getProperty("hunspell.dictionaries");
Paths.get( Assume.assumeFalse("Missing -Dhunspell.dictionaries=...", dicDir == null);
"/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); return Files.walk(Path.of(dicDir), 2)
.filter(f -> f.toString().endsWith(".aff"))
.collect(Collectors.toList());
}
final String tests[] = { private static Dictionary loadDictionary(Path aff) throws IOException, ParseException {
/* zip file */ String affPath = aff.toString();
/* dictionary */ Path dic = Path.of(affPath.substring(0, affPath.length() - 4) + ".dic");
/* affix */ assert Files.exists(dic) : dic;
"af_ZA.zip", "af_ZA.dic", "af_ZA.aff", try (InputStream dictionary = Files.newInputStream(dic);
"ak_GH.zip", "ak_GH.dic", "ak_GH.aff", InputStream affix = Files.newInputStream(aff);
"bg_BG.zip", "bg_BG.dic", "bg_BG.aff", BaseDirectoryWrapper tempDir = newDirectory()) {
"ca_ANY.zip", "catalan.dic", "catalan.aff", return new Dictionary(tempDir, "dictionary", affix, dictionary);
"ca_ES.zip", "ca_ES.dic", "ca_ES.aff",
// BUG: broken flag "cop_EG.zip", "cop_EG.dic", "cop_EG.aff",
"cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff",
"cy_GB.zip", "cy_GB.dic", "cy_GB.aff",
"da_DK.zip", "da_DK.dic", "da_DK.aff",
"de_AT.zip", "de_AT.dic", "de_AT.aff",
"de_CH.zip", "de_CH.dic", "de_CH.aff",
"de_DE.zip", "de_DE.dic", "de_DE.aff",
"de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff",
"de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff",
"de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff",
"el_GR.zip", "el_GR.dic", "el_GR.aff",
"en_AU.zip", "en_AU.dic", "en_AU.aff",
"en_CA.zip", "en_CA.dic", "en_CA.aff",
"en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff",
"en_GB.zip", "en_GB.dic", "en_GB.aff",
"en_NZ.zip", "en_NZ.dic", "en_NZ.aff",
"eo.zip", "eo_l3.dic", "eo_l3.aff",
"eo_EO.zip", "eo_EO.dic", "eo_EO.aff",
"es_AR.zip", "es_AR.dic", "es_AR.aff",
"es_BO.zip", "es_BO.dic", "es_BO.aff",
"es_CL.zip", "es_CL.dic", "es_CL.aff",
"es_CO.zip", "es_CO.dic", "es_CO.aff",
"es_CR.zip", "es_CR.dic", "es_CR.aff",
"es_CU.zip", "es_CU.dic", "es_CU.aff",
"es_DO.zip", "es_DO.dic", "es_DO.aff",
"es_EC.zip", "es_EC.dic", "es_EC.aff",
"es_ES.zip", "es_ES.dic", "es_ES.aff",
"es_GT.zip", "es_GT.dic", "es_GT.aff",
"es_HN.zip", "es_HN.dic", "es_HN.aff",
"es_MX.zip", "es_MX.dic", "es_MX.aff",
"es_NEW.zip", "es_NEW.dic", "es_NEW.aff",
"es_NI.zip", "es_NI.dic", "es_NI.aff",
"es_PA.zip", "es_PA.dic", "es_PA.aff",
"es_PE.zip", "es_PE.dic", "es_PE.aff",
"es_PR.zip", "es_PR.dic", "es_PR.aff",
"es_PY.zip", "es_PY.dic", "es_PY.aff",
"es_SV.zip", "es_SV.dic", "es_SV.aff",
"es_UY.zip", "es_UY.dic", "es_UY.aff",
"es_VE.zip", "es_VE.dic", "es_VE.aff",
"et_EE.zip", "et_EE.dic", "et_EE.aff",
"fo_FO.zip", "fo_FO.dic", "fo_FO.aff",
"fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff",
"fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
"fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff",
"fy_NL.zip", "fy_NL.dic", "fy_NL.aff",
"ga_IE.zip", "ga_IE.dic", "ga_IE.aff",
"gd_GB.zip", "gd_GB.dic", "gd_GB.aff",
"gl_ES.zip", "gl_ES.dic", "gl_ES.aff",
"gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff",
"gu_IN.zip", "gu_IN.dic", "gu_IN.aff",
"he_IL.zip", "he_IL.dic", "he_IL.aff",
"hi_IN.zip", "hi_IN.dic", "hi_IN.aff",
"hil_PH.zip", "hil_PH.dic", "hil_PH.aff",
"hr_HR.zip", "hr_HR.dic", "hr_HR.aff",
"hu_HU.zip", "hu_HU.dic", "hu_HU.aff",
"hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff",
"ia.zip", "ia.dic", "ia.aff",
"id_ID.zip", "id_ID.dic", "id_ID.aff",
"it_IT.zip", "it_IT.dic", "it_IT.aff",
"ku_TR.zip", "ku_TR.dic", "ku_TR.aff",
"la.zip", "la.dic", "la.aff",
"lt_LT.zip", "lt_LT.dic", "lt_LT.aff",
"lv_LV.zip", "lv_LV.dic", "lv_LV.aff",
"mg_MG.zip", "mg_MG.dic", "mg_MG.aff",
"mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff",
"mk_MK.zip", "mk_MK.dic", "mk_MK.aff",
"mos_BF.zip", "mos_BF.dic", "mos_BF.aff",
"mr_IN.zip", "mr_IN.dic", "mr_IN.aff",
"ms_MY.zip", "ms_MY.dic", "ms_MY.aff",
"nb_NO.zip", "nb_NO.dic", "nb_NO.aff",
"ne_NP.zip", "ne_NP.dic", "ne_NP.aff",
"nl_NL.zip", "nl_NL.dic", "nl_NL.aff",
"nl_med.zip", "nl_med.dic", "nl_med.aff",
"nn_NO.zip", "nn_NO.dic", "nn_NO.aff",
"nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff",
"ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff",
"ny_MW.zip", "ny_MW.dic", "ny_MW.aff",
"oc_FR.zip", "oc_FR.dic", "oc_FR.aff",
"pl_PL.zip", "pl_PL.dic", "pl_PL.aff",
"pt_BR.zip", "pt_BR.dic", "pt_BR.aff",
"pt_PT.zip", "pt_PT.dic", "pt_PT.aff",
"ro_RO.zip", "ro_RO.dic", "ro_RO.aff",
"ru_RU.zip", "ru_RU.dic", "ru_RU.aff",
"ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff",
"ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff",
"rw_RW.zip", "rw_RW.dic", "rw_RW.aff",
"sk_SK.zip", "sk_SK.dic", "sk_SK.aff",
"sl_SI.zip", "sl_SI.dic", "sl_SI.aff",
"sq_AL.zip", "sq_AL.dic", "sq_AL.aff",
"ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff",
"st_ZA.zip", "st_ZA.dic", "st_ZA.aff",
"sv_SE.zip", "sv_SE.dic", "sv_SE.aff",
"sw_KE.zip", "sw_KE.dic", "sw_KE.aff",
"tet_ID.zip", "tet_ID.dic", "tet_ID.aff",
"th_TH.zip", "th_TH.dic", "th_TH.aff",
"tl_PH.zip", "tl_PH.dic", "tl_PH.aff",
"tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff",
"ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff",
"uk_UA.zip", "uk_UA.dic", "uk_UA.aff",
"ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff",
"vi_VN.zip", "vi_VN.dic", "vi_VN.aff",
"xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff",
"zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff",
};
public void test() throws Exception {
Path tmp = LuceneTestCase.createTempDir();
for (int i = 0; i < tests.length; i += 3) {
Path f = DICTIONARY_HOME.resolve(tests[i]);
assert Files.exists(f);
IOUtils.rm(tmp);
Files.createDirectory(tmp);
try (InputStream in = Files.newInputStream(f);
Directory tempDir = getDirectory()) {
TestUtil.unzip(in, tmp);
Path dicEntry = tmp.resolve(tests[i + 1]);
Path affEntry = tmp.resolve(tests[i + 2]);
try (InputStream dictionary = Files.newInputStream(dicEntry);
InputStream affix = Files.newInputStream(affEntry)) {
Dictionary dic = new Dictionary(tempDir, "dictionary", affix, dictionary);
System.out.println(
tests[i]
+ "\t"
+ RamUsageTester.humanSizeOf(dic)
+ "\t("
+ "words="
+ RamUsageTester.humanSizeOf(dic.words)
+ ", "
+ "flags="
+ RamUsageTester.humanSizeOf(dic.flagLookup)
+ ", "
+ "strips="
+ RamUsageTester.humanSizeOf(dic.stripData)
+ ", "
+ "conditions="
+ RamUsageTester.humanSizeOf(dic.patterns)
+ ", "
+ "affixData="
+ RamUsageTester.humanSizeOf(dic.affixData)
+ ", "
+ "prefixes="
+ RamUsageTester.humanSizeOf(dic.prefixes)
+ ", "
+ "suffixes="
+ RamUsageTester.humanSizeOf(dic.suffixes)
+ ")");
}
}
} }
} }
public void testOneDictionary() throws Exception { public void testDictionariesLoadSuccessfully() throws Exception {
Path tmp = LuceneTestCase.createTempDir(); int failures = 0;
for (Path aff : findAllAffixFiles()) {
String toTest = "zu_ZA.zip"; try {
for (int i = 0; i < tests.length; i++) { System.out.println(aff + "\t" + memoryUsage(loadDictionary(aff)));
if (tests[i].equals(toTest)) { } catch (Throwable e) {
Path f = DICTIONARY_HOME.resolve(tests[i]); failures++;
assert Files.exists(f); System.err.println("While checking " + aff + ":");
e.printStackTrace();
IOUtils.rm(tmp);
Files.createDirectory(tmp);
try (InputStream in = Files.newInputStream(f)) {
TestUtil.unzip(in, tmp);
Path dicEntry = tmp.resolve(tests[i + 1]);
Path affEntry = tmp.resolve(tests[i + 2]);
try (InputStream dictionary = Files.newInputStream(dicEntry);
InputStream affix = Files.newInputStream(affEntry);
Directory tempDir = getDirectory()) {
new Dictionary(tempDir, "dictionary", affix, dictionary);
}
}
} }
} }
assertEquals(failures + " failures!", 0, failures);
} }
private Directory getDirectory() { private static String memoryUsage(Dictionary dic) {
return newDirectory(); return RamUsageTester.humanSizeOf(dic)
+ "\t("
+ "words="
+ RamUsageTester.humanSizeOf(dic.words)
+ ", "
+ "flags="
+ RamUsageTester.humanSizeOf(dic.flagLookup)
+ ", "
+ "strips="
+ RamUsageTester.humanSizeOf(dic.stripData)
+ ", "
+ "conditions="
+ RamUsageTester.humanSizeOf(dic.patterns)
+ ", "
+ "affixData="
+ RamUsageTester.humanSizeOf(dic.affixData)
+ ", "
+ "prefixes="
+ RamUsageTester.humanSizeOf(dic.prefixes)
+ ", "
+ "suffixes="
+ RamUsageTester.humanSizeOf(dic.suffixes)
+ ")";
} }
} }

View File

@ -1,339 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.hunspell;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.SuppressSysoutChecks;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil;
import org.junit.Ignore;
/**
* Sanity test that loads the Thunderbird dictionaries, which can be downloaded from
* https://addons.mozilla.org/en-US/thunderbird/language-tools/ (there is no bulk download, so
* every file has to be fetched individually).
*/
@Ignore("enable manually")
@SuppressSysoutChecks(bugUrl = "prints important memory utilization stats per dictionary")
public class TestAllDictionaries2 extends LuceneTestCase {
// Set this to the directory into which all the dictionary files were downloaded.
static final Path DICTIONARY_HOME = Paths.get("/data/thunderbirdDicts");
/** Triplets of [zip file, dictionary, affix]. */
final String tests[] = {
"addon-0.4.5-an+fx+tb+fn+sm.xpi", "dictionaries/ru.dic", "dictionaries/ru.aff",
"addon-0.5.5-fx+tb.xpi", "dictionaries/ko-KR.dic", "dictionaries/ko-KR.aff",
"afrikaans_spell_checker-20110323-fx+tb+fn+sm.xpi", "dictionaries/af-ZA.dic",
"dictionaries/af-ZA.aff",
"albanisches_worterbuch-1.6.9-fx+tb+sm+fn.xpi", "dictionaries/sq.dic", "dictionaries/sq.aff",
"amharic_spell_checker-0.4-fx+fn+tb+sm.xpi", "dictionaries/am_ET.dic", "dictionaries/am_ET.aff",
"arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic",
"dictionaries/ar.aff",
"armenian_spell_checker_dictionary-0.32-fx+tb+sm.xpi", "dictionaries/hy_AM.dic",
"dictionaries/hy_AM.aff",
"azerbaijani_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/az-Latn-AZ.dic",
"dictionaries/az-Latn-AZ.aff",
"belarusian_classic_dictionary-0.1.2-tb+fx+sm.xpi", "dictionaries/be-classic.dic",
"dictionaries/be-classic.aff",
"belarusian_dictionary-0.1.2-fx+sm+tb.xpi", "dictionaries/be.dic", "dictionaries/be.aff",
"bengali_bangladesh_dictionary-0.08-sm+tb+fx.xpi", "dictionaries/bn-BD.dic",
"dictionaries/bn-BD.aff",
"brazilian_portuguese_dictionary_former_spelling-28.20140203-tb+sm+fx.xpi",
"dictionaries/pt-BR-antigo.dic", "dictionaries/pt-BR-antigo.aff",
"brazilian_portuguese_dictionary_new_spelling-28.20140203-fx+sm+tb.xpi",
"dictionaries/pt-BR.dic", "dictionaries/pt-BR.aff",
"british_english_dictionary_updated-1.19.5-sm+fx+tb.xpi", "dictionaries/en-GB.dic",
"dictionaries/en-GB.aff",
"bulgarian_dictionary-4.3-fx+tb+sm.xpi", "dictionaries/bg.dic", "dictionaries/bg.aff",
"canadian_english_dictionary-2.0.8-fx+sm+tb.xpi", "dictionaries/en-CA.dic",
"dictionaries/en-CA.aff",
"ceske_slovniky_pro_kontrolu_pravopisu-1.0.4-tb+sm+fx.xpi", "dictionaries/cs.dic",
"dictionaries/cs.aff",
"chichewa_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/ny_MW.dic",
"dictionaries/ny_MW.aff",
"corrector_de_galego-13.10.0-fn+sm+tb+fx.xpi", "dictionaries/gl_ES.dic",
"dictionaries/gl_ES.aff",
// BUG: broken flags "corrector_orthographic_de_interlingua-6.0-fn+sm+tb+fx.xpi",
// "dictionaries/ia-ia.dic", "dictionaries/ia-ia.aff",
"corrector_ortografico_aragones-0.2-fx+tb+sm.xpi", "dictionaries/an_ES.dic",
"dictionaries/an_ES.aff",
"croatian_dictionary_-_hrvatski_rjecnik-1.0.1-firefox+thunderbird+seamonkey.xpi",
"dictionaries/hr.dic", "dictionaries/hr.aff",
"croatian_dictionary_hrvatski_rjecnik-1.0.9-an+fx+fn+tb+sm.xpi", "dictionaries/hr-HR.dic",
"dictionaries/hr-HR.aff",
"dansk_ordbog_til_stavekontrollen-2.2.1-sm+tb+fx.xpi", "dictionaries/da.dic",
"dictionaries/da.aff",
"deutsches_worterbuch_de_de_alte_rechtschreibung-2.1.8-sm.xpi", "dictionaries/de-DE-1901.dic",
"dictionaries/de-DE-1901.aff",
"diccionario_de_espanolespana-1.7-sm+tb+fn+fx.xpi", "dictionaries/es-ES.dic",
"dictionaries/es-ES.aff",
"diccionario_en_espanol_para_venezuela-1.1.17-sm+an+tb+fn+fx.xpi", "dictionaries/es_VE.dic",
"dictionaries/es_VE.aff",
"diccionario_espanol_argentina-2.5.1-tb+fx+sm.xpi", "dictionaries/es_AR.dic",
"dictionaries/es_AR.aff",
"diccionario_espanol_mexico-1.1.3-fn+tb+fx+sm.xpi", "dictionaries/es_MX.dic",
"dictionaries/es_MX.aff",
"diccionario_ortografico_valenciano-2.2.0-fx+tb+fn+sm.xpi", "dictionaries/roa-ES-val.dic",
"dictionaries/roa-ES-val.aff",
"diccionario_papiamentoaruba-0.2-fn+sm+tb+fx.xpi", "dictionaries/Papiamento.dic",
"dictionaries/Papiamento.aff",
"dictionnaires_francais-5.0.2-fx+tb+sm.xpi", "dictionaries/fr-classic-reform.dic",
"dictionaries/fr-classic-reform.aff",
"dictionnaires_francais-5.0.2-fx+tb+sm.xpi", "dictionaries/fr-classic.dic",
"dictionaries/fr-classic.aff",
"dictionnaires_francais-5.0.2-fx+tb+sm.xpi", "dictionaries/fr-modern.dic",
"dictionaries/fr-modern.aff",
"dictionnaires_francais-5.0.2-fx+tb+sm.xpi", "dictionaries/fr-reform.dic",
"dictionaries/fr-reform.aff",
"difazier_an_drouizig-0.12-tb+sm+fx.xpi", "dictionaries/br.dic", "dictionaries/br.aff",
"dikshonario_papiamentuantia_hulandes-0.5-fx+tb+fn+sb+sm.xpi", "dictionaries/Papiamentu.dic",
"dictionaries/Papiamentu.aff",
"dizionari_furlan-3.1-tb+fx+sm.xpi", "dictionaries/fur-IT.dic", "dictionaries/fur-IT.aff",
"dizionario_italiano-3.3.2-fx+sm+tb.xpi", "dictionaries/it_IT.dic", "dictionaries/it_IT.aff",
"eesti_keele_speller-3.2-fx+tb+sm.xpi", "dictionaries/et-EE.dic", "dictionaries/et-EE.aff",
"english_australian_dictionary-2.1.2-tb+fx+sm.xpi", "dictionaries/en-AU.dic",
"dictionaries/en-AU.aff",
"esperanta_vortaro-1.0.2-fx+tb+sm.xpi", "dictionaries/eo-EO.dic", "dictionaries/eo-EO.aff",
"european_portuguese_spellchecker-14.1.1.1-tb+fx.xpi", "dictionaries/pt-PT.dic",
"dictionaries/pt-PT.aff",
"faroese_spell_checker_faroe_islands-2.0-tb+sm+fx+fn.xpi", "dictionaries/fo_FO.dic",
"dictionaries/fo_FO.aff",
"frysk_wurdboek-2.1.1-fn+sm+fx+an+tb.xpi", "dictionaries/fy.dic", "dictionaries/fy.aff",
"geiriadur_cymraeg-1.08-tb+sm+fx.xpi", "dictionaries/cy_GB.dic", "dictionaries/cy_GB.aff",
"general_catalan_dictionary-2.5.0-tb+sm+fn+fx.xpi", "dictionaries/ca.dic",
"dictionaries/ca.aff",
"german_dictionary-2.0.3-fn+fx+sm+tb.xpi", "dictionaries/de-DE.dic", "dictionaries/de-DE.aff",
"german_dictionary_de_at_new_orthography-20130905-tb+fn+an+fx+sm.xpi", "dictionaries/de-AT.dic",
"dictionaries/de-AT.aff",
"german_dictionary_de_ch_new_orthography-20130905-fx+tb+fn+sm+an.xpi", "dictionaries/de-CH.dic",
"dictionaries/de-CH.aff",
"german_dictionary_de_de_new_orthography-20130905-tb+sm+an+fn+fx.xpi", "dictionaries/de-DE.dic",
"dictionaries/de-DE.aff",
"german_dictionary_extended_for_austria-2.0.3-fx+fn+sm+tb.xpi", "dictionaries/de-AT.dic",
"dictionaries/de-AT.aff",
"german_dictionary_switzerland-2.0.3-sm+fx+tb+fn.xpi", "dictionaries/de-CH.dic",
"dictionaries/de-CH.aff",
"greek_spelling_dictionary-0.8.5-fx+tb+sm.xpi", "dictionaries/el-GR.dic",
"dictionaries/el-GR.aff",
"gujarati_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/gu_IN.dic",
"dictionaries/gu_IN.aff",
"haitian_creole_spell_checker-0.08-tb+sm+fx.xpi", "dictionaries/ht-HT.dic",
"dictionaries/ht-HT.aff",
"hausa_spelling_dictionary-0.2-tb+fx.xpi", "dictionaries/ha-GH.dic", "dictionaries/ha-GH.aff",
"hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi", "dictionaries/he.dic",
"dictionaries/he.aff",
"hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi", "dictionaries/hi_IN.dic",
"dictionaries/hi_IN.aff",
"hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff",
// BUG: has no encoding declaration "icelandic_dictionary-1.3-fx+tb+sm.xpi",
// "dictionaries/is.dic", "dictionaries/is.aff",
"kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi", "dictionaries/id.dic",
"dictionaries/id.aff",
"kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff",
"kashubian_spell_checker_poland-0.9-sm+tb+fx.xpi", "dictionaries/Kaszebsczi.dic",
"dictionaries/Kaszebsczi.aff",
"kiswahili_spell_checker-0.3-sb+tb+fn+fx+sm.xpi", "dictionaries/sw_TZ.dic",
"dictionaries/sw_TZ.aff",
"kurdish_spell_checker-0.96-fx+tb+sm.xpi", "dictionaries/ku-TR.dic", "dictionaries/ku-TR.aff",
"lao_spellchecking_dictionary-0-fx+tb+sm+fn+an.xpi", "dictionaries/lo_LA.dic",
"dictionaries/lo_LA.aff",
"latviesu_valodas_pareizrakstibas_parbaudes_vardnica-1.0.0-fn+fx+tb+sm.xpi",
"dictionaries/lv_LV.dic", "dictionaries/lv_LV.aff",
"lithuanian_spelling_check_dictionary-1.3-fx+tb+sm+fn.xpi", "dictionaries/lt.dic",
"dictionaries/lt.aff",
"litreoir_gaelspell_do_mhozilla-4.7-tb+fx+sm+fn.xpi", "dictionaries/ga.dic",
"dictionaries/ga.aff",
"litreoir_na_liongailise-0.03-fx+sm+tb.xpi", "dictionaries/ln-CD.dic", "dictionaries/ln-CD.aff",
"macedonian_mk_mk_spellchecker-1.2-fn+tb+fx+sm+sb.xpi", "dictionaries/mk-MK-Cyrl.dic",
"dictionaries/mk-MK-Cyrl.aff",
"macedonian_mk_mk_spellchecker-1.2-fn+tb+fx+sm+sb.xpi", "dictionaries/mk-MK-Latn.dic",
"dictionaries/mk-MK-Latn.aff",
"malagasy_spell_checker-0.3-fn+tb+fx+sm+sb.xpi", "dictionaries/mg_MG.dic",
"dictionaries/mg_MG.aff",
"marathi_dictionary-9.3-sm+tb+sb+fx.xpi", "dictionaries/mr-IN.dic", "dictionaries/mr-IN.aff",
"ndebele_south_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/nr-ZA.dic",
"dictionaries/nr-ZA.aff",
"nepali_dictionary-1.2-fx+tb.xpi", "dictionaries/ne_NP.dic", "dictionaries/ne_NP.aff",
"norsk_bokmal_ordliste-2.0.10.2-fx+tb+sm.xpi", "dictionaries/nb.dic", "dictionaries/nb.aff",
"norsk_nynorsk_ordliste-2.1.0-sm+fx+tb.xpi", "dictionaries/nn.dic", "dictionaries/nn.aff",
"northern_sotho_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/nso-ZA.dic",
"dictionaries/nso-ZA.aff",
"oriya_spell_checker-0.3-fn+tb+fx+sm+sb.xpi", "dictionaries/or-IN.dic",
"dictionaries/or-IN.aff",
"polski_slownik_poprawnej_pisowni-1.0.20110621-fx+tb+sm.xpi", "dictionaries/pl.dic",
"dictionaries/pl.aff",
"punjabi_spell_checker-0.3-fx+tb+sm+sb+fn.xpi", "dictionaries/pa-IN.dic",
"dictionaries/pa-IN.aff",
"romanian_spellchecking_dictionary-1.14-sm+tb+fx.xpi", "dictionaries/ro_RO-ante1993.dic",
"dictionaries/ro_RO-ante1993.aff",
"russian_hunspell_dictionary-1.0.20131101-tb+sm+fn+fx.xpi", "dictionaries/ru_RU.dic",
"dictionaries/ru_RU.aff",
"sanskrit_spell_checker-1.1-fx+tb+sm+sb+fn.xpi", "dictionaries/sa_IN.dic",
"dictionaries/sa_IN.aff",
"scottish_gaelic_spell_checker-2.7-tb+fx+sm.xpi", "dictionaries/gd-GB.dic",
"dictionaries/gd-GB.aff",
"serbian_dictionary-0.18-fx+tb+sm.xpi", "dictionaries/sr-RS-Cyrl.dic",
"dictionaries/sr-RS-Cyrl.aff",
"serbian_dictionary-0.18-fx+tb+sm.xpi", "dictionaries/sr-RS-Latn.dic",
"dictionaries/sr-RS-Latn.aff",
"slovak_spell_checking_dictionary-2.04.0-tb+fx+sm.xpi", "dictionaries/sk-SK.dic",
"dictionaries/sk-SK.aff",
"slovak_spell_checking_dictionary-2.04.0-tb+fx+sm.xpi", "dictionaries/sk-SK-ascii.dic",
"dictionaries/sk-SK-ascii.aff",
"slovar_za_slovenski_jezik-0.1.1.1-fx+tb+sm.xpi", "dictionaries/sl.dic", "dictionaries/sl.aff",
"songhay_spell_checker-0.03-fx+tb+sm.xpi", "dictionaries/Songhay - Mali.dic",
"dictionaries/Songhay - Mali.aff",
"southern_sotho_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/st-ZA.dic",
"dictionaries/st-ZA.aff",
"sownik_acinski-0.41.20110603-tb+fx+sm.xpi", "dictionaries/la.dic", "dictionaries/la.aff",
"sownik_jezyka_dolnouzyckiego-1.4.8-an+fx+tb+fn+sm.xpi", "dictionaries/dsb.dic",
"dictionaries/dsb.aff",
"srpska_latinica-0.1-fx+tb+sm.xpi", "dictionaries/Srpski_latinica.dic",
"dictionaries/Srpski_latinica.aff",
"svenska_fria_ordlistan-1.1-tb+sm+fx.xpi", "dictionaries/sv.dic", "dictionaries/sv.aff",
"svenska_fria_ordlistan-1.1-tb+sm+fx.xpi", "dictionaries/sv_FI.dic", "dictionaries/sv_FI.aff",
"swati_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/ss-ZA.dic",
"dictionaries/ss-ZA.aff",
"tamil_spell_checker_for_firefox-0.4-tb+fx.xpi", "dictionaries/ta-TA.dic",
"dictionaries/ta-TA.aff",
"telugu_spell_checker-0.3-tb+fx+sm.xpi", "dictionaries/te_IN.dic", "dictionaries/te_IN.aff",
"te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi-x-Tai Tokerau.dic",
"dictionaries/mi-x-Tai Tokerau.aff",
"te_papakupu_m__ori-0.9.9.20080630-fx+tb.xpi", "dictionaries/mi.dic", "dictionaries/mi.aff",
// BUG: broken file (hunspell refuses to load, too)
// "thamizha_solthiruthitamil_spellchecker-0.8-fx+tb.xpi",
// "dictionaries/ta_IN.dic", "dictionaries/ta_IN.aff",
"tsonga_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/ts-ZA.dic",
"dictionaries/ts-ZA.aff",
"tswana_spell_checker-20110323-tb+sm+fx+fn.xpi", "dictionaries/tn-ZA.dic",
"dictionaries/tn-ZA.aff",
// BUG: missing FLAG declaration "turkce_yazm_denetimi-3.5-sm+tb+fx.xpi",
// "dictionaries/tr.dic", "dictionaries/tr.aff",
"turkmen_spell_checker_dictionary-0.1.6-tb+fx+sm.xpi", "dictionaries/tk_TM.dic",
"dictionaries/tk_TM.aff",
"ukrainian_dictionary-1.7.0-sm+an+fx+fn+tb.xpi", "dictionaries/uk-UA.dic",
"dictionaries/uk-UA.aff",
"united_states_english_spellchecker-7.0.1-sm+tb+fx+an.xpi", "dictionaries/en-US.dic",
"dictionaries/en-US.aff",
"upper_sorbian_spelling_dictionary-0.0.20060327.3-tb+fx+sm.xpi", "dictionaries/hsb.dic",
"dictionaries/hsb.aff",
"urdu_dictionary-0.64-fx+tb+sm+sb.xpi", "dictionaries/ur.dic", "dictionaries/ur.aff",
"uzbek_spell_checker-0.3-fn+tb+fx+sm+sb.xpi", "dictionaries/uz.dic", "dictionaries/uz.aff",
"valencian_catalan_dictionary-2.5.0-tb+fn+sm+fx.xpi", "dictionaries/ca-ES-valencia.dic",
"dictionaries/ca-ES-valencia.aff",
"venda_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/ve-ZA.dic",
"dictionaries/ve-ZA.aff",
"verificador_ortografico_para_portugues_do_brasil-2.3-3.2b1-tb+sm+fn+fx.xpi",
"dictionaries/pt_BR.dic", "dictionaries/pt_BR.aff",
"vietnamese_dictionary-2.1.0.159-an+sm+tb+fx+fn.xpi", "dictionaries/vi-DauCu.dic",
"dictionaries/vi-DauCu.aff",
"vietnamese_dictionary-2.1.0.159-an+sm+tb+fx+fn.xpi", "dictionaries/vi-DauMoi.dic",
"dictionaries/vi-DauMoi.aff",
"woordenboek_nederlands-3.1.1-sm+tb+fx+fn.xpi", "dictionaries/nl.dic", "dictionaries/nl.aff",
"xhosa_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/xh-ZA.dic",
"dictionaries/xh-ZA.aff",
"xuxen-4.0.1-fx+tb+sm.xpi", "dictionaries/eu.dic", "dictionaries/eu.aff",
"yiddish_spell_checker_yivo-0.0.3-sm+fn+fx+tb.xpi", "dictionaries/yi.dic",
"dictionaries/yi.aff",
"zulu_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/zu-ZA.dic",
"dictionaries/zu-ZA.aff"
};
/**
 * Walks the {@code tests} triplets one by one: unpacks the archive into a scratch directory,
 * builds a {@link Dictionary} from the extracted dictionary/affix pair and prints the archive
 * name, the total memory footprint and a per-structure breakdown to standard output.
 *
 * @throws Exception if an archive cannot be read or unpacked, or a dictionary fails to load
 */
public void test() throws Exception {
  Path scratch = LuceneTestCase.createTempDir();
  for (int entry = 0; entry < tests.length; entry += 3) {
    Path archive = DICTIONARY_HOME.resolve(tests[entry]);
    assert Files.exists(archive);

    // Every archive is unpacked into a freshly re-created scratch directory.
    IOUtils.rm(scratch);
    Files.createDirectory(scratch);

    try (InputStream zipped = Files.newInputStream(archive)) {
      TestUtil.unzip(zipped, scratch);
      Path dicFile = scratch.resolve(tests[entry + 1]);
      Path affFile = scratch.resolve(tests[entry + 2]);

      try (InputStream dicStream = Files.newInputStream(dicFile);
          InputStream affStream = Files.newInputStream(affFile);
          Directory workDir = newDirectory()) {
        Dictionary loaded = new Dictionary(workDir, "dictionary", affStream, dicStream);

        // Measure in the same order as the fields appear in the printed line.
        String total = RamUsageTester.humanSizeOf(loaded);
        String breakdown =
            String.join(
                ", ",
                "words=" + RamUsageTester.humanSizeOf(loaded.words),
                "flags=" + RamUsageTester.humanSizeOf(loaded.flagLookup),
                "strips=" + RamUsageTester.humanSizeOf(loaded.stripData),
                "conditions=" + RamUsageTester.humanSizeOf(loaded.patterns),
                "affixData=" + RamUsageTester.humanSizeOf(loaded.affixData),
                "prefixes=" + RamUsageTester.humanSizeOf(loaded.prefixes),
                "suffixes=" + RamUsageTester.humanSizeOf(loaded.suffixes));
        System.out.println(tests[entry] + "\t" + total + "\t(" + breakdown + ")");
      }
    }
  }
}
/**
 * Loads only the dictionary whose archive name equals the hard-coded {@code toTest} value; handy
 * for reproducing a problem with a single dictionary. Nothing is printed: the test passes as long
 * as constructing the {@link Dictionary} does not throw.
 *
 * @throws Exception if the archive cannot be read or unpacked, or the dictionary fails to load
 */
public void testOneDictionary() throws Exception {
  Path tmp = LuceneTestCase.createTempDir();
  String toTest = "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi";
  // `tests` holds [zip file, dictionary, affix] triplets, so archive names live only at every
  // third index. Stepping by 3 (as test() does) avoids comparing .dic/.aff entries against the
  // archive name and keeps tests[i + 1] / tests[i + 2] within the matched triplet.
  for (int i = 0; i < tests.length; i += 3) {
    if (tests[i].equals(toTest)) {
      Path f = DICTIONARY_HOME.resolve(tests[i]);
      // Include the path in the failure so a missing download is easy to identify.
      assert Files.exists(f) : f;
      // Start from an empty directory before unpacking the archive.
      IOUtils.rm(tmp);
      Files.createDirectory(tmp);
      try (InputStream in = Files.newInputStream(f)) {
        TestUtil.unzip(in, tmp);
        Path dicEntry = tmp.resolve(tests[i + 1]);
        Path affEntry = tmp.resolve(tests[i + 2]);
        try (InputStream dictionary = Files.newInputStream(dicEntry);
            InputStream affix = Files.newInputStream(affEntry);
            Directory tempDir = newDirectory()) {
          new Dictionary(tempDir, "dictionary", affix, dictionary);
        }
      }
    }
  }
}
}