Analysis: Improve Hunspell error messages

The Hunspell service would throw a confusing error message if more than
one affix file was present.  This commit distinguishes between the two
error cases: when there are no affix files and when there are too many
affix files.

Also implements lazy dictionary loading, which was used in the tests
but not implemented.

Closes #6850
This commit is contained in:
Clinton Gormley 2014-07-14 12:13:32 +02:00
parent 74927adced
commit 6e70edb0a4
8 changed files with 124726 additions and 26 deletions

View File

@ -5,10 +5,10 @@ Basic support for hunspell stemming. Hunspell dictionaries will be
picked up from a dedicated hunspell directory on the filesystem
(defaults to `<path.conf>/hunspell`). Each dictionary is expected to
have its own directory named after its associated locale (language).
This dictionary directory is expected to hold both the `*.aff` and `*.dic`
files (all of which will automatically be picked up). For example,
assuming the default hunspell location is used, the following directory
layout will define the `en_US` dictionary:
This dictionary directory is expected to hold a single `*.aff` and
one or more `*.dic` files (all of which will automatically be picked up).
For example, assuming the default hunspell location is used, the
following directory layout will define the `en_US` dictionary:
[source,js]
--------------------------------------------------
@ -25,7 +25,7 @@ _elasticsearch.yml_.
Each dictionary can be configured with one setting:
`ignore_case`::
`ignore_case`::
If true, dictionary matching will be case insensitive
(defaults to `false`)
@ -67,20 +67,20 @@ settings:
The hunspell token filter accepts four options:
`locale`::
`locale`::
A locale for this filter. If this is unset, the `lang` or
`language` are used instead - so one of these has to be set.
`dictionary`::
`dictionary`::
The name of a dictionary. The path to your hunspell
dictionaries should be configured via
`indices.analysis.hunspell.dictionary.location` before.
`dedup`::
`dedup`::
If only unique terms should be returned, this needs to be
set to `true`. Defaults to `true`.
`longest_only`::
`longest_only`::
If only the longest term should be returned, set this to `true`.
Defaults to `false`: all possible stems are returned.
@ -88,6 +88,16 @@ NOTE: As opposed to the snowball stemmers (which are algorithm based)
this is a dictionary lookup based stemmer and therefore the quality of
the stemming is determined by the quality of the dictionary.
[float]
==== Dictionary loading
By default, the configured (`indices.analysis.hunspell.dictionary.location`)
or default Hunspell directory (`config/hunspell/`) is checked for dictionaries
when the node starts up, and any dictionaries are automatically loaded.
Loading of dictionaries can be deferred until they are actually used by setting
`indices.analysis.hunspell.dictionary.lazy` to `true` in the config file.
[float]
==== References

View File

@ -39,7 +39,7 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
private final boolean longestOnly;
@Inject
public HunspellTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings, HunspellService hunspellService) {
public HunspellTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings, HunspellService hunspellService) {
super(index, indexSettings, name, settings);
String locale = settings.get("locale", settings.get("language", settings.get("lang", null)));
@ -64,9 +64,9 @@ public class HunspellTokenFilterFactory extends AbstractTokenFilterFactory {
/**
 * Returns the value of this factory's {@code dedup} field.
 *
 * @return the {@code dedup} flag
 */
public boolean dedup() {
return dedup;
}
/**
 * Returns the value of this factory's {@code longestOnly} field.
 *
 * @return the {@code longestOnly} flag
 */
public boolean longestOnly() {
return longestOnly;
}
}
}

View File

@ -66,7 +66,9 @@ public class HunspellService extends AbstractComponent {
private final static DictionaryFileFilter DIC_FILE_FILTER = new DictionaryFileFilter();
private final static AffixFileFilter AFFIX_FILE_FILTER = new AffixFileFilter();
public final static String HUNSPELL_LAZY_LOAD = "indices.analysis.hunspell.dictionary.lazy";
public final static String HUNSPELL_IGNORE_CASE = "indices.analysis.hunspell.dictionary.ignore_case";
public final static String HUNSPELL_LOCATION = "indices.analysis.hunspell.dictionary.location";
private final LoadingCache<String, Dictionary> dictionaries;
private final Map<String, Dictionary> knownDictionaries;
@ -82,7 +84,7 @@ public class HunspellService extends AbstractComponent {
super(settings);
this.knownDictionaries = knownDictionaries;
this.hunspellDir = resolveHunspellDirectory(settings, env);
this.defaultIgnoreCase = settings.getAsBoolean("indices.analysis.hunspell.dictionary.ignore_case", false);
this.defaultIgnoreCase = settings.getAsBoolean(HUNSPELL_IGNORE_CASE, false);
dictionaries = CacheBuilder.newBuilder().build(new CacheLoader<String, Dictionary>() {
@Override
public Dictionary load(String locale) throws Exception {
@ -93,7 +95,9 @@ public class HunspellService extends AbstractComponent {
return dictionary;
}
});
scanAndLoadDictionaries();
if (!settings.getAsBoolean(HUNSPELL_LAZY_LOAD, false)) {
scanAndLoadDictionaries();
}
}
/**
@ -101,12 +105,12 @@ public class HunspellService extends AbstractComponent {
*
* @param locale The name of the locale
*/
public Dictionary getDictionary(String locale) {
public Dictionary getDictionary(String locale) {
return dictionaries.getUnchecked(locale);
}
private File resolveHunspellDirectory(Settings settings, Environment env) {
String location = settings.get("indices.analysis.hunspell.dictionary.location", null);
String location = settings.get(HUNSPELL_LOCATION, null);
if (location != null) {
return new File(location);
}
@ -120,7 +124,7 @@ public class HunspellService extends AbstractComponent {
if (hunspellDir.exists() && hunspellDir.isDirectory()) {
for (File file : hunspellDir.listFiles()) {
if (file.isDirectory()) {
if (file.list(AFFIX_FILE_FILTER).length > 0) { // just making sure it's indeed a dictionary dir
if (file.list(DIC_FILE_FILTER).length > 0) { // just making sure it's indeed a dictionary dir
dictionaries.getUnchecked(file.getName());
}
}
@ -153,9 +157,12 @@ public class HunspellService extends AbstractComponent {
boolean ignoreCase = nodeSettings.getAsBoolean("ignore_case", defaultIgnoreCase);
File[] affixFiles = dicDir.listFiles(AFFIX_FILE_FILTER);
if (affixFiles.length != 1) {
if (affixFiles.length == 0) {
throw new ElasticsearchException(String.format(Locale.ROOT, "Missing affix file for hunspell dictionary [%s]", locale));
}
if (affixFiles.length != 1) {
throw new ElasticsearchException(String.format(Locale.ROOT, "Too many affix files exist for hunspell dictionary [%s]", locale));
}
InputStream affixStream = null;
File[] dicFiles = dicDir.listFiles(DIC_FILE_FILTER);

View File

@ -19,16 +19,20 @@
package org.elasticsearch.indices.analyze;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.indices.analysis.HunspellService;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.elasticsearch.test.ElasticsearchIntegrationTest.ClusterScope;
import org.elasticsearch.test.ElasticsearchIntegrationTest.Scope;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.lang.reflect.Field;
import static org.elasticsearch.test.ElasticsearchIntegrationTest.*;
import static org.elasticsearch.indices.analysis.HunspellService.*;
import static org.hamcrest.Matchers.notNullValue;
/**
@ -41,8 +45,8 @@ public class HunspellServiceTests extends ElasticsearchIntegrationTest {
public void testLocaleDirectoryWithNodeLevelConfig() throws Exception {
Settings settings = ImmutableSettings.settingsBuilder()
.put("path.conf", getResource("/indices/analyze/conf_dir"))
.put("indices.analysis.hunspell.dictionary.lazy", true)
.put("indices.analysis.hunspell.dictionary.ignore_case", true)
.put(HUNSPELL_LAZY_LOAD, randomBoolean())
.put(HUNSPELL_IGNORE_CASE, true)
.build();
internalCluster().startNode(settings);
@ -55,8 +59,8 @@ public class HunspellServiceTests extends ElasticsearchIntegrationTest {
public void testLocaleDirectoryWithLocaleSpecificConfig() throws Exception {
Settings settings = ImmutableSettings.settingsBuilder()
.put("path.conf", getResource("/indices/analyze/conf_dir"))
.put("indices.analysis.hunspell.dictionary.lazy", true)
.put("indices.analysis.hunspell.dictionary.ignore_case", true)
.put(HUNSPELL_LAZY_LOAD, randomBoolean())
.put(HUNSPELL_IGNORE_CASE, true)
.put("indices.analysis.hunspell.dictionary.en_US.strict_affix_parsing", false)
.put("indices.analysis.hunspell.dictionary.en_US.ignore_case", false)
.build();
@ -77,14 +81,51 @@ public class HunspellServiceTests extends ElasticsearchIntegrationTest {
@Test
public void testCustomizeLocaleDirectory() throws Exception {
Settings settings = ImmutableSettings.settingsBuilder()
.put("indices.analysis.hunspell.dictionary.location", getResource("/indices/analyze/conf_dir/hunspell"))
.put(HUNSPELL_LOCATION, getResource("/indices/analyze/conf_dir/hunspell"))
.build();
internalCluster().startNode(settings);
Dictionary dictionary = internalCluster().getInstance(HunspellService.class).getDictionary("en_US");
assertThat(dictionary, notNullValue());
}
/**
 * Starts a node whose {@code path.conf} points at the {@code no_aff_conf_dir}
 * test resource (presumably an {@code en_US} dictionary directory without an
 * affix file -- confirm against the resource) and expects loading the
 * {@code en_US} dictionary to fail with a "Missing affix file" error.
 *
 * Lazy loading is randomized, so the error may come from either the node
 * start-up or the explicit {@code getDictionary} call.
 */
@Test
public void testDicWithNoAff() throws Exception {
    Settings settings = ImmutableSettings.settingsBuilder()
            .put("path.conf", getResource("/indices/analyze/no_aff_conf_dir"))
            .put(HUNSPELL_LAZY_LOAD, randomBoolean())
            .build();

    // Record the failure rather than calling fail() inside the try block:
    // a catch-all handler would also catch the AssertionError raised by
    // fail() and report an unrelated assertion message instead.
    Throwable thrown = null;
    try {
        internalCluster().startNode(settings);
        internalCluster().getInstance(HunspellService.class).getDictionary("en_US");
    } catch (Throwable t) {
        thrown = t;
    }

    assertNotNull("Missing affix file didn't throw an error", thrown);
    // Guard against the error not being an ElasticsearchException at all, so
    // the test reports that clearly instead of hitting a NullPointerException.
    Throwable cause = ExceptionsHelper.unwrap(thrown, ElasticsearchException.class);
    assertNotNull("Expected an ElasticsearchException in the cause chain of: " + thrown, cause);
    assertThat(cause.toString(), Matchers.containsString("Missing affix file"));
}
/**
 * Starts a node whose {@code path.conf} points at the {@code two_aff_conf_dir}
 * test resource (presumably an {@code en_US} dictionary directory holding two
 * affix files -- confirm against the resource) and expects loading the
 * {@code en_US} dictionary to fail with a "Too many affix files" error.
 *
 * Lazy loading is randomized, so the error may come from either the node
 * start-up or the explicit {@code getDictionary} call.
 */
@Test
public void testDicWithTwoAffs() throws Exception {
    Settings settings = ImmutableSettings.settingsBuilder()
            .put("path.conf", getResource("/indices/analyze/two_aff_conf_dir"))
            .put(HUNSPELL_LAZY_LOAD, randomBoolean())
            .build();

    // Record the failure rather than calling fail() inside the try block:
    // a catch-all handler would also catch the AssertionError raised by
    // fail() and report an unrelated assertion message instead.
    Throwable thrown = null;
    try {
        internalCluster().startNode(settings);
        internalCluster().getInstance(HunspellService.class).getDictionary("en_US");
    } catch (Throwable t) {
        thrown = t;
    }

    assertNotNull("Multiple affix files didn't throw an error", thrown);
    // Guard against the error not being an ElasticsearchException at all, so
    // the test reports that clearly instead of hitting a NullPointerException.
    Throwable cause = ExceptionsHelper.unwrap(thrown, ElasticsearchException.class);
    assertNotNull("Expected an ElasticsearchException in the cause chain of: " + thrown, cause);
    assertThat(cause.toString(), Matchers.containsString("Too many affix files"));
}
// TODO: open up a getter on Dictionary
private void assertIgnoreCase(boolean expected, Dictionary dictionary) throws Exception {
Field f = Dictionary.class.getDeclaredField("ignoreCase");

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,201 @@
SET ISO8859-1
TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'
NOSUGGEST !
# ordinal numbers
COMPOUNDMIN 1
# only in compounds: 1th, 2th, 3th
ONLYINCOMPOUND c
# compound rules:
# 1. [0-9]*1[0-9]th (10th, 11th, 12th, 56714th, etc.)
# 2. [0-9]*[02-9](1st|2nd|3rd|[4-9]th) (21st, 22nd, 123rd, 1234th, etc.)
COMPOUNDRULE 2
COMPOUNDRULE n*1t
COMPOUNDRULE n*mp
WORDCHARS 0123456789
PFX A Y 1
PFX A 0 re .
PFX I Y 1
PFX I 0 in .
PFX U Y 1
PFX U 0 un .
PFX C Y 1
PFX C 0 de .
PFX E Y 1
PFX E 0 dis .
PFX F Y 1
PFX F 0 con .
PFX K Y 1
PFX K 0 pro .
SFX V N 2
SFX V e ive e
SFX V 0 ive [^e]
SFX N Y 3
SFX N e ion e
SFX N y ication y
SFX N 0 en [^ey]
SFX X Y 3
SFX X e ions e
SFX X y ications y
SFX X 0 ens [^ey]
SFX H N 2
SFX H y ieth y
SFX H 0 th [^y]
SFX Y Y 1
SFX Y 0 ly .
SFX G Y 2
SFX G e ing e
SFX G 0 ing [^e]
SFX J Y 2
SFX J e ings e
SFX J 0 ings [^e]
SFX D Y 4
SFX D 0 d e
SFX D y ied [^aeiou]y
SFX D 0 ed [^ey]
SFX D 0 ed [aeiou]y
SFX T N 4
SFX T 0 st e
SFX T y iest [^aeiou]y
SFX T 0 est [aeiou]y
SFX T 0 est [^ey]
SFX R Y 4
SFX R 0 r e
SFX R y ier [^aeiou]y
SFX R 0 er [aeiou]y
SFX R 0 er [^ey]
SFX Z Y 4
SFX Z 0 rs e
SFX Z y iers [^aeiou]y
SFX Z 0 ers [aeiou]y
SFX Z 0 ers [^ey]
SFX S Y 4
SFX S y ies [^aeiou]y
SFX S 0 s [aeiou]y
SFX S 0 es [sxzh]
SFX S 0 s [^sxzhy]
SFX P Y 3
SFX P y iness [^aeiou]y
SFX P 0 ness [aeiou]y
SFX P 0 ness [^y]
SFX M Y 1
SFX M 0 's .
SFX B Y 3
SFX B 0 able [^aeiou]
SFX B 0 able ee
SFX B e able [^aeiou]e
SFX L Y 1
SFX L 0 ment .
REP 88
REP a ei
REP ei a
REP a ey
REP ey a
REP ai ie
REP ie ai
REP are air
REP are ear
REP are eir
REP air are
REP air ere
REP ere air
REP ere ear
REP ere eir
REP ear are
REP ear air
REP ear ere
REP eir are
REP eir ere
REP ch te
REP te ch
REP ch ti
REP ti ch
REP ch tu
REP tu ch
REP ch s
REP s ch
REP ch k
REP k ch
REP f ph
REP ph f
REP gh f
REP f gh
REP i igh
REP igh i
REP i uy
REP uy i
REP i ee
REP ee i
REP j di
REP di j
REP j gg
REP gg j
REP j ge
REP ge j
REP s ti
REP ti s
REP s ci
REP ci s
REP k cc
REP cc k
REP k qu
REP qu k
REP kw qu
REP o eau
REP eau o
REP o ew
REP ew o
REP oo ew
REP ew oo
REP ew ui
REP ui ew
REP oo ui
REP ui oo
REP ew u
REP u ew
REP oo u
REP u oo
REP u oe
REP oe u
REP u ieu
REP ieu u
REP ue ew
REP ew ue
REP uff ough
REP oo ieu
REP ieu oo
REP ier ear
REP ear ier
REP ear air
REP air ear
REP w qu
REP qu w
REP z ss
REP ss z
REP shun tion
REP shun sion
REP shun cion

View File

@ -0,0 +1,201 @@
SET ISO8859-1
TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'
NOSUGGEST !
# ordinal numbers
COMPOUNDMIN 1
# only in compounds: 1th, 2th, 3th
ONLYINCOMPOUND c
# compound rules:
# 1. [0-9]*1[0-9]th (10th, 11th, 12th, 56714th, etc.)
# 2. [0-9]*[02-9](1st|2nd|3rd|[4-9]th) (21st, 22nd, 123rd, 1234th, etc.)
COMPOUNDRULE 2
COMPOUNDRULE n*1t
COMPOUNDRULE n*mp
WORDCHARS 0123456789
PFX A Y 1
PFX A 0 re .
PFX I Y 1
PFX I 0 in .
PFX U Y 1
PFX U 0 un .
PFX C Y 1
PFX C 0 de .
PFX E Y 1
PFX E 0 dis .
PFX F Y 1
PFX F 0 con .
PFX K Y 1
PFX K 0 pro .
SFX V N 2
SFX V e ive e
SFX V 0 ive [^e]
SFX N Y 3
SFX N e ion e
SFX N y ication y
SFX N 0 en [^ey]
SFX X Y 3
SFX X e ions e
SFX X y ications y
SFX X 0 ens [^ey]
SFX H N 2
SFX H y ieth y
SFX H 0 th [^y]
SFX Y Y 1
SFX Y 0 ly .
SFX G Y 2
SFX G e ing e
SFX G 0 ing [^e]
SFX J Y 2
SFX J e ings e
SFX J 0 ings [^e]
SFX D Y 4
SFX D 0 d e
SFX D y ied [^aeiou]y
SFX D 0 ed [^ey]
SFX D 0 ed [aeiou]y
SFX T N 4
SFX T 0 st e
SFX T y iest [^aeiou]y
SFX T 0 est [aeiou]y
SFX T 0 est [^ey]
SFX R Y 4
SFX R 0 r e
SFX R y ier [^aeiou]y
SFX R 0 er [aeiou]y
SFX R 0 er [^ey]
SFX Z Y 4
SFX Z 0 rs e
SFX Z y iers [^aeiou]y
SFX Z 0 ers [aeiou]y
SFX Z 0 ers [^ey]
SFX S Y 4
SFX S y ies [^aeiou]y
SFX S 0 s [aeiou]y
SFX S 0 es [sxzh]
SFX S 0 s [^sxzhy]
SFX P Y 3
SFX P y iness [^aeiou]y
SFX P 0 ness [aeiou]y
SFX P 0 ness [^y]
SFX M Y 1
SFX M 0 's .
SFX B Y 3
SFX B 0 able [^aeiou]
SFX B 0 able ee
SFX B e able [^aeiou]e
SFX L Y 1
SFX L 0 ment .
REP 88
REP a ei
REP ei a
REP a ey
REP ey a
REP ai ie
REP ie ai
REP are air
REP are ear
REP are eir
REP air are
REP air ere
REP ere air
REP ere ear
REP ere eir
REP ear are
REP ear air
REP ear ere
REP eir are
REP eir ere
REP ch te
REP te ch
REP ch ti
REP ti ch
REP ch tu
REP tu ch
REP ch s
REP s ch
REP ch k
REP k ch
REP f ph
REP ph f
REP gh f
REP f gh
REP i igh
REP igh i
REP i uy
REP uy i
REP i ee
REP ee i
REP j di
REP di j
REP j gg
REP gg j
REP j ge
REP ge j
REP s ti
REP ti s
REP s ci
REP ci s
REP k cc
REP cc k
REP k qu
REP qu k
REP kw qu
REP o eau
REP eau o
REP o ew
REP ew o
REP oo ew
REP ew oo
REP ew ui
REP ui ew
REP oo ui
REP ui oo
REP ew u
REP u ew
REP oo u
REP u oo
REP u oe
REP oe u
REP u ieu
REP ieu u
REP ue ew
REP ew ue
REP uff ough
REP oo ieu
REP ieu oo
REP ier ear
REP ear ier
REP ear air
REP air ear
REP w qu
REP qu w
REP z ss
REP ss z
REP shun tion
REP shun sion
REP shun cion

File diff suppressed because it is too large Load Diff