mirror of https://github.com/apache/lucene.git
LUCENE-6775: Improved MorfologikFilterFactory to allow loading of custom dictionaries from ResourceLoader
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1702118 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
54f63231ee
commit
aeb41ee7e4
|
@ -144,6 +144,9 @@ Other
|
|||
* LUCENE-6761: MatchAllDocsQuery's Scorers do not expose approximations
|
||||
anymore. (Adrien Grand)
|
||||
|
||||
* LUCENE-6775: Improved MorfologikFilterFactory to allow loading of
|
||||
custom dictionaries from ResourceLoader. (Uwe Schindler)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-6732: Improve checker for invalid source patterns to also
|
||||
|
|
|
@ -17,15 +17,23 @@ package org.apache.lucene.analysis.morfologik;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import morfologik.stemming.Dictionary;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoaderAware;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Filter factory for {@link MorfologikFilter}. For backward compatibility polish
|
||||
* dictionary is used as default. You can change dictionary resource
|
||||
* by dictionary-resource parameter.
|
||||
* by dictionary-resource parameter:
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
|
@ -34,35 +42,86 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* <p>Alternatively, you can pass in the filenames of FSA ({@code ".dict"} and features "{@code ".info"}" file
|
||||
* (if the features file is not given, its name is derived from the FSA file):
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.MorfologikFilterFactory" dictionary-fsa-file="mylang.dict" dictionary-features-file="mylang.info" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
|
||||
*/
|
||||
public class MorfologikFilterFactory extends TokenFilterFactory {
|
||||
public class MorfologikFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
|
||||
/**
|
||||
* The default dictionary resource (for Polish).
|
||||
*/
|
||||
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
|
||||
|
||||
/**
|
||||
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
|
||||
*/
|
||||
private final String dictionaryResource;
|
||||
|
||||
/** Dictionary resource */
|
||||
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
|
||||
|
||||
/** Dictionary FSA file (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
|
||||
public static final String DICTIONARY_FSA_FILE_ATTRIBUTE = "dictionary-fsa-file";
|
||||
|
||||
/** Dictionary features/properties file, loaded from {@link ResourceLoader}. If not given, this
|
||||
* loads the file with same name like {@link #DICTIONARY_FSA_FILE_ATTRIBUTE}, but with
|
||||
* {@code ".info"} suffix.
|
||||
*/
|
||||
public static final String DICTIONARY_FEATURES_FILE_ATTRIBUTE = "dictionary-features-file";
|
||||
|
||||
private final String dictionaryFsaFile, dictionaryFeaturesFile, dictionaryResource;
|
||||
private Dictionary dictionary; // initialized on inform()
|
||||
|
||||
/** Creates a new MorfologikFilterFactory */
|
||||
public MorfologikFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
|
||||
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
|
||||
// first check FSA and features (at least FSA must be given, features name is guessed):
|
||||
dictionaryFsaFile = get(args, DICTIONARY_FSA_FILE_ATTRIBUTE);
|
||||
dictionaryFeaturesFile = get(args, DICTIONARY_FEATURES_FILE_ATTRIBUTE,
|
||||
(dictionaryFsaFile == null) ? null : Dictionary.getExpectedFeaturesName(dictionaryFsaFile));
|
||||
|
||||
if (dictionaryFsaFile == null && dictionaryFeaturesFile == null) {
|
||||
// if we have no FSA/features combination, we resolve the classpath resource:
|
||||
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
|
||||
} else if (dictionaryFsaFile == null || dictionaryFeaturesFile == null) {
|
||||
// if we have incomplete FSA/features tuple in args
|
||||
throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Missing '%s' or '%s' attribute.",
|
||||
DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
|
||||
} else {
|
||||
dictionaryResource = null;
|
||||
if (get(args, DICTIONARY_RESOURCE_ATTRIBUTE) != null) {
|
||||
// fail if both is given: FSA/features files + classpath resource
|
||||
throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Cannot give '%s' and '%s'/'%s' at the same time.",
|
||||
DICTIONARY_RESOURCE_ATTRIBUTE, DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
|
||||
}
|
||||
}
|
||||
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void inform(ResourceLoader loader) throws IOException {
|
||||
if (dictionaryFsaFile != null) {
|
||||
assert dictionaryFeaturesFile != null;
|
||||
assert dictionaryResource == null;
|
||||
try (final InputStream dictIn = loader.openResource(dictionaryFsaFile);
|
||||
final InputStream metaIn = loader.openResource(dictionaryFeaturesFile)) {
|
||||
this.dictionary = Dictionary.readAndClose(dictIn, metaIn);
|
||||
}
|
||||
} else {
|
||||
assert dictionaryResource != null;
|
||||
this.dictionary = MorfologikFilter.loadDictionaryResource(dictionaryResource);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream ts) {
|
||||
return new MorfologikFilter(ts, dictionaryResource);
|
||||
return new MorfologikFilter(ts, Objects.requireNonNull(dictionary, "MorfologikFilterFactory was not fully initialized."));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,17 +20,57 @@ package org.apache.lucene.analysis.morfologik;
|
|||
import java.io.StringReader;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
|
||||
import org.apache.lucene.analysis.util.ResourceLoader;
|
||||
|
||||
/**
|
||||
* Test for {@link MorfologikFilterFactory}.
|
||||
*/
|
||||
public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
|
||||
public void testCreateDictionary() throws Exception {
|
||||
final ResourceLoader loader = new ClasspathResourceLoader(getClass());
|
||||
|
||||
public void testDefaultDictionary() throws Exception {
|
||||
StringReader reader = new StringReader("rowery bilety");
|
||||
MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
|
||||
factory.inform(loader);
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
|
||||
}
|
||||
|
||||
public void testResourceDictionary() throws Exception {
|
||||
StringReader reader = new StringReader("rowery bilety");
|
||||
Map<String,String> params = new HashMap<>();
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
|
||||
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
|
||||
}
|
||||
|
||||
public void testResourceLoaderDictionary1() throws Exception {
|
||||
StringReader reader = new StringReader("rowery bilety");
|
||||
Map<String,String> params = new HashMap<>();
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
|
||||
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
|
||||
}
|
||||
|
||||
public void testResourceLoaderDictionary2() throws Exception {
|
||||
StringReader reader = new StringReader("rowery bilety");
|
||||
Map<String,String> params = new HashMap<>();
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
|
||||
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = whitespaceMockTokenizer(reader);
|
||||
stream = factory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
|
||||
|
@ -39,12 +79,48 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
|
|||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
HashMap<String,String> map = new HashMap<String,String>();
|
||||
map.put("bogusArg", "bogusValue");
|
||||
new MorfologikFilterFactory(map);
|
||||
HashMap<String,String> params = new HashMap<String,String>();
|
||||
params.put("bogusArg", "bogusValue");
|
||||
new MorfologikFilterFactory(params);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testIncompatibleArgs1() throws Exception {
|
||||
try {
|
||||
HashMap<String,String> params = new HashMap<String,String>();
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
|
||||
new MorfologikFilterFactory(params);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("at the same time"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testIncompatibleArgs2() throws Exception {
|
||||
try {
|
||||
HashMap<String,String> params = new HashMap<String,String>();
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
|
||||
new MorfologikFilterFactory(params);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("at the same time"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testMissingArgs1() throws Exception {
|
||||
try {
|
||||
HashMap<String,String> params = new HashMap<String,String>();
|
||||
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
|
||||
new MorfologikFilterFactory(params);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Missing"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue