LUCENE-6775: Improved MorfologikFilterFactory to allow loading of custom dictionaries from ResourceLoader

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1702118 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Uwe Schindler 2015-09-09 21:48:38 +00:00
parent 54f63231ee
commit aeb41ee7e4
3 changed files with 151 additions and 13 deletions

View File

@ -144,6 +144,9 @@ Other
* LUCENE-6761: MatchAllDocsQuery's Scorers do not expose approximations
anymore. (Adrien Grand)
* LUCENE-6775: Improved MorfologikFilterFactory to allow loading of
custom dictionaries from ResourceLoader. (Uwe Schindler)
Build
* LUCENE-6732: Improve checker for invalid source patterns to also

View File

@ -17,15 +17,23 @@ package org.apache.lucene.analysis.morfologik;
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import morfologik.stemming.Dictionary;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Filter factory for {@link MorfologikFilter}. For backward compatibility polish
* dictionary is used as default. You can change dictionary resource
* by dictionary-resource parameter.
* by dictionary-resource parameter:
* <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
@ -34,35 +42,86 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* <p>Alternatively, you can pass in the filenames of FSA ({@code ".dict"} and features "{@code ".info"}" file
* (if the features file is not given, its name is derived from the FSA file):
* <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.MorfologikFilterFactory" dictionary-fsa-file="mylang.dict" dictionary-features-file="mylang.info" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory {
public class MorfologikFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
/**
* The default dictionary resource (for Polish).
*/
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
/**
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
*/
private final String dictionaryResource;
/** Dictionary resource */
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
/** Dictionary FSA file (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
public static final String DICTIONARY_FSA_FILE_ATTRIBUTE = "dictionary-fsa-file";
/** Dictionary features/properties file, loaded from {@link ResourceLoader}. If not given, this
* loads the file with same name like {@link #DICTIONARY_FSA_FILE_ATTRIBUTE}, but with
* {@code ".info"} suffix.
*/
public static final String DICTIONARY_FEATURES_FILE_ATTRIBUTE = "dictionary-features-file";
private final String dictionaryFsaFile, dictionaryFeaturesFile, dictionaryResource;
private Dictionary dictionary; // initialized on inform()
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
// first check FSA and features (at least FSA must be given, features name is guessed):
dictionaryFsaFile = get(args, DICTIONARY_FSA_FILE_ATTRIBUTE);
dictionaryFeaturesFile = get(args, DICTIONARY_FEATURES_FILE_ATTRIBUTE,
(dictionaryFsaFile == null) ? null : Dictionary.getExpectedFeaturesName(dictionaryFsaFile));
if (dictionaryFsaFile == null && dictionaryFeaturesFile == null) {
// if we have no FSA/features combination, we resolve the classpath resource:
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
} else if (dictionaryFsaFile == null || dictionaryFeaturesFile == null) {
// if we have incomplete FSA/features tuple in args
throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Missing '%s' or '%s' attribute.",
DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
} else {
dictionaryResource = null;
if (get(args, DICTIONARY_RESOURCE_ATTRIBUTE) != null) {
// fail if both is given: FSA/features files + classpath resource
throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Cannot give '%s' and '%s'/'%s' at the same time.",
DICTIONARY_RESOURCE_ATTRIBUTE, DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
}
}
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public void inform(ResourceLoader loader) throws IOException {
if (dictionaryFsaFile != null) {
assert dictionaryFeaturesFile != null;
assert dictionaryResource == null;
try (final InputStream dictIn = loader.openResource(dictionaryFsaFile);
final InputStream metaIn = loader.openResource(dictionaryFeaturesFile)) {
this.dictionary = Dictionary.readAndClose(dictIn, metaIn);
}
} else {
assert dictionaryResource != null;
this.dictionary = MorfologikFilter.loadDictionaryResource(dictionaryResource);
}
}
@Override
public TokenStream create(TokenStream ts) {
return new MorfologikFilter(ts, dictionaryResource);
return new MorfologikFilter(ts, Objects.requireNonNull(dictionary, "MorfologikFilterFactory was not fully initialized."));
}
}

View File

@ -20,17 +20,57 @@ package org.apache.lucene.analysis.morfologik;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoader;
/**
* Test for {@link MorfologikFilterFactory}.
*/
public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
public void testCreateDictionary() throws Exception {
final ResourceLoader loader = new ClasspathResourceLoader(getClass());
public void testDefaultDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
}
public void testResourceDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
}
public void testResourceLoaderDictionary1() throws Exception {
StringReader reader = new StringReader("rowery bilety");
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
}
public void testResourceLoaderDictionary2() throws Exception {
StringReader reader = new StringReader("rowery bilety");
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
@ -39,12 +79,48 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
HashMap<String,String> map = new HashMap<String,String>();
map.put("bogusArg", "bogusValue");
new MorfologikFilterFactory(map);
HashMap<String,String> params = new HashMap<String,String>();
params.put("bogusArg", "bogusValue");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
public void testIncompatibleArgs1() throws Exception {
try {
HashMap<String,String> params = new HashMap<String,String>();
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("at the same time"));
}
}
public void testIncompatibleArgs2() throws Exception {
try {
HashMap<String,String> params = new HashMap<String,String>();
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("at the same time"));
}
}
public void testMissingArgs1() throws Exception {
try {
HashMap<String,String> params = new HashMap<String,String>();
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Missing"));
}
}
}