LUCENE-6833: Upgraded Morfologik to version 2.0.1. The 'dictionary' attribute has been

reverted back and now points at the dictionary resource to be loaded instead of the default Polish dictionary.



git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1707457 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2015-10-08 09:17:27 +00:00
parent 4a78c824a5
commit 7e6ba558ff
24 changed files with 132 additions and 184 deletions

View File

@ -207,8 +207,12 @@ Other
* LUCENE-6761: MatchAllDocsQuery's Scorers do not expose approximations
anymore. (Adrien Grand)
* LUCENE-6775: Improved MorfologikFilterFactory to allow loading of
custom dictionaries from ResourceLoader. (Uwe Schindler)
* LUCENE-6775, LUCENE-6833: Improved MorfologikFilterFactory to allow
loading of custom dictionaries from ResourceLoader. Upgraded
Morfologik to version 2.0.1. The 'dictionary' attribute has been
reverted back and now points at the dictionary resource to be
loaded instead of the default Polish dictionary.
(Uwe Schindler, Dawid Weiss)
* LUCENE-6797: Make GeoCircle an interface and use a factory to create
it, to eventually handle degenerate cases (Karl Wright via Mike

View File

@ -18,9 +18,8 @@
-->
<project name="analyzers-morfologik" default="default">
<description>
Analyzer for indexing Polish
Analyzer for dictionary stemming, built-in Polish dictionary
</description>
<import file="../analysis-module-build.xml"/>
@ -30,6 +29,12 @@
<fileset dir="lib"/>
<path refid="base.classpath"/>
</path>
<path id="test.classpath">
<path refid="test.base.classpath" />
<pathelement path="src/test-files" />
</path>
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
</project>

View File

@ -20,6 +20,9 @@ package org.apache.lucene.analysis.morfologik;
import java.io.Reader;
import morfologik.stemming.Dictionary;
import morfologik.stemming.polish.PolishStemmer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
@ -30,27 +33,23 @@ import org.apache.lucene.analysis.standard.StandardTokenizer;
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikAnalyzer extends Analyzer {
private final String dictionary;
private final Dictionary dictionary;
/**
* Builds an analyzer with an explicit dictionary resource.
* Builds an analyzer with an explicit {@link Dictionary} resource.
*
* @param dictionaryResource A constant specifying which dictionary to choose. The
* dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
* and have an associated <code>.info</code> metadata file. See the Morfologik project
* for details.
*
* @see <a href="http://morfologik.blogspot.com/">http://morfologik.blogspot.com/</a>
* @param dictionary A prebuilt automaton with inflected and base word forms.
* @see <a href="https://github.com/morfologik/">https://github.com/morfologik/</a>
*/
public MorfologikAnalyzer(final String dictionaryResource) {
this.dictionary = dictionaryResource;
public MorfologikAnalyzer(final Dictionary dictionary) {
this.dictionary = dictionary;
}
/**
* Builds an analyzer with the default Morfologik's Polish dictionary.
*/
public MorfologikAnalyzer() {
this(MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
this(new PolishStemmer().getDictionary());
}
/**

View File

@ -18,17 +18,16 @@ package org.apache.lucene.analysis.morfologik;
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;
import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.IStemmer;
import morfologik.stemming.WordData;
import morfologik.stemming.polish.PolishStemmer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -70,20 +69,9 @@ public class MorfologikFilter extends TokenFilter {
* Creates a filter with the default (Polish) dictionary.
*/
public MorfologikFilter(final TokenStream in) {
this(in, DictionaryHolder.DEFAULT_DICT);
this(in, new PolishStemmer().getDictionary());
}
/**
* Creates a filter with a given dictionary resource.
*
* @param in input token stream.
* @param dictResource Dictionary resource name in classpath.
*/
public MorfologikFilter(final TokenStream in, final String dictResource) {
this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE.equals(dictResource) ?
DictionaryHolder.DEFAULT_DICT : loadDictionaryResource(dictResource));
}
/**
* Creates a filter with a given dictionary.
*
@ -180,23 +168,4 @@ public class MorfologikFilter extends TokenFilter {
tagsList.clear();
super.reset();
}
/** This method was added, because Morfologik uses context classloader and fails to load from our classloader (bug with absolute path). */
static Dictionary loadDictionaryResource(String resource) {
Objects.requireNonNull(resource, "Morfologik language code may not be null");
final String dictPath = "/morfologik/dictionaries/" + resource + ".dict";
final String metaPath = Dictionary.getExpectedFeaturesName(dictPath);
try (final InputStream dictIn = Objects.requireNonNull(Dictionary.class.getResourceAsStream(dictPath), "Unable to find Morfologik dictionary: " + dictPath);
final InputStream metaIn = Objects.requireNonNull(Dictionary.class.getResourceAsStream(metaPath), "Unable to find Morfologik metadata: " + metaPath)) {
return Dictionary.readAndClose(dictIn, metaIn);
} catch (IOException ioe) {
throw new RuntimeException("IOException while loading Morfologik dictionary and metadata.", ioe);
}
}
/** This holder is for the default Polish dictionary */
static final class DictionaryHolder {
static final Dictionary DEFAULT_DICT = loadDictionaryResource(MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
}
}

View File

@ -19,11 +19,12 @@ package org.apache.lucene.analysis.morfologik;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryMetadata;
import morfologik.stemming.polish.PolishStemmer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
@ -31,75 +32,47 @@ import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Filter factory for {@link MorfologikFilter}. For backward compatibility polish
* dictionary is used as default. You can change dictionary resource
* by dictionary-resource parameter:
* Filter factory for {@link MorfologikFilter}.
*
* <p>An explicit resource name of the dictionary ({@code ".dict"}) can be
* provided via the <code>dictionary</code> attribute, as the example below demonstrates:
* <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;fieldType name="text_mylang" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" /&gt;
* &lt;filter class="solr.MorfologikFilterFactory" dictionary="mylang.dict" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* <p>Alternatively, you can pass in the filenames of FSA ({@code ".dict"} and features "{@code ".info"}" file
* (if the features file is not given, its name is derived from the FSA file):
* <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.MorfologikFilterFactory" dictionary-fsa-file="mylang.dict" dictionary-features-file="mylang.info" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
* <p>If the dictionary attribute is not provided, the Polish dictionary is loaded
* and used by default.
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
/**
* The default dictionary resource (for Polish).
*/
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
/** Dictionary resource attribute (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
public static final String DICTIONARY_ATTRIBUTE = "dictionary";
/** Dictionary resource */
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
/** {@link #DICTIONARY_ATTRIBUTE} value passed to {@link #inform}. */
private String resourceName;
/** Dictionary FSA file (should have {@code ".dict"} suffix), loaded from {@link ResourceLoader}. */
public static final String DICTIONARY_FSA_FILE_ATTRIBUTE = "dictionary-fsa-file";
/** Dictionary features/properties file, loaded from {@link ResourceLoader}. If not given, this
* loads the file with same name like {@link #DICTIONARY_FSA_FILE_ATTRIBUTE}, but with
* {@code ".info"} suffix.
*/
public static final String DICTIONARY_FEATURES_FILE_ATTRIBUTE = "dictionary-features-file";
private final String dictionaryFsaFile, dictionaryFeaturesFile, dictionaryResource;
private Dictionary dictionary; // initialized on inform()
/** Loaded {@link Dictionary}, initialized on {@link #inform(ResourceLoader)}. */
private Dictionary dictionary;
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
// first check FSA and features (at least FSA must be given, features name is guessed):
dictionaryFsaFile = get(args, DICTIONARY_FSA_FILE_ATTRIBUTE);
dictionaryFeaturesFile = get(args, DICTIONARY_FEATURES_FILE_ATTRIBUTE,
(dictionaryFsaFile == null) ? null : Dictionary.getExpectedFeaturesName(dictionaryFsaFile));
if (dictionaryFsaFile == null && dictionaryFeaturesFile == null) {
// if we have no FSA/features combination, we resolve the classpath resource:
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
} else if (dictionaryFsaFile == null || dictionaryFeaturesFile == null) {
// if we have incomplete FSA/features tuple in args
throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Missing '%s' or '%s' attribute.",
DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
} else {
dictionaryResource = null;
if (get(args, DICTIONARY_RESOURCE_ATTRIBUTE) != null) {
// fail if both is given: FSA/features files + classpath resource
throw new IllegalArgumentException(String.format(Locale.ENGLISH, "Cannot give '%s' and '%s'/'%s' at the same time.",
DICTIONARY_RESOURCE_ATTRIBUTE, DICTIONARY_FSA_FILE_ATTRIBUTE, DICTIONARY_FEATURES_FILE_ATTRIBUTE));
}
// Be specific about no-longer-supported dictionary attribute.
final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
String dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE);
if (dictionaryResource != null && !dictionaryResource.isEmpty()) {
throw new IllegalArgumentException("The " + DICTIONARY_RESOURCE_ATTRIBUTE + " attribute is no "
+ "longer supported. Use the '" + DICTIONARY_ATTRIBUTE + "' attribute instead (see LUCENE-6833).");
}
resourceName = get(args, DICTIONARY_ATTRIBUTE);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -107,16 +80,14 @@ public class MorfologikFilterFactory extends TokenFilterFactory implements Resou
@Override
public void inform(ResourceLoader loader) throws IOException {
if (dictionaryFsaFile != null) {
assert dictionaryFeaturesFile != null;
assert dictionaryResource == null;
try (final InputStream dictIn = loader.openResource(dictionaryFsaFile);
final InputStream metaIn = loader.openResource(dictionaryFeaturesFile)) {
this.dictionary = Dictionary.readAndClose(dictIn, metaIn);
}
if (resourceName == null) {
// Get the dictionary lazily, does not hold up memory.
this.dictionary = new PolishStemmer().getDictionary();
} else {
assert dictionaryResource != null;
this.dictionary = MorfologikFilter.loadDictionaryResource(dictionaryResource);
try (InputStream dict = loader.openResource(resourceName);
InputStream meta = loader.openResource(DictionaryMetadata.getExpectedMetadataFileName(resourceName))) {
this.dictionary = Dictionary.read(dict, meta);
}
}
}

View File

@ -0,0 +1,24 @@
#
# An example stemming dictionary file for Morfologik filter.
#
# Compile with Morfologik-stemming, see
# https://github.com/morfologik/morfologik-stemming/wiki/Examples
#
# Author of the dictionary.
fsa.dict.author=Acme Inc.
# Date the dictionary data was assembled (not compilation time!).
fsa.dict.created=2015/10/08 09:16:00
# The license for the dictionary data.
fsa.dict.license=ASL 2.0
# Character encoding inside the automaton (and input file).
fsa.dict.encoding=UTF-8
# field separator (lemma;inflected;tag)
fsa.dict.separator=;
# type of base/lemma compression encoding before automaton compression.
fsa.dict.encoder=INFIX

View File

@ -0,0 +1,2 @@
lemma1;inflected1;tag1
lemma2;inflected2;tag2

View File

@ -17,6 +17,8 @@ package org.apache.lucene.analysis.morfologik;
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashMap;
@ -31,51 +33,59 @@ import org.apache.lucene.analysis.util.ResourceLoader;
* Test for {@link MorfologikFilterFactory}.
*/
public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
final ResourceLoader loader = new ClasspathResourceLoader(getClass());
private static class ForbidResourcesLoader implements ResourceLoader {
@Override
public InputStream openResource(String resource) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public <T> Class<? extends T> findClass(String cname, Class<T> expectedType) {
throw new UnsupportedOperationException();
}
@Override
public <T> T newInstance(String cname, Class<T> expectedType) {
throw new UnsupportedOperationException();
}
}
public void testDefaultDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
MorfologikFilterFactory factory = new MorfologikFilterFactory(Collections.<String,String>emptyMap());
factory.inform(loader);
factory.inform(new ForbidResourcesLoader());
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
}
public void testResourceDictionary() throws Exception {
StringReader reader = new StringReader("rowery bilety");
public void testExplicitDictionary() throws Exception {
final ResourceLoader loader = new ClasspathResourceLoader(TestMorfologikFilterFactory.class);
StringReader reader = new StringReader("inflected1 inflected2");
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
params.put(MorfologikFilterFactory.DICTIONARY_ATTRIBUTE, "custom-dictionary.dict");
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
assertTokenStreamContents(stream, new String[] {"lemma1", "lemma2"});
}
public void testResourceLoaderDictionary1() throws Exception {
StringReader reader = new StringReader("rowery bilety");
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
public void testMissingDictionary() throws Exception {
final ResourceLoader loader = new ClasspathResourceLoader(TestMorfologikFilterFactory.class);
try {
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_ATTRIBUTE, "missing-dictionary-resource.dict");
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
fail();
} catch (IOException e) {
assertTrue(e.getMessage().contains("Resource not found"));
}
}
public void testResourceLoaderDictionary2() throws Exception {
StringReader reader = new StringReader("rowery bilety");
Map<String,String> params = new HashMap<>();
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
MorfologikFilterFactory factory = new MorfologikFilterFactory(params);
factory.inform(loader);
TokenStream stream = whitespaceMockTokenizer(reader);
stream = factory.create(stream);
assertTokenStreamContents(stream, new String[] {"rower", "bilet"});
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
@ -87,40 +97,4 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
public void testIncompatibleArgs1() throws Exception {
try {
HashMap<String,String> params = new HashMap<String,String>();
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("at the same time"));
}
}
public void testIncompatibleArgs2() throws Exception {
try {
HashMap<String,String> params = new HashMap<String,String>();
params.put(MorfologikFilterFactory.DICTIONARY_RESOURCE_ATTRIBUTE, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
params.put(MorfologikFilterFactory.DICTIONARY_FSA_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.dict");
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("at the same time"));
}
}
public void testMissingArgs1() throws Exception {
try {
HashMap<String,String> params = new HashMap<String,String>();
params.put(MorfologikFilterFactory.DICTIONARY_FEATURES_FILE_ATTRIBUTE, "/morfologik/dictionaries/pl.info");
new MorfologikFilterFactory(params);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Missing"));
}
}
}

View File

@ -216,7 +216,7 @@ org.bouncycastle.version = 1.45
/org.carrot2/carrot2-mini = 3.10.3
org.carrot2.morfologik.version = 1.10.0
org.carrot2.morfologik.version = 2.0.1
/org.carrot2/morfologik-fsa = ${org.carrot2.morfologik.version}
/org.carrot2/morfologik-polish = ${org.carrot2.morfologik.version}
/org.carrot2/morfologik-stemming = ${org.carrot2.morfologik.version}

View File

@ -1 +0,0 @@
87100c6baf60f096b42b9af06dafeb20f686cd02

View File

@ -0,0 +1 @@
23b4c04bb74f80e77573dc3ab84c8b4203f68d50

View File

@ -1,6 +1,6 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2013 Dawid Weiss, Marcin Miłkowski
Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

View File

@ -1 +0,0 @@
0f8eeb58acb5a39e162c0d49fcf29a70744cc2bc

View File

@ -0,0 +1 @@
b35034de153a79d0afeeeee2ff883d548a178961

View File

@ -1 +0,0 @@
a74ad7ceb29ff1d8194eb161f5b2dfbd636626a5

View File

@ -0,0 +1 @@
df9434b431bbed20ded67ede439c7dfb1e29e9f8

View File

@ -1,6 +1,6 @@
Copyright (c) 2006 Dawid Weiss
Copyright (c) 2007-2013 Dawid Weiss, Marcin Miłkowski
Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,

View File

@ -1 +0,0 @@
87100c6baf60f096b42b9af06dafeb20f686cd02

View File

@ -0,0 +1 @@
23b4c04bb74f80e77573dc3ab84c8b4203f68d50

View File

@ -1 +0,0 @@
0f8eeb58acb5a39e162c0d49fcf29a70744cc2bc

View File

@ -0,0 +1 @@
b35034de153a79d0afeeeee2ff883d548a178961

View File

@ -1 +0,0 @@
a74ad7ceb29ff1d8194eb161f5b2dfbd636626a5

View File

@ -0,0 +1 @@
df9434b431bbed20ded67ede439c7dfb1e29e9f8