LUCENE-5356: Morfologik filter can accept custom dictionary resources.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1580853 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Dawid Weiss 2014-03-24 13:47:28 +00:00
parent dc2b46d161
commit ed22afea8a
6 changed files with 58 additions and 22 deletions

View File

@ -76,6 +76,9 @@ Changes in Runtime Behavior
New Features New Features
* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
(Michal Hlavac, Dawid Weiss)
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting * LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
on multi-valued field. (Robert Muir) on multi-valued field. (Robert Muir)

View File

@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a> * @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/ */
public class MorfologikAnalyzer extends Analyzer { public class MorfologikAnalyzer extends Analyzer {
private final String dictionary;
private final Version version; private final Version version;
/** /**
* Builds an analyzer with the default Morfologik's dictionary (polimorf). * Builds an analyzer with the default Morfologik's dictionary (polimorf).
* *
* @param version * @param version Lucene compatibility version
* Lucene compatibility version * @param dictionaryResource A constant specifying which dictionary to choose. The
* dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
* and have an associated <code>.info</code> metadata file. See the Morfologik project
* for details.
*
* @see "http://morfologik.blogspot.com/"
*/ */
public MorfologikAnalyzer(final Version version) { public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
this.version = version; this.version = version;
this.dictionary = dictionaryResource;
}
public MorfologikAnalyzer(final Version version) {
this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
} }
/** /**
* Creates a * Creates a
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}. * which tokenizes all the text in the provided {@link Reader}.
* *
* @param field ignored field name * @param field ignored field name
* @return A * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with * built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter} and {@link MorfologikFilter}. * {@link StandardFilter} and {@link MorfologikFilter}.
*/ */
@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends Analyzer {
return new TokenStreamComponents( return new TokenStreamComponents(
src, src,
new MorfologikFilter(new StandardFilter(this.version, src), this.version)); new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
} }
} }

View File

@ -61,20 +61,29 @@ public class MorfologikFilter extends TokenFilter {
private int lemmaListIndex; private int lemmaListIndex;
/** /**
* Creates MorfologikFilter * Creates a filter with the default (Polish) dictionary.
* @param in input token stream
* @param version Lucene version compatibility for lowercasing.
*/ */
public MorfologikFilter(final TokenStream in, final Version version) { public MorfologikFilter(final TokenStream in, final Version version) {
this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
}
/**
* Creates a filter with a given dictionary resource.
*
* @param in input token stream.
* @param dict Dictionary resource from classpath.
* @param version Lucene version compatibility for lowercasing.
*/
public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
super(in); super(in);
this.input = in; this.input = in;
// SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources. // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
Thread me = Thread.currentThread(); Thread me = Thread.currentThread();
ClassLoader cl = me.getContextClassLoader(); ClassLoader cl = me.getContextClassLoader();
try { try {
me.setContextClassLoader(PolishStemmer.class.getClassLoader()); me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
this.stemmer = new PolishStemmer(); this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
this.charUtils = CharacterUtils.getInstance(version); this.charUtils = CharacterUtils.getInstance(version);
this.lemmaList = Collections.emptyList(); this.lemmaList = Collections.emptyList();
} finally { } finally {

View File

@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory;
/** /**
* Filter factory for {@link MorfologikFilter}. * Filter factory for {@link MorfologikFilter}. For backward compatibility the Polish
* dictionary is used by default. You can change the dictionary resource
* with the dictionary-resource parameter.
* <pre class="prettyprint"> * <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt; * &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt; * &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt; * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.MorfologikFilterFactory" /&gt; * &lt;filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" /&gt;
* &lt;/analyzer&gt; * &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre> * &lt;/fieldType&gt;</pre>
* *
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a> * @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/ */
public class MorfologikFilterFactory extends TokenFilterFactory { public class MorfologikFilterFactory extends TokenFilterFactory {
/**
* The default dictionary resource (for Polish).
*/
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
/**
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
*/
private final String dictionaryResource;
/** Schema attribute. */ /** Schema attribute. */
@Deprecated @Deprecated
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary"; public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
/** Schema attribute specifying the dictionary resource. */
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
/** Creates a new MorfologikFilterFactory */ /** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) { public MorfologikFilterFactory(Map<String,String> args) {
super(args); super(args);
@ -47,9 +62,12 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE); String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) { if (dictionaryName != null && !dictionaryName.isEmpty()) {
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no " throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
+ "longer supported (Morfologik has one dictionary): " + dictionaryName); + "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
+ ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
} }
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
if (!args.isEmpty()) { if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args); throw new IllegalArgumentException("Unknown parameters: " + args);
} }
@ -57,6 +75,6 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
@Override @Override
public TokenStream create(TokenStream ts) { public TokenStream create(TokenStream ts) {
return new MorfologikFilter(ts, luceneMatchVersion); return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
} }
} }

View File

@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfologik;
*/ */
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import java.util.TreeSet; import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;

View File

@ -22,7 +22,6 @@ import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
/** /**
@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
/** Test that bogus arguments result in exception */ /** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception { public void testBogusArguments() throws Exception {
try { try {
new MorfologikFilterFactory(new HashMap<String,String>() {{ HashMap<String,String> map = new HashMap<String,String>();
put("bogusArg", "bogusValue"); map.put("bogusArg", "bogusValue");
}}); new MorfologikFilterFactory(map);
fail(); fail();
} catch (IllegalArgumentException expected) { } catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters")); assertTrue(expected.getMessage().contains("Unknown parameters"));