LUCENE-5356: Morfologik filter can accept custom dictionary resources.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1580853 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2014-03-24 13:47:28 +00:00
parent dc2b46d161
commit ed22afea8a
6 changed files with 58 additions and 22 deletions

@@ -76,6 +76,9 @@ Changes in Runtime Behavior
New Features
* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
(Michal Hlavac, Dawid Weiss)
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
on multi-valued fields. (Robert Muir)

@@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikAnalyzer extends Analyzer {
private final String dictionary;
private final Version version;
/**
* Builds an analyzer with Morfologik's default dictionary (polimorf).
*
* @param version
* Lucene compatibility version
* @param version Lucene compatibility version
* @param dictionaryResource A constant specifying which dictionary to choose. The
* dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
* and have an associated <code>.info</code> metadata file. See the Morfologik project
* for details.
*
* @see "http://morfologik.blogspot.com/"
*/
public MorfologikAnalyzer(final Version version) {
public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
this.version = version;
this.dictionary = dictionaryResource;
}
public MorfologikAnalyzer(final Version version) {
this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
}
/**
* Creates a
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @param field ignored field name
* @return A
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from a {@link StandardTokenizer} filtered with
* {@link StandardFilter} and {@link MorfologikFilter}.
*/
@@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends Analyzer {
return new TokenStreamComponents(
src,
new MorfologikFilter(new StandardFilter(this.version, src), this.version));
new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
}
}
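For illustration (not part of the commit), here is a minimal sketch of constructing the analyzer both ways. The compatibility constant Version.LUCENE_47, the field name "body", the class name MorfologikAnalyzerExample, and the sample text "zamki" are placeholders; the resource name passed to the new constructor must resolve to morfologik/dictionaries/&lt;name&gt;.dict plus its .info metadata file on the classpath, as the javadoc above describes.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class MorfologikAnalyzerExample {
  public static void main(String[] args) throws IOException {
    // Backward-compatible constructor: uses DEFAULT_DICTIONARY_RESOURCE ("pl").
    Analyzer defaultDict = new MorfologikAnalyzer(Version.LUCENE_47);

    // New constructor: names the dictionary resource explicitly.
    Analyzer customDict = new MorfologikAnalyzer(Version.LUCENE_47, "pl");

    TokenStream ts = customDict.tokenStream("body", new StringReader("zamki"));
    try {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term.toString()); // lemmas looked up in the dictionary
      }
      ts.end();
    } finally {
      ts.close();
    }
  }
}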

@@ -61,20 +61,29 @@ public class MorfologikFilter extends TokenFilter {
private int lemmaListIndex;
/**
* Creates MorfologikFilter
* @param in input token stream
* @param version Lucene version compatibility for lowercasing.
* Creates a filter with the default (Polish) dictionary.
*/
public MorfologikFilter(final TokenStream in, final Version version) {
this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
}
/**
* Creates a filter with a given dictionary resource.
*
* @param in input token stream.
* @param dict Dictionary resource from classpath.
* @param version Lucene version compatibility for lowercasing.
*/
public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
super(in);
this.input = in;
// SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
Thread me = Thread.currentThread();
ClassLoader cl = me.getContextClassLoader();
try {
me.setContextClassLoader(PolishStemmer.class.getClassLoader());
this.stemmer = new PolishStemmer();
me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
this.charUtils = CharacterUtils.getInstance(version);
this.lemmaList = Collections.emptyList();
} finally {
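Outside the commit itself, a hedged sketch of how the new three-argument constructor slots into a custom analysis chain (essentially what MorfologikAnalyzer now does). The class name CustomDictionaryAnalyzer, the Version.LUCENE_47 constant, and the "pl" resource name are assumptions for illustration only.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

/** Hypothetical analyzer stemming against an explicitly named dictionary resource. */
public class CustomDictionaryAnalyzer extends Analyzer {
  private final Version version = Version.LUCENE_47; // placeholder compatibility version
  private final String dictionaryResource = "pl";    // assumed resource name

  @Override
  protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer source = new StandardTokenizer(version, reader);
    TokenStream result = new StandardFilter(version, source);
    // New in this commit: the dictionary is chosen by resource name rather than
    // being hardwired to PolishStemmer.
    result = new MorfologikFilter(result, dictionaryResource, version);
    return new TokenStreamComponents(source, result);
  }
}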

@@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Filter factory for {@link MorfologikFilter}.
* Filter factory for {@link MorfologikFilter}. For backward compatibility, the Polish
* dictionary is used by default. A different dictionary resource can be selected
* with the dictionary-resource parameter.
* <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.MorfologikFilterFactory" /&gt;
* &lt;filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory {
/**
* The default dictionary resource (for Polish).
*/
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
/**
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
*/
private final String dictionaryResource;
/** Schema attribute. */
@Deprecated
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
/** Dictionary resource */
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
@@ -47,9 +62,12 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) {
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
+ "longer supported (Morfologik has one dictionary): " + dictionaryName);
+ "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
+ ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
}
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -57,6 +75,6 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream ts) {
return new MorfologikFilter(ts, luceneMatchVersion);
return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
}
}
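To complement the Solr snippet in the javadoc above, here is a hedged sketch of driving the factory programmatically with the new attribute. The luceneMatchVersion value, the WhitespaceTokenizer choice, the class name MorfologikFactoryExample, and the sample text are placeholders.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.morfologik.MorfologikFilterFactory;
import org.apache.lucene.util.Version;

public class MorfologikFactoryExample {
  public static void main(String[] args) throws Exception {
    Map<String,String> factoryArgs = new HashMap<String,String>();
    factoryArgs.put("luceneMatchVersion", Version.LUCENE_47.toString());
    factoryArgs.put("dictionary-resource", "pl"); // the attribute introduced by this commit

    MorfologikFilterFactory factory = new MorfologikFilterFactory(factoryArgs);
    TokenStream ts = factory.create(
        new WhitespaceTokenizer(Version.LUCENE_47, new StringReader("zamki")));
    // ts now produces lemmas looked up in the "pl" dictionary resource.
  }
}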

@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfologik;
*/
import java.io.IOException;
import java.io.Reader;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;

@@ -22,7 +22,6 @@ import java.util.Collections;
import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
@@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
new MorfologikFilterFactory(new HashMap<String,String>() {{
put("bogusArg", "bogusValue");
}});
HashMap<String,String> map = new HashMap<String,String>();
map.put("bogusArg", "bogusValue");
new MorfologikFilterFactory(map);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
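A natural companion test, sketched here in the same style as the existing one (it is not part of the shown diff), would check that the new attribute is accepted and that the legacy attribute's error message points users at its replacement. The method name and the legacy value "morfologik.dict" are illustrative.

/** Sketch: the new dictionary-resource attribute is accepted, the legacy one is redirected. */
public void testDictionaryResourceAttribute() throws Exception {
  HashMap<String,String> map = new HashMap<String,String>();
  map.put("dictionary-resource", "pl"); // the default resource, passed explicitly
  new MorfologikFilterFactory(map);     // must not throw

  HashMap<String,String> legacy = new HashMap<String,String>();
  legacy.put("dictionary", "morfologik.dict");
  try {
    new MorfologikFilterFactory(legacy);
    fail();
  } catch (IllegalArgumentException expected) {
    assertTrue(expected.getMessage().contains("dictionary-resource"));
  }
}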