mirror of https://github.com/apache/lucene.git
LUCENE-5356: Morfologik filter can accept custom dictionary resources.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1580853 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dc2b46d161
commit
ed22afea8a
|
@ -76,6 +76,9 @@ Changes in Runtime Behavior
|
|||
|
||||
New Features
|
||||
|
||||
* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
|
||||
(Michal Hlavac, Dawid Weiss)
|
||||
|
||||
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
|
||||
on a multi-valued field. (Robert Muir)
|
||||
|
||||
|
|
|
@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
|
|||
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||
*/
|
||||
public class MorfologikAnalyzer extends Analyzer {
|
||||
private final String dictionary;
|
||||
private final Version version;
|
||||
|
||||
/**
|
||||
* Builds an analyzer with Morfologik's default dictionary (polimorf).
|
||||
*
|
||||
* @param version
|
||||
* Lucene compatibility version
|
||||
* @param version Lucene compatibility version
|
||||
* @param dictionaryResource A constant specifying which dictionary to choose. The
|
||||
* dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
|
||||
* and have an associated <code>.info</code> metadata file. See the Morfologik project
|
||||
* for details.
|
||||
*
|
||||
* @see "http://morfologik.blogspot.com/"
|
||||
*/
|
||||
public MorfologikAnalyzer(final Version version) {
|
||||
public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
|
||||
this.version = version;
|
||||
this.dictionary = dictionaryResource;
|
||||
}
|
||||
public MorfologikAnalyzer(final Version version) {
|
||||
this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a
|
||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @param field ignored field name
|
||||
* @return A
|
||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||
* built from a {@link StandardTokenizer} filtered with
|
||||
* {@link StandardFilter} and {@link MorfologikFilter}.
|
||||
*/
|
||||
|
@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends Analyzer {
|
|||
|
||||
return new TokenStreamComponents(
|
||||
src,
|
||||
new MorfologikFilter(new StandardFilter(this.version, src), this.version));
|
||||
new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -61,11 +61,20 @@ public class MorfologikFilter extends TokenFilter {
|
|||
private int lemmaListIndex;
|
||||
|
||||
/**
|
||||
* Creates MorfologikFilter
|
||||
* @param in input token stream
|
||||
* @param version Lucene version compatibility for lowercasing.
|
||||
* Creates a filter with the default (Polish) dictionary.
|
||||
*/
|
||||
public MorfologikFilter(final TokenStream in, final Version version) {
|
||||
this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a filter with a given dictionary resource.
|
||||
*
|
||||
* @param in input token stream.
|
||||
* @param dict Dictionary resource from classpath.
|
||||
* @param version Lucene version compatibility for lowercasing.
|
||||
*/
|
||||
public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
|
||||
super(in);
|
||||
this.input = in;
|
||||
|
||||
|
@ -73,8 +82,8 @@ public class MorfologikFilter extends TokenFilter {
|
|||
Thread me = Thread.currentThread();
|
||||
ClassLoader cl = me.getContextClassLoader();
|
||||
try {
|
||||
me.setContextClassLoader(PolishStemmer.class.getClassLoader());
|
||||
this.stemmer = new PolishStemmer();
|
||||
me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
|
||||
this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
|
||||
this.charUtils = CharacterUtils.getInstance(version);
|
||||
this.lemmaList = Collections.emptyList();
|
||||
} finally {
|
||||
|
|
|
@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Filter factory for {@link MorfologikFilter}.
|
||||
* Filter factory for {@link MorfologikFilter}. For backward compatibility the Polish
|
||||
* dictionary is used by default. You can change the dictionary resource
|
||||
* via the <code>dictionary-resource</code> parameter.
|
||||
* <pre class="prettyprint">
|
||||
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||
* <filter class="solr.MorfologikFilterFactory" />
|
||||
* <filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" />
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
|
||||
*/
|
||||
public class MorfologikFilterFactory extends TokenFilterFactory {
|
||||
/**
|
||||
* The default dictionary resource (for Polish).
|
||||
*/
|
||||
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
|
||||
|
||||
/**
|
||||
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
|
||||
*/
|
||||
private final String dictionaryResource;
|
||||
|
||||
/** Schema attribute. */
|
||||
@Deprecated
|
||||
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
|
||||
|
||||
/** Dictionary resource */
|
||||
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
|
||||
|
||||
/** Creates a new MorfologikFilterFactory */
|
||||
public MorfologikFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
|
@ -47,9 +62,12 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
|
|||
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
|
||||
if (dictionaryName != null && !dictionaryName.isEmpty()) {
|
||||
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
|
||||
+ "longer supported (Morfologik has one dictionary): " + dictionaryName);
|
||||
+ "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
|
||||
+ ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
|
||||
}
|
||||
|
||||
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
|
||||
|
||||
if (!args.isEmpty()) {
|
||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||
}
|
||||
|
@ -57,6 +75,6 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
|
|||
|
||||
@Override
|
||||
public TokenStream create(TokenStream ts) {
|
||||
return new MorfologikFilter(ts, luceneMatchVersion);
|
||||
return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfologik;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.util.Collections;
|
|||
import java.util.HashMap;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
|
@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
|
|||
/** Test that bogus arguments result in exception */
|
||||
public void testBogusArguments() throws Exception {
|
||||
try {
|
||||
new MorfologikFilterFactory(new HashMap<String,String>() {{
|
||||
put("bogusArg", "bogusValue");
|
||||
}});
|
||||
HashMap<String,String> map = new HashMap<String,String>();
|
||||
map.put("bogusArg", "bogusValue");
|
||||
new MorfologikFilterFactory(map);
|
||||
fail();
|
||||
} catch (IllegalArgumentException expected) {
|
||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||
|
|
Loading…
Reference in New Issue