mirror of https://github.com/apache/lucene.git
LUCENE-5356: Morfologik filter can accept custom dictionary resources.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1580853 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
dc2b46d161
commit
ed22afea8a
|
@ -76,6 +76,9 @@ Changes in Runtime Behavior
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
|
* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
|
||||||
|
(Michal Hlavac, Dawid Weiss)
|
||||||
|
|
||||||
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
|
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
|
||||||
on multi-valued field. (Robert Muir)
|
on multi-valued field. (Robert Muir)
|
||||||
|
|
||||||
|
|
|
@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
|
||||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
|
||||||
*/
|
*/
|
||||||
public class MorfologikAnalyzer extends Analyzer {
|
public class MorfologikAnalyzer extends Analyzer {
|
||||||
|
private final String dictionary;
|
||||||
private final Version version;
|
private final Version version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the default Morfologik's dictionary (polimorf).
|
* Builds an analyzer with the given dictionary resource (Morfologik's default dictionary is polimorf).
|
||||||
*
|
*
|
||||||
* @param version
|
* @param version Lucene compatibility version
|
||||||
* Lucene compatibility version
|
* @param dictionaryResource A constant specifying which dictionary to choose. The
|
||||||
|
* dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
|
||||||
|
* and have an associated <code>.info</code> metadata file. See the Morfologik project
|
||||||
|
* for details.
|
||||||
|
*
|
||||||
|
* @see "http://morfologik.blogspot.com/"
|
||||||
*/
|
*/
|
||||||
public MorfologikAnalyzer(final Version version) {
|
public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
|
||||||
this.version = version;
|
this.version = version;
|
||||||
|
this.dictionary = dictionaryResource;
|
||||||
|
}
|
||||||
|
public MorfologikAnalyzer(final Version version) {
|
||||||
|
this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a
|
* Creates a
|
||||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||||
* which tokenizes all the text in the provided {@link Reader}.
|
* which tokenizes all the text in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @param field ignored field name
|
* @param field ignored field name
|
||||||
* @return A
|
* @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
||||||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
|
|
||||||
* built from a {@link StandardTokenizer} filtered with
|
* built from a {@link StandardTokenizer} filtered with
|
||||||
* {@link StandardFilter} and {@link MorfologikFilter}.
|
* {@link StandardFilter} and {@link MorfologikFilter}.
|
||||||
*/
|
*/
|
||||||
|
@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends Analyzer {
|
||||||
|
|
||||||
return new TokenStreamComponents(
|
return new TokenStreamComponents(
|
||||||
src,
|
src,
|
||||||
new MorfologikFilter(new StandardFilter(this.version, src), this.version));
|
new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -61,20 +61,29 @@ public class MorfologikFilter extends TokenFilter {
|
||||||
private int lemmaListIndex;
|
private int lemmaListIndex;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates MorfologikFilter
|
* Creates a filter with the default (Polish) dictionary.
|
||||||
* @param in input token stream
|
|
||||||
* @param version Lucene version compatibility for lowercasing.
|
|
||||||
*/
|
*/
|
||||||
public MorfologikFilter(final TokenStream in, final Version version) {
|
public MorfologikFilter(final TokenStream in, final Version version) {
|
||||||
|
this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a filter with a given dictionary resource.
|
||||||
|
*
|
||||||
|
* @param in input token stream.
|
||||||
|
* @param dict Dictionary resource from classpath.
|
||||||
|
* @param version Lucene version compatibility for lowercasing.
|
||||||
|
*/
|
||||||
|
public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
|
||||||
super(in);
|
super(in);
|
||||||
this.input = in;
|
this.input = in;
|
||||||
|
|
||||||
// SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
|
// SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources.
|
||||||
Thread me = Thread.currentThread();
|
Thread me = Thread.currentThread();
|
||||||
ClassLoader cl = me.getContextClassLoader();
|
ClassLoader cl = me.getContextClassLoader();
|
||||||
try {
|
try {
|
||||||
me.setContextClassLoader(PolishStemmer.class.getClassLoader());
|
me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
|
||||||
this.stemmer = new PolishStemmer();
|
this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
|
||||||
this.charUtils = CharacterUtils.getInstance(version);
|
this.charUtils = CharacterUtils.getInstance(version);
|
||||||
this.lemmaList = Collections.emptyList();
|
this.lemmaList = Collections.emptyList();
|
||||||
} finally {
|
} finally {
|
||||||
|
|
|
@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter factory for {@link MorfologikFilter}.
|
* Filter factory for {@link MorfologikFilter}. For backward compatibility, the Polish
|
||||||
|
* dictionary is used by default. You can select a different dictionary resource
|
||||||
|
* with the <code>dictionary-resource</code> parameter.
|
||||||
* <pre class="prettyprint">
|
* <pre class="prettyprint">
|
||||||
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
|
* <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
|
||||||
* <analyzer>
|
* <analyzer>
|
||||||
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
* <tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
* <filter class="solr.MorfologikFilterFactory" />
|
* <filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" />
|
||||||
* </analyzer>
|
* </analyzer>
|
||||||
* </fieldType></pre>
|
* </fieldType></pre>
|
||||||
*
|
*
|
||||||
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
|
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
|
||||||
*/
|
*/
|
||||||
public class MorfologikFilterFactory extends TokenFilterFactory {
|
public class MorfologikFilterFactory extends TokenFilterFactory {
|
||||||
|
/**
|
||||||
|
* The default dictionary resource (for Polish).
|
||||||
|
*/
|
||||||
|
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
|
||||||
|
*/
|
||||||
|
private final String dictionaryResource;
|
||||||
|
|
||||||
/** Schema attribute. */
|
/** Schema attribute. */
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
|
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
|
||||||
|
|
||||||
|
/** Schema attribute specifying the dictionary resource. */
|
||||||
|
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
|
||||||
|
|
||||||
/** Creates a new MorfologikFilterFactory */
|
/** Creates a new MorfologikFilterFactory */
|
||||||
public MorfologikFilterFactory(Map<String,String> args) {
|
public MorfologikFilterFactory(Map<String,String> args) {
|
||||||
super(args);
|
super(args);
|
||||||
|
@ -47,9 +62,12 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
|
||||||
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
|
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
|
||||||
if (dictionaryName != null && !dictionaryName.isEmpty()) {
|
if (dictionaryName != null && !dictionaryName.isEmpty()) {
|
||||||
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
|
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
|
||||||
+ "longer supported (Morfologik has one dictionary): " + dictionaryName);
|
+ "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
|
||||||
|
+ ". Perhaps you wanted to use 'dictionary-resource' attribute instead?");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
|
||||||
|
|
||||||
if (!args.isEmpty()) {
|
if (!args.isEmpty()) {
|
||||||
throw new IllegalArgumentException("Unknown parameters: " + args);
|
throw new IllegalArgumentException("Unknown parameters: " + args);
|
||||||
}
|
}
|
||||||
|
@ -57,6 +75,6 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream ts) {
|
public TokenStream create(TokenStream ts) {
|
||||||
return new MorfologikFilter(ts, luceneMatchVersion);
|
return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfologik;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
|
|
@ -22,7 +22,6 @@ import java.util.Collections;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.MockTokenizer;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
|
||||||
/** Test that bogus arguments result in exception */
|
/** Test that bogus arguments result in exception */
|
||||||
public void testBogusArguments() throws Exception {
|
public void testBogusArguments() throws Exception {
|
||||||
try {
|
try {
|
||||||
new MorfologikFilterFactory(new HashMap<String,String>() {{
|
HashMap<String,String> map = new HashMap<String,String>();
|
||||||
put("bogusArg", "bogusValue");
|
map.put("bogusArg", "bogusValue");
|
||||||
}});
|
new MorfologikFilterFactory(map);
|
||||||
fail();
|
fail();
|
||||||
} catch (IllegalArgumentException expected) {
|
} catch (IllegalArgumentException expected) {
|
||||||
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
assertTrue(expected.getMessage().contains("Unknown parameters"));
|
||||||
|
|
Loading…
Reference in New Issue