diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index da6d0a2068c..faa29f8bd73 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -76,6 +76,9 @@ Changes in Runtime Behavior New Features +* LUCENE-5356: Morfologik filter can accept custom dictionary resources. + (Michal Hlavac, Dawid Weiss) + * LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting on multi-valued field. (Robert Muir) diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java index f64f54888c4..985ac90600e 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikAnalyzer.java @@ -31,26 +31,34 @@ import org.apache.lucene.util.Version; * @see Morfologik project page */ public class MorfologikAnalyzer extends Analyzer { + private final String dictionary; private final Version version; /** * Builds an analyzer with the default Morfologik's dictionary (polimorf). * - * @param version - * Lucene compatibility version + * @param version Lucene compatibility version + * @param dictionaryResource A constant specifying which dictionary to choose. The + * dictionary resource must be named morfologik/dictionaries/{dictionaryResource}.dict + * and have an associated .info metadata file. See the Morfologik project + * for details. + * + * @see "http://morfologik.blogspot.com/" */ - public MorfologikAnalyzer(final Version version) { + public MorfologikAnalyzer(final Version version, final String dictionaryResource) { this.version = version; + this.dictionary = dictionaryResource; + } + public MorfologikAnalyzer(final Version version) { + this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE); } - /** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * which tokenizes all the text in the provided {@link Reader}. * * @param field ignored field name - * @return A - * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} + * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} * built from an {@link StandardTokenizer} filtered with * {@link StandardFilter} and {@link MorfologikFilter}. */ @@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends Analyzer { return new TokenStreamComponents( src, - new MorfologikFilter(new StandardFilter(this.version, src), this.version)); + new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version)); } } diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java index 6bb08036400..08b4ce4dd3c 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilter.java @@ -61,20 +61,29 @@ public class MorfologikFilter extends TokenFilter { private int lemmaListIndex; /** - * Creates MorfologikFilter - * @param in input token stream - * @param version Lucene version compatibility for lowercasing. + * Creates a filter with the default (Polish) dictionary. */ public MorfologikFilter(final TokenStream in, final Version version) { + this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version); + } + + /** + * Creates a filter with a given dictionary resource. + * + * @param in input token stream. + * @param dict Dictionary resource from classpath. + * @param version Lucene version compatibility for lowercasing. + */ + public MorfologikFilter(final TokenStream in, final String dict, final Version version) { super(in); this.input = in; - + // SOLR-4007: temporarily substitute context class loader to allow finding dictionary resources. Thread me = Thread.currentThread(); ClassLoader cl = me.getContextClassLoader(); try { - me.setContextClassLoader(PolishStemmer.class.getClassLoader()); - this.stemmer = new PolishStemmer(); + me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader()); + this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict)); this.charUtils = CharacterUtils.getInstance(version); this.lemmaList = Collections.emptyList(); } finally { diff --git a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java index 388a441ee3a..41f09473f32 100644 --- a/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java +++ b/lucene/analysis/morfologik/src/java/org/apache/lucene/analysis/morfologik/MorfologikFilterFactory.java @@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.TokenFilterFactory; /** - * Filter factory for {@link MorfologikFilter}. + * Filter factory for {@link MorfologikFilter}. For backward compatibility polish + * dictionary is used as default. You can change dictionary resource + * by dictionary-resource parameter. *
  * <fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100">
  *   <analyzer>
  *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
- *     <filter class="solr.MorfologikFilterFactory" />
+ *     <filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" />
  *   </analyzer>
  * </fieldType>
* * @see Morfologik web site */ public class MorfologikFilterFactory extends TokenFilterFactory { + /** + * The default dictionary resource (for Polish). + */ + public static final String DEFAULT_DICTIONARY_RESOURCE = "pl"; + + /** + * Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details. + */ + private final String dictionaryResource; + /** Schema attribute. */ @Deprecated public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary"; + /** Dictionary resource */ + public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource"; + /** Creates a new MorfologikFilterFactory */ public MorfologikFilterFactory(Map args) { super(args); @@ -47,9 +62,12 @@ public class MorfologikFilterFactory extends TokenFilterFactory { String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE); if (dictionaryName != null && !dictionaryName.isEmpty()) { throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no " - + "longer supported (Morfologik has one dictionary): " + dictionaryName); + + "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName + + ". Perhaps you wanted to use 'dictionary-resource' attribute instead?"); } + dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE); + if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -57,6 +75,6 @@ public class MorfologikFilterFactory extends TokenFilterFactory { @Override public TokenStream create(TokenStream ts) { - return new MorfologikFilter(ts, luceneMatchVersion); + return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion); } } diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java index 3624071c93f..2808caa096e 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikAnalyzer.java @@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfologik; */ import java.io.IOException; -import java.io.Reader; import java.util.TreeSet; import org.apache.lucene.analysis.Analyzer; diff --git a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java index 6465effa1a8..d755ab8d3bb 100644 --- a/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java +++ b/lucene/analysis/morfologik/src/test/org/apache/lucene/analysis/morfologik/TestMorfologikFilterFactory.java @@ -22,7 +22,6 @@ import java.util.Collections; import java.util.HashMap; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; /** @@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase { /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { try { - new MorfologikFilterFactory(new HashMap() {{ - put("bogusArg", "bogusValue"); - }}); + HashMap map = new HashMap(); + map.put("bogusArg", "bogusValue"); + new MorfologikFilterFactory(map); fail(); } catch (IllegalArgumentException expected) { assertTrue(expected.getMessage().contains("Unknown parameters"));