LUCENE-5356: Morfologik filter can accept custom dictionary resources.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1580853 13f79535-47bb-0310-9956-ffa450edef68
Dawid Weiss 2014-03-24 13:47:28 +00:00
parent dc2b46d161
commit ed22afea8a
6 changed files with 58 additions and 22 deletions

View File

@@ -76,6 +76,9 @@ Changes in Runtime Behavior
New Features
* LUCENE-5356: Morfologik filter can accept custom dictionary resources.
(Michal Hlavac, Dawid Weiss)
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
on a multi-valued field. (Robert Muir)

View File

@@ -31,26 +31,34 @@ import org.apache.lucene.util.Version;
* @see <a href="http://morfologik.blogspot.com/">Morfologik project page</a>
*/
public class MorfologikAnalyzer extends Analyzer {
private final String dictionary;
private final Version version;
/**
* Builds an analyzer with Morfologik's default dictionary (polimorf).
*
* @param version
* Lucene compatibility version
* @param version Lucene compatibility version
* @param dictionaryResource A constant specifying which dictionary to choose. The
* dictionary resource must be named <code>morfologik/dictionaries/{dictionaryResource}.dict</code>
* and have an associated <code>.info</code> metadata file. See the Morfologik project
* for details.
*
* @see "http://morfologik.blogspot.com/"
*/
public MorfologikAnalyzer(final Version version) {
public MorfologikAnalyzer(final Version version, final String dictionaryResource) {
this.version = version;
this.dictionary = dictionaryResource;
}
public MorfologikAnalyzer(final Version version) {
this(version, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE);
}
/**
* Creates a
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* which tokenizes all the text in the provided {@link Reader}.
*
* @param field ignored field name
* @return A
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
* built from an {@link StandardTokenizer} filtered with
* {@link StandardFilter} and {@link MorfologikFilter}.
*/
@@ -60,6 +68,6 @@ public class MorfologikAnalyzer extends Analyzer {
return new TokenStreamComponents(
src,
new MorfologikFilter(new StandardFilter(this.version, src), this.version));
new MorfologikFilter(new StandardFilter(this.version, src), dictionary, this.version));
}
}

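For readers wiring this up by hand, here is a minimal usage sketch of the two constructors above. It is illustrative only, not part of the patch: the Version constant is a placeholder for whatever release you build against, and "my_dict" is a hypothetical resource that would have to resolve to morfologik/dictionaries/my_dict.dict (plus its .info metadata file) on the classpath, as the javadoc describes.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.morfologik.MorfologikAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class MorfologikAnalyzerSketch {
  public static void main(String[] args) throws Exception {
    // Equivalent to the new two-argument constructor called with
    // MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE ("pl").
    Analyzer polish = new MorfologikAnalyzer(Version.LUCENE_CURRENT);

    // Hypothetical custom dictionary: morfologik/dictionaries/my_dict.dict
    // and my_dict.info must be present on the classpath.
    Analyzer custom = new MorfologikAnalyzer(Version.LUCENE_CURRENT, "my_dict");

    TokenStream ts = polish.tokenStream("body", "liście");
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // the lemmas found in the dictionary
    }
    ts.end();
    ts.close();
    polish.close();
    custom.close();
  }
}
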
View File

@@ -61,11 +61,20 @@ public class MorfologikFilter extends TokenFilter {
private int lemmaListIndex;
/**
* Creates MorfologikFilter
* @param in input token stream
* @param version Lucene version compatibility for lowercasing.
* Creates a filter with the default (Polish) dictionary.
*/
public MorfologikFilter(final TokenStream in, final Version version) {
this(in, MorfologikFilterFactory.DEFAULT_DICTIONARY_RESOURCE, version);
}
/**
* Creates a filter with a given dictionary resource.
*
* @param in input token stream.
* @param dict Dictionary resource from classpath.
* @param version Lucene version compatibility for lowercasing.
*/
public MorfologikFilter(final TokenStream in, final String dict, final Version version) {
super(in);
this.input = in;
@@ -73,8 +82,8 @@ public class MorfologikFilter extends TokenFilter {
Thread me = Thread.currentThread();
ClassLoader cl = me.getContextClassLoader();
try {
me.setContextClassLoader(PolishStemmer.class.getClassLoader());
this.stemmer = new PolishStemmer();
me.setContextClassLoader(morfologik.stemming.Dictionary.class.getClassLoader());
this.stemmer = new DictionaryLookup(morfologik.stemming.Dictionary.getForLanguage(dict));
this.charUtils = CharacterUtils.getInstance(version);
this.lemmaList = Collections.emptyList();
} finally {

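Likewise, a short sketch of calling the new three-argument constructor directly, mirroring the StandardTokenizer / StandardFilter chain that MorfologikAnalyzer.createComponents builds. Again the Version constant and the sample text are placeholders, and any resource name other than "pl" has to follow the morfologik/dictionaries/{name}.dict convention described in the analyzer's javadoc.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class MorfologikFilterSketch {
  public static void main(String[] args) throws Exception {
    Version v = Version.LUCENE_CURRENT; // placeholder; use your release's constant
    Tokenizer tok = new StandardTokenizer(v, new StringReader("zielone liście"));

    // "pl" is DEFAULT_DICTIONARY_RESOURCE; the two-argument constructor
    // above delegates to exactly this call.
    TokenStream chain = new MorfologikFilter(new StandardFilter(v, tok), "pl", v);

    CharTermAttribute term = chain.addAttribute(CharTermAttribute.class);
    chain.reset();
    while (chain.incrementToken()) {
      System.out.println(term.toString());
    }
    chain.end();
    chain.close();
  }
}
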
View File

@@ -23,22 +23,37 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Filter factory for {@link MorfologikFilter}.
* Filter factory for {@link MorfologikFilter}. For backward compatibility, the Polish
* dictionary is used as the default. The dictionary resource can be changed
* with the dictionary-resource parameter.
* <pre class="prettyprint">
* &lt;fieldType name="text_polish" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.MorfologikFilterFactory" /&gt;
* &lt;filter class="solr.MorfologikFilterFactory" dictionary-resource="pl" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @see <a href="http://morfologik.blogspot.com/">Morfologik web site</a>
*/
public class MorfologikFilterFactory extends TokenFilterFactory {
/**
* The default dictionary resource (for Polish).
*/
public static final String DEFAULT_DICTIONARY_RESOURCE = "pl";
/**
* Stemming dictionary resource. See {@link MorfologikAnalyzer} for more details.
*/
private final String dictionaryResource;
/** Schema attribute. */
@Deprecated
public static final String DICTIONARY_SCHEMA_ATTRIBUTE = "dictionary";
/** Dictionary resource */
public static final String DICTIONARY_RESOURCE_ATTRIBUTE = "dictionary-resource";
/** Creates a new MorfologikFilterFactory */
public MorfologikFilterFactory(Map<String,String> args) {
super(args);
@@ -47,9 +62,12 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
String dictionaryName = get(args, DICTIONARY_SCHEMA_ATTRIBUTE);
if (dictionaryName != null && !dictionaryName.isEmpty()) {
throw new IllegalArgumentException("The " + DICTIONARY_SCHEMA_ATTRIBUTE + " attribute is no "
+ "longer supported (Morfologik has one dictionary): " + dictionaryName);
+ "longer supported (Morfologik now offers one unified Polish dictionary): " + dictionaryName
+ ". Perhaps you wanted to use the 'dictionary-resource' attribute instead?");
}
dictionaryResource = get(args, DICTIONARY_RESOURCE_ATTRIBUTE, DEFAULT_DICTIONARY_RESOURCE);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@@ -57,6 +75,6 @@ public class MorfologikFilterFactory extends TokenFilterFactory {
@Override
public TokenStream create(TokenStream ts) {
return new MorfologikFilter(ts, luceneMatchVersion);
return new MorfologikFilter(ts, dictionaryResource, luceneMatchVersion);
}
}

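The factory can also be exercised outside Solr. A rough sketch follows, assuming the usual TokenFilterFactory contract in which luceneMatchVersion is read from the same args map (Solr supplies it automatically when the filter is declared in the schema as in the javadoc above); the whitespace tokenizer and the input string are placeholders.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.morfologik.MorfologikFilterFactory;
import org.apache.lucene.util.Version;

public class MorfologikFactorySketch {
  public static void main(String[] args) throws Exception {
    Map<String,String> factoryArgs = new HashMap<String,String>();
    // Solr normally supplies luceneMatchVersion; here it is passed by hand
    // with a placeholder value.
    factoryArgs.put("luceneMatchVersion", Version.LUCENE_CURRENT.toString());
    // Optional: omit this entry to fall back to DEFAULT_DICTIONARY_RESOURCE ("pl").
    factoryArgs.put("dictionary-resource", "pl");

    MorfologikFilterFactory factory = new MorfologikFilterFactory(factoryArgs);
    TokenStream ts = factory.create(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("liście")));
    // ts now lemmatizes tokens against the configured dictionary.
  }
}
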
View File

@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.morfologik;
*/
import java.io.IOException;
import java.io.Reader;
import java.util.TreeSet;
import org.apache.lucene.analysis.Analyzer;

View File

@@ -22,7 +22,6 @@ import java.util.Collections;
import java.util.HashMap;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
/**
@@ -40,9 +39,9 @@ public class TestMorfologikFilterFactory extends BaseTokenStreamTestCase {
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
new MorfologikFilterFactory(new HashMap<String,String>() {{
put("bogusArg", "bogusValue");
}});
HashMap<String,String> map = new HashMap<String,String>();
map.put("bogusArg", "bogusValue");
new MorfologikFilterFactory(map);
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));