SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1035996 13f79535-47bb-0310-9956-ffa450edef68
2010-11-17 12:26:15 +00:00 · 2010-11-17 12:26:15 +00:00 · de3d057abc
parent 204abfb17d
commit de3d057abc
5 changed files with 113 additions and 22 deletions
--- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
+++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java
@ -17,8 +17,6 @@ package org.apache.lucene.analysis.pl;
 * limitations under the License.
 */
 import java.io.BufferedInputStream;
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
@ -50,6 +48,9 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
  /** File containing default Polish stopwords. */
  public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
  /** File containing default Polish stemmer table. */
  public final static String DEFAULT_STEMMER_FILE = "stemmer_20000.tbl";
  /**
   * Returns an unmodifiable instance of the default stop words set.
   * @return default stop words set.
@ -76,16 +77,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
        throw new RuntimeException("Unable to load default stopword set", ex);
      }
      InputStream stream = PolishAnalyzer.class.getResourceAsStream("stemmer_20000.tbl");
      try {
-        DataInputStream in = new DataInputStream(new BufferedInputStream(stream));
+        DEFAULT_TABLE = StempelStemmer.load(PolishAnalyzer.class.getResourceAsStream(DEFAULT_STEMMER_FILE));
        String method = in.readUTF().toUpperCase();
        if (method.indexOf('M') < 0) {
          DEFAULT_TABLE = new org.egothor.stemmer.Trie(in);
        } else {
          DEFAULT_TABLE = new org.egothor.stemmer.MultiTrie2(in);
        }
        in.close();
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
--- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java
+++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java
@ -20,6 +20,7 @@ import java.io.BufferedInputStream;
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Locale;
 import org.egothor.stemmer.Diff;
 import org.egothor.stemmer.Trie;
@ -45,17 +46,7 @@ public class StempelStemmer {
   * @param stemmerTable stemmer table.
   */
  public StempelStemmer(InputStream stemmerTable) throws IOException {
-    if (stemmerTable == null) return;
+    this(load(stemmerTable));
    DataInputStream in = new DataInputStream(new BufferedInputStream(
        stemmerTable));
    String method = in.readUTF().toUpperCase();
    if (method.indexOf('M') < 0) {
      stemmer = new org.egothor.stemmer.Trie(in);
    } else {
      stemmer = new org.egothor.stemmer.MultiTrie2(in);
    }
    in.close();
  }
  /**
@ -67,6 +58,24 @@ public class StempelStemmer {
    this.stemmer = stemmer;
  }
  /**
   * Load a stemmer table from an inputstream.
   */
  public static Trie load(InputStream stemmerTable) throws IOException {
    DataInputStream in = null;
    try {
      in = new DataInputStream(new BufferedInputStream(stemmerTable));
      String method = in.readUTF().toUpperCase(Locale.ENGLISH);
      if (method.indexOf('M') < 0) {
        return new org.egothor.stemmer.Trie(in);
      } else {
        return new org.egothor.stemmer.MultiTrie2(in);
      }
    } finally {
      in.close();
    }
  }
  /**
   * Stem a word. 
   * 
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -305,6 +305,8 @@ New Features
 * SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm 
  with good results for most languages.  (Tom Burton-West via rmuir)
 * SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
 Optimizations
 ----------------------
--- a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java
+++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java
@ -0,0 +1,49 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.stempel.StempelFilter;
 import org.apache.lucene.analysis.stempel.StempelStemmer;
 import org.apache.solr.common.ResourceLoader;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.util.plugin.ResourceLoaderAware;
 import org.egothor.stemmer.Trie;
 /**
 * Factory for {@link StempelFilter} using a Polish stemming table.
 */
 public class StempelPolishStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
  private Trie stemmer = null;
  private static final String STEMTABLE = "org/apache/lucene/analysis/pl/stemmer_20000.tbl";
  public TokenStream create(TokenStream input) {
    return new StempelFilter(input, new StempelStemmer(stemmer));
  }
  public void inform(ResourceLoader loader) {
    try {
      stemmer = StempelStemmer.load(loader.openResource(STEMTABLE));
    } catch (IOException e) {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Could not load stem table: " + STEMTABLE);
    }
  }
 }
--- a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java
+++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java
@ -0,0 +1,38 @@
 package org.apache.solr.analysis;
 import java.io.StringReader;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.solr.core.SolrResourceLoader;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 * Tests for {@link StempelPolishStemFilterFactory}
 */
 public class TestStempelPolishStemFilterFactory extends BaseTokenTestCase {
  public void testBasics() throws Exception {
    StringReader document = new StringReader("studenta studenci");
    StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory();
    factory.inform(new SolrResourceLoader(null, null));
    TokenStream ts = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, document));
    assertTokenStreamContents(ts,
        new String[] { "student", "student" });
  }
 }