From de3d057abc9f94cc80b0d6817b2c56fecf0c6b54 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 17 Nov 2010 12:26:15 +0000 Subject: [PATCH] SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1035996 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/pl/PolishAnalyzer.java | 15 ++---- .../analysis/stempel/StempelStemmer.java | 31 +++++++----- solr/CHANGES.txt | 2 + .../StempelPolishStemFilterFactory.java | 49 +++++++++++++++++++ .../TestStempelPolishStemFilterFactory.java | 38 ++++++++++++++ 5 files changed, 113 insertions(+), 22 deletions(-) create mode 100644 solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java create mode 100644 solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java diff --git a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java index 0b8c4e815a5..80d9d6d3a11 100644 --- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java +++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/pl/PolishAnalyzer.java @@ -17,8 +17,6 @@ package org.apache.lucene.analysis.pl; * limitations under the License. */ -import java.io.BufferedInputStream; -import java.io.DataInputStream; import java.io.IOException; import java.io.InputStream; import java.io.Reader; @@ -50,6 +48,9 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { /** File containing default Polish stopwords. */ public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt"; + /** File containing default Polish stemmer table. */ + public final static String DEFAULT_STEMMER_FILE = "stemmer_20000.tbl"; + /** * Returns an unmodifiable instance of the default stop words set. * @return default stop words set. @@ -76,16 +77,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase { throw new RuntimeException("Unable to load default stopword set", ex); } - InputStream stream = PolishAnalyzer.class.getResourceAsStream("stemmer_20000.tbl"); try { - DataInputStream in = new DataInputStream(new BufferedInputStream(stream)); - String method = in.readUTF().toUpperCase(); - if (method.indexOf('M') < 0) { - DEFAULT_TABLE = new org.egothor.stemmer.Trie(in); - } else { - DEFAULT_TABLE = new org.egothor.stemmer.MultiTrie2(in); - } - in.close(); + DEFAULT_TABLE = StempelStemmer.load(PolishAnalyzer.class.getResourceAsStream(DEFAULT_STEMMER_FILE)); } catch (IOException ex) { // default set should always be present as it is part of the // distribution (JAR) diff --git a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java index 2da94e4fa2d..bf31660b37a 100644 --- a/modules/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java +++ b/modules/analysis/stempel/src/java/org/apache/lucene/analysis/stempel/StempelStemmer.java @@ -20,6 +20,7 @@ import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.IOException; import java.io.InputStream; +import java.util.Locale; import org.egothor.stemmer.Diff; import org.egothor.stemmer.Trie; @@ -45,17 +46,7 @@ public class StempelStemmer { * @param stemmerTable stemmer table. */ public StempelStemmer(InputStream stemmerTable) throws IOException { - if (stemmerTable == null) return; - - DataInputStream in = new DataInputStream(new BufferedInputStream( - stemmerTable)); - String method = in.readUTF().toUpperCase(); - if (method.indexOf('M') < 0) { - stemmer = new org.egothor.stemmer.Trie(in); - } else { - stemmer = new org.egothor.stemmer.MultiTrie2(in); - } - in.close(); + this(load(stemmerTable)); } /** @@ -66,6 +57,24 @@ public class StempelStemmer { public StempelStemmer(Trie stemmer) { this.stemmer = stemmer; } + + /** + * Load a stemmer table from an inputstream. + */ + public static Trie load(InputStream stemmerTable) throws IOException { + DataInputStream in = null; + try { + in = new DataInputStream(new BufferedInputStream(stemmerTable)); + String method = in.readUTF().toUpperCase(Locale.ENGLISH); + if (method.indexOf('M') < 0) { + return new org.egothor.stemmer.Trie(in); + } else { + return new org.egothor.stemmer.MultiTrie2(in); + } + } finally { + in.close(); + } + } /** * Stem a word. diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 78cb530bcde..8d37e9518c6 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -305,6 +305,8 @@ New Features * SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm with good results for most languages. (Tom Burton-West via rmuir) +* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir) + Optimizations ---------------------- diff --git a/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java new file mode 100644 index 00000000000..9467d89a0e6 --- /dev/null +++ b/solr/contrib/analysis-extras/src/java/org/apache/solr/analysis/StempelPolishStemFilterFactory.java @@ -0,0 +1,49 @@ +package org.apache.solr.analysis; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.stempel.StempelFilter; +import org.apache.lucene.analysis.stempel.StempelStemmer; +import org.apache.solr.common.ResourceLoader; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrException.ErrorCode; +import org.apache.solr.util.plugin.ResourceLoaderAware; +import org.egothor.stemmer.Trie; + +/** + * Factory for {@link StempelFilter} using a Polish stemming table. + */ +public class StempelPolishStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { + private Trie stemmer = null; + private static final String STEMTABLE = "org/apache/lucene/analysis/pl/stemmer_20000.tbl"; + + public TokenStream create(TokenStream input) { + return new StempelFilter(input, new StempelStemmer(stemmer)); + } + + public void inform(ResourceLoader loader) { + try { + stemmer = StempelStemmer.load(loader.openResource(STEMTABLE)); + } catch (IOException e) { + throw new SolrException(ErrorCode.SERVER_ERROR, "Could not load stem table: " + STEMTABLE); + } + } +} diff --git a/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java new file mode 100644 index 00000000000..dee9aed3a80 --- /dev/null +++ b/solr/contrib/analysis-extras/src/test/org/apache/solr/analysis/TestStempelPolishStemFilterFactory.java @@ -0,0 +1,38 @@ +package org.apache.solr.analysis; + +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.solr.core.SolrResourceLoader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests for {@link StempelPolishStemFilterFactory} + */ +public class TestStempelPolishStemFilterFactory extends BaseTokenTestCase { + public void testBasics() throws Exception { + StringReader document = new StringReader("studenta studenci"); + StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory(); + factory.inform(new SolrResourceLoader(null, null)); + TokenStream ts = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, document)); + assertTokenStreamContents(ts, + new String[] { "student", "student" }); + } +}