mirror of https://github.com/apache/lucene.git
SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1035996 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
204abfb17d
commit
de3d057abc
|
@ -17,8 +17,6 @@ package org.apache.lucene.analysis.pl;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.DataInputStream;
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
@ -50,6 +48,9 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
/** File containing default Polish stopwords. */
|
/** File containing default Polish stopwords. */
|
||||||
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
public final static String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
/** File containing default Polish stemmer table. */
|
||||||
|
public final static String DEFAULT_STEMMER_FILE = "stemmer_20000.tbl";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns an unmodifiable instance of the default stop words set.
|
* Returns an unmodifiable instance of the default stop words set.
|
||||||
* @return default stop words set.
|
* @return default stop words set.
|
||||||
|
@ -76,16 +77,8 @@ public final class PolishAnalyzer extends StopwordAnalyzerBase {
|
||||||
throw new RuntimeException("Unable to load default stopword set", ex);
|
throw new RuntimeException("Unable to load default stopword set", ex);
|
||||||
}
|
}
|
||||||
|
|
||||||
InputStream stream = PolishAnalyzer.class.getResourceAsStream("stemmer_20000.tbl");
|
|
||||||
try {
|
try {
|
||||||
DataInputStream in = new DataInputStream(new BufferedInputStream(stream));
|
DEFAULT_TABLE = StempelStemmer.load(PolishAnalyzer.class.getResourceAsStream(DEFAULT_STEMMER_FILE));
|
||||||
String method = in.readUTF().toUpperCase();
|
|
||||||
if (method.indexOf('M') < 0) {
|
|
||||||
DEFAULT_TABLE = new org.egothor.stemmer.Trie(in);
|
|
||||||
} else {
|
|
||||||
DEFAULT_TABLE = new org.egothor.stemmer.MultiTrie2(in);
|
|
||||||
}
|
|
||||||
in.close();
|
|
||||||
} catch (IOException ex) {
|
} catch (IOException ex) {
|
||||||
// default set should always be present as it is part of the
|
// default set should always be present as it is part of the
|
||||||
// distribution (JAR)
|
// distribution (JAR)
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.io.BufferedInputStream;
|
||||||
import java.io.DataInputStream;
|
import java.io.DataInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.egothor.stemmer.Diff;
|
import org.egothor.stemmer.Diff;
|
||||||
import org.egothor.stemmer.Trie;
|
import org.egothor.stemmer.Trie;
|
||||||
|
@ -45,17 +46,7 @@ public class StempelStemmer {
|
||||||
* @param stemmerTable stemmer table.
|
* @param stemmerTable stemmer table.
|
||||||
*/
|
*/
|
||||||
public StempelStemmer(InputStream stemmerTable) throws IOException {
|
public StempelStemmer(InputStream stemmerTable) throws IOException {
|
||||||
if (stemmerTable == null) return;
|
this(load(stemmerTable));
|
||||||
|
|
||||||
DataInputStream in = new DataInputStream(new BufferedInputStream(
|
|
||||||
stemmerTable));
|
|
||||||
String method = in.readUTF().toUpperCase();
|
|
||||||
if (method.indexOf('M') < 0) {
|
|
||||||
stemmer = new org.egothor.stemmer.Trie(in);
|
|
||||||
} else {
|
|
||||||
stemmer = new org.egothor.stemmer.MultiTrie2(in);
|
|
||||||
}
|
|
||||||
in.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -67,6 +58,24 @@ public class StempelStemmer {
|
||||||
this.stemmer = stemmer;
|
this.stemmer = stemmer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load a stemmer table from an inputstream.
|
||||||
|
*/
|
||||||
|
public static Trie load(InputStream stemmerTable) throws IOException {
|
||||||
|
DataInputStream in = null;
|
||||||
|
try {
|
||||||
|
in = new DataInputStream(new BufferedInputStream(stemmerTable));
|
||||||
|
String method = in.readUTF().toUpperCase(Locale.ENGLISH);
|
||||||
|
if (method.indexOf('M') < 0) {
|
||||||
|
return new org.egothor.stemmer.Trie(in);
|
||||||
|
} else {
|
||||||
|
return new org.egothor.stemmer.MultiTrie2(in);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stem a word.
|
* Stem a word.
|
||||||
*
|
*
|
||||||
|
|
|
@ -305,6 +305,8 @@ New Features
|
||||||
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm
|
* SOLR-2211: Added UAX29TokenizerFactory, which implements UAX#29, a unicode algorithm
|
||||||
with good results for most languages. (Tom Burton-West via rmuir)
|
with good results for most languages. (Tom Burton-West via rmuir)
|
||||||
|
|
||||||
|
* SOLR-2237: Added StempelPolishStemFilterFactory to contrib/analysis-extras (rmuir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.stempel.StempelFilter;
|
||||||
|
import org.apache.lucene.analysis.stempel.StempelStemmer;
|
||||||
|
import org.apache.solr.common.ResourceLoader;
|
||||||
|
import org.apache.solr.common.SolrException;
|
||||||
|
import org.apache.solr.common.SolrException.ErrorCode;
|
||||||
|
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||||
|
import org.egothor.stemmer.Trie;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link StempelFilter} using a Polish stemming table.
|
||||||
|
*/
|
||||||
|
public class StempelPolishStemFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
|
||||||
|
private Trie stemmer = null;
|
||||||
|
private static final String STEMTABLE = "org/apache/lucene/analysis/pl/stemmer_20000.tbl";
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream input) {
|
||||||
|
return new StempelFilter(input, new StempelStemmer(stemmer));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void inform(ResourceLoader loader) {
|
||||||
|
try {
|
||||||
|
stemmer = StempelStemmer.load(loader.openResource(STEMTABLE));
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new SolrException(ErrorCode.SERVER_ERROR, "Could not load stem table: " + STEMTABLE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,38 @@
|
||||||
|
package org.apache.solr.analysis;
|
||||||
|
|
||||||
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
|
import org.apache.solr.core.SolrResourceLoader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests for {@link StempelPolishStemFilterFactory}
|
||||||
|
*/
|
||||||
|
public class TestStempelPolishStemFilterFactory extends BaseTokenTestCase {
|
||||||
|
public void testBasics() throws Exception {
|
||||||
|
StringReader document = new StringReader("studenta studenci");
|
||||||
|
StempelPolishStemFilterFactory factory = new StempelPolishStemFilterFactory();
|
||||||
|
factory.inform(new SolrResourceLoader(null, null));
|
||||||
|
TokenStream ts = factory.create(new WhitespaceTokenizer(DEFAULT_VERSION, document));
|
||||||
|
assertTokenStreamContents(ts,
|
||||||
|
new String[] { "student", "student" });
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue