Add Snowball stemmer as analyzer and filter

This commit is contained in:
harryf 2011-01-06 01:44:11 +01:00 committed by kimchy
parent 66d63055df
commit 4a45df88c6
5 changed files with 150 additions and 1 deletions

View File

@ -5,6 +5,7 @@
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-core/jars/lucene-core-3.0.3.jar!/" />
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-queries/jars/lucene-queries-3.0.3.jar!/" />
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-analyzers/jars/lucene-analyzers-3.0.3.jar!/" />
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-snowball/jars/lucene-snowball-3.0.3.jar!/" />
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-highlighter/jars/lucene-highlighter-3.0.3.jar!/" />
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-memory/jars/lucene-memory-3.0.3.jar!/" />
</CLASSES>

View File

@ -39,6 +39,7 @@ dependencies {
compile 'org.apache.lucene:lucene-core:3.0.3'
compile 'org.apache.lucene:lucene-analyzers:3.0.3'
compile 'org.apache.lucene:lucene-snowball:3.0.3'
compile 'org.apache.lucene:lucene-queries:3.0.3'
compile 'org.apache.lucene:lucene-fast-vector-highlighter:3.0.3'
compile 'org.apache.lucene:lucene-memory:3.0.3'
@ -129,4 +130,4 @@ uploadArchives {
pom.dependencies = pom.dependencies.findAll {dep -> !dep.artifactId.contains('jline') }
}
}
}
}

View File

@ -322,6 +322,8 @@ public class AnalysisModule extends AbstractModule {
private static class ExtendedProcessor extends AnalysisBinderProcessor {
@Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("arabicStem", ArabicStemTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("brazilianStem", BrazilianStemTokenFilterFactory.class);
@ -345,6 +347,7 @@ public class AnalysisModule extends AbstractModule {
@Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
analyzersBindings.processAnalyzer("pattern", PatternAnalyzerProvider.class);
analyzersBindings.processAnalyzer("snowball", SnowballAnalyzerProvider.class);
analyzersBindings.processAnalyzer("arabic", ArabicAnalyzerProvider.class);
analyzersBindings.processAnalyzer("brazilian", BrazilianAnalyzerProvider.class);

View File

@ -0,0 +1,96 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import java.util.Locale;
import java.util.Set;
/**
 * Creates a SnowballAnalyzer initialized with stopwords and Snowball filter. Only
 * supports Dutch, English (default), French, German and German2 where stopwords
 * are readily available. For other languages available with the Lucene Snowball
 * Stemmer, use them directly with the SnowballFilter and a CustomAnalyzer.
 * Configuration of language is done with the "language" attribute of the analyzer;
 * the value is matched case-insensitively against the supported languages.
 * Also supports additional stopwords via "stopwords" attribute
 *
 * The SnowballAnalyzer comes with a StandardFilter, LowerCaseFilter, StopFilter
 * and the SnowballFilter.
 *
 * @author harryf (Harry Fuecks)
 */
public class SnowballAnalyzerProvider extends AbstractIndexAnalyzerProvider<SnowballAnalyzer> {

    /**
     * Languages this provider supports. Each constant carries the exact stemmer name
     * handed to {@link SnowballAnalyzer} and supplies the default stop word set of
     * the matching Lucene language analyzer.
     */
    private enum SupportedAnalyzer {
        DUTCH("Dutch") {
            @Override public Set<?> getStopwords() {
                return DutchAnalyzer.getDefaultStopSet();
            }
        },
        ENGLISH("English") {
            @Override public Set<?> getStopwords() {
                return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
            }
        },
        FRENCH("French") {
            @Override public Set<?> getStopwords() {
                return FrenchAnalyzer.getDefaultStopSet();
            }
        },
        GERMAN("German") {
            @Override public Set<?> getStopwords() {
                return GermanAnalyzer.getDefaultStopSet();
            }
        },
        // German2 is a separate Snowball stemmer but shares the German stop words.
        GERMAN2("German2") {
            @Override public Set<?> getStopwords() {
                return GermanAnalyzer.getDefaultStopSet();
            }
        };

        /** Canonically-cased stemmer name, e.g. "English" or "German2". */
        private final String stemmerName;

        SupportedAnalyzer(String stemmerName) {
            this.stemmerName = stemmerName;
        }

        /** Returns the default stop words for this language. */
        public abstract Set<?> getStopwords();

        /**
         * Resolves a configured language name, ignoring case.
         *
         * @param language the value of the "language" setting
         * @return the matching supported language
         * @throws IllegalArgumentException if the language is not one of the supported ones
         */
        static SupportedAnalyzer forLanguage(String language) {
            try {
                // Locale.ROOT keeps the mapping stable regardless of the JVM default
                // locale (a Turkish locale would upper-case 'i' to a dotted capital).
                return valueOf(language.toUpperCase(Locale.ROOT));
            } catch (IllegalArgumentException e) {
                StringBuilder supported = new StringBuilder();
                for (SupportedAnalyzer candidate : values()) {
                    if (supported.length() > 0) {
                        supported.append(", ");
                    }
                    supported.append(candidate.stemmerName);
                }
                throw new IllegalArgumentException("Unsupported snowball analyzer language [" + language
                        + "], supported languages are [" + supported + "]", e);
            }
        }
    }

    private final SnowballAnalyzer analyzer;

    /**
     * Builds the analyzer from the "language" setting (default "English") and the
     * stop words resolved by {@code Analysis.parseStopWords}, which is given the
     * language's default stop set as its second argument.
     *
     * @throws IllegalArgumentException if the configured language is not supported
     */
    @Inject public SnowballAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        SupportedAnalyzer supported = SupportedAnalyzer.forLanguage(settings.get("language", "English"));
        Set<?> stopWords = Analysis.parseStopWords(settings, supported.getStopwords());
        // Pass the canonical name rather than the raw setting: Lucene resolves the
        // stemmer by a case-sensitive name, so "english" would pass the enum lookup
        // above but fail once the stemmer is loaded.
        analyzer = new SnowballAnalyzer(Lucene.VERSION, supported.stemmerName, stopWords);
    }

    /** Returns the analyzer instance built in the constructor. */
    @Override public SnowballAnalyzer get() {
        return this.analyzer;
    }
}

View File

@ -0,0 +1,48 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
/**
 * Token filter factory that wraps a token stream in a Lucene {@link SnowballFilter}.
 * The stemmer is selected with the "language" setting, which defaults to "English"
 * (the same default the snowball analyzer uses) and is passed to the filter as-is.
 *
 * Real work actually done here by Sebastian on the ElasticSearch mailing list
 * http://elasticsearch-users.115913.n3.nabble.com/Using-the-Snowball-stemmers-tp2126106p2127111.html
 * @author harryf (Harry Fuecks)
 */
public class SnowballTokenFilterFactory extends AbstractTokenFilterFactory {

    /** Stemmer name handed to every {@link SnowballFilter} this factory creates. */
    private final String language;

    /**
     * Reads the "language" setting, falling back to "English" when it is absent so
     * that an unconfigured filter does not carry a null stemmer name into
     * {@link #create(TokenStream)}.
     */
    @Inject public SnowballTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name);
        this.language = settings.get("language", "English");
    }

    /**
     * Wraps the given stream in a Snowball stemming filter for the configured language.
     *
     * @param tokenStream the stream to stem
     * @return a new {@link SnowballFilter} over {@code tokenStream}
     */
    @Override public TokenStream create(TokenStream tokenStream) {
        return new SnowballFilter(tokenStream, language);
    }
}