Add Snowball stemmer as analyzer and filter
This commit is contained in:
parent
66d63055df
commit
4a45df88c6
|
@ -5,6 +5,7 @@
|
|||
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-core/jars/lucene-core-3.0.3.jar!/" />
|
||||
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-queries/jars/lucene-queries-3.0.3.jar!/" />
|
||||
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-analyzers/jars/lucene-analyzers-3.0.3.jar!/" />
|
||||
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-snowball/jars/lucene-snowball-3.0.3.jar!/" />
|
||||
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-highlighter/jars/lucene-highlighter-3.0.3.jar!/" />
|
||||
<root url="jar://$GRADLE_REPOSITORY$/org.apache.lucene/lucene-memory/jars/lucene-memory-3.0.3.jar!/" />
|
||||
</CLASSES>
|
||||
|
|
|
@ -39,6 +39,7 @@ dependencies {
|
|||
|
||||
compile 'org.apache.lucene:lucene-core:3.0.3'
|
||||
compile 'org.apache.lucene:lucene-analyzers:3.0.3'
|
||||
compile 'org.apache.lucene:lucene-snowball:3.0.3'
|
||||
compile 'org.apache.lucene:lucene-queries:3.0.3'
|
||||
compile 'org.apache.lucene:lucene-fast-vector-highlighter:3.0.3'
|
||||
compile 'org.apache.lucene:lucene-memory:3.0.3'
|
||||
|
@ -129,4 +130,4 @@ uploadArchives {
|
|||
pom.dependencies = pom.dependencies.findAll {dep -> !dep.artifactId.contains('jline') }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -322,6 +322,8 @@ public class AnalysisModule extends AbstractModule {
|
|||
|
||||
private static class ExtendedProcessor extends AnalysisBinderProcessor {
|
||||
@Override public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) {
|
||||
tokenFiltersBindings.processTokenFilter("snowball", SnowballTokenFilterFactory.class);
|
||||
|
||||
tokenFiltersBindings.processTokenFilter("arabicStem", ArabicStemTokenFilterFactory.class);
|
||||
tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
|
||||
tokenFiltersBindings.processTokenFilter("brazilianStem", BrazilianStemTokenFilterFactory.class);
|
||||
|
@ -345,6 +347,7 @@ public class AnalysisModule extends AbstractModule {
|
|||
|
||||
@Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
|
||||
analyzersBindings.processAnalyzer("pattern", PatternAnalyzerProvider.class);
|
||||
analyzersBindings.processAnalyzer("snowball", SnowballAnalyzerProvider.class);
|
||||
|
||||
analyzersBindings.processAnalyzer("arabic", ArabicAnalyzerProvider.class);
|
||||
analyzersBindings.processAnalyzer("brazilian", BrazilianAnalyzerProvider.class);
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.de.GermanAnalyzer;
|
||||
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
|
||||
import org.apache.lucene.analysis.nl.DutchAnalyzer;
|
||||
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.lucene.Lucene;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Creates a SnowballAnalyzer initialized with stopwords and Snowball filter. Only
|
||||
* supports Dutch, English (default), French, German and German2 where stopwords
|
||||
* are readily available. For other languages available with the Lucene Snowball
|
||||
* Stemmer, use them directly with the SnowballFilter and a CustomAnalyzer.
|
||||
* Configuration of language is done with the "language" attribute or the analyzer.
|
||||
* Also supports additional stopwords via "stopwords" attribute
|
||||
*
|
||||
* The SnowballAnalyzer comes with a StandardFilter, LowerCaseFilter, StopFilter
|
||||
* and the SnowballFilter.
|
||||
*
|
||||
* @author harryf (Harry Fuecks)
|
||||
*/
|
||||
public class SnowballAnalyzerProvider extends AbstractIndexAnalyzerProvider<SnowballAnalyzer> {
|
||||
|
||||
private enum SupportedAnalyzer {
|
||||
DUTCH {
|
||||
public Set<?> getStopwords() {
|
||||
return DutchAnalyzer.getDefaultStopSet();
|
||||
}
|
||||
},
|
||||
ENGLISH {
|
||||
public Set<?> getStopwords() {
|
||||
return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
|
||||
}
|
||||
},
|
||||
FRENCH {
|
||||
public Set<?> getStopwords() {
|
||||
return FrenchAnalyzer.getDefaultStopSet();
|
||||
}
|
||||
},
|
||||
GERMAN {
|
||||
public Set<?> getStopwords() {
|
||||
return GermanAnalyzer.getDefaultStopSet();
|
||||
}
|
||||
},
|
||||
GERMAN2 {
|
||||
public Set<?> getStopwords() {
|
||||
return GermanAnalyzer.getDefaultStopSet();
|
||||
}
|
||||
};
|
||||
|
||||
public abstract Set<?> getStopwords();
|
||||
}
|
||||
|
||||
private final SnowballAnalyzer analyzer;
|
||||
|
||||
@Inject public SnowballAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name);
|
||||
|
||||
String language = settings.get("language", "English");
|
||||
Set<?> stopWords = Analysis.parseStopWords(settings,
|
||||
SupportedAnalyzer.valueOf(language.toUpperCase()).getStopwords());
|
||||
|
||||
analyzer = new SnowballAnalyzer(Lucene.VERSION, language, stopWords);
|
||||
}
|
||||
|
||||
@Override public SnowballAnalyzer get() {
|
||||
return this.analyzer;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
* Licensed to Elastic Search and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. Elastic Search licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.elasticsearch.common.inject.Inject;
|
||||
import org.elasticsearch.common.inject.assistedinject.Assisted;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.settings.IndexSettings;
|
||||
|
||||
/**
|
||||
* Real work actually done here by Sebastian on the ElasticSearch mailing list
|
||||
* http://elasticsearch-users.115913.n3.nabble.com/Using-the-Snowball-stemmers-tp2126106p2127111.html
|
||||
* @author harryf (Harry Fuecks)
|
||||
*/
|
||||
public class SnowballTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private String language;
|
||||
|
||||
@Inject public SnowballTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name);
|
||||
this.language = settings.get("language");
|
||||
}
|
||||
|
||||
@Override public TokenStream create(TokenStream tokenStream) {
|
||||
return new SnowballFilter(tokenStream, language);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue