diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemFilter.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemFilter.java new file mode 100644 index 00000000000..bf3cd672741 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemFilter.java @@ -0,0 +1,53 @@ +package org.apache.lucene.analysis.cz; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words. + * + *
NOTE: Input is expected to be in lowercase, + * but with diacritical marks
+ */ +// LUCENE MONITOR +public final class CzechStemFilter extends TokenFilter { + private final CzechStemmer stemmer; + private final TermAttribute termAtt; + + public CzechStemFilter(TokenStream input) { + super(input); + stemmer = new CzechStemmer(); + termAtt = addAttribute(TermAttribute.class); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength()); + termAtt.setTermLength(newlen); + return true; + } else { + return false; + } + } +} \ No newline at end of file diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemmer.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemmer.java new file mode 100644 index 00000000000..a658ab0ceca --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemmer.java @@ -0,0 +1,182 @@ +package org.apache.lucene.analysis.cz; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Light Stemmer for Czech. 
+ *+ * Implements the algorithm described in: + * + * Indexing and stemming approaches for the Czech language + * + * http://portal.acm.org/citation.cfm?id=1598600 + *
/**
 * Light stemmer for Czech.
 *
 * <p>Implements the algorithm described in <i>Indexing and stemming
 * approaches for the Czech language</i>
 * (http://portal.acm.org/citation.cfm?id=1598600).
 *
 * <p>Stemming happens in three passes over the same buffer: case-ending
 * removal, possessive-suffix removal, and a final normalization of the
 * last one or two characters.
 *
 * <p>All Czech letters with diacritics are written as Unicode escapes so the
 * suffix tables do not depend on the encoding the compiler assumes for this
 * source file.
 */
// LUCENE MONITOR
public class CzechStemmer {

    /** Four-letter case endings, stripped only from words longer than 6 chars. */
    private static final String[] CASE_SUFFIXES_4 = {
        "\u011btem", "etem", "at\u016fm"
    };

    /** Three-letter case endings, stripped only from words longer than 5 chars. */
    private static final String[] CASE_SUFFIXES_3 = {
        "ech", "ich", "\u00edch", "\u00e9ho", "\u011bmi", "emi", "\u00e9mu",
        "\u011bte", "ete", "\u011bti", "eti", "\u00edho", "iho", "\u00edmi",
        "\u00edmu", "imu", "\u00e1ch", "ata", "aty", "\u00fdch", "ama", "ami",
        "ov\u00e9", "ovi", "\u00fdmi"
    };

    /** Two-letter case endings, stripped only from words longer than 4 chars. */
    private static final String[] CASE_SUFFIXES_2 = {
        "em", "es", "\u00e9m", "\u00edm", "\u016fm", "at", "\u00e1m", "os",
        "us", "\u00fdm", "mi", "ou"
    };

    /** Single trailing vowels (plain and accented) stripped from words longer than 3 chars. */
    private static final String CASE_VOWELS =
        "aeiou\u016fy\u00e1\u00e9\u00ed\u00fd\u011b";

    /** Possessive endings, stripped only from words longer than 5 chars. */
    private static final String[] POSSESSIVE_SUFFIXES = {
        "ov", "in", "\u016fv"
    };

    /**
     * Stem an input buffer of Czech text. The buffer is modified in place.
     *
     * <p><b>NOTE</b>: input is expected to be lowercase, but with diacritical
     * marks still present.
     *
     * @param s   input buffer
     * @param len number of valid characters in the buffer; an empty term
     *            ({@code len == 0}) is returned unchanged
     * @return length of the stemmed term within the buffer
     */
    public int stem(char[] s, int len) {
        len = removeCase(s, len);
        len = removePossessives(s, len);
        len = normalize(s, len);
        return len;
    }

    /** Tells whether the first {@code len} chars of {@code s} end with {@code suffix}. */
    private static boolean endsWith(char[] s, int len, String suffix) {
        final int suffixLen = suffix.length();
        final int start = len - suffixLen;
        if (start < 0) {
            return false;
        }
        for (int i = 0; i < suffixLen; i++) {
            if (s[start + i] != suffix.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    /** Tells whether the first {@code len} chars of {@code s} end with any of the suffixes. */
    private static boolean endsWithAny(char[] s, int len, String[] suffixes) {
        for (String suffix : suffixes) {
            if (endsWith(s, len, suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Strips at most one case ending, trying the longest endings first. Each
     * tier has its own minimum word length so short words are left alone.
     */
    private static int removeCase(char[] s, int len) {
        if (len > 7 && endsWith(s, len, "atech")) {
            return len - 5;
        }
        if (len > 6 && endsWithAny(s, len, CASE_SUFFIXES_4)) {
            return len - 4;
        }
        if (len > 5 && endsWithAny(s, len, CASE_SUFFIXES_3)) {
            return len - 3;
        }
        if (len > 4 && endsWithAny(s, len, CASE_SUFFIXES_2)) {
            return len - 2;
        }
        if (len > 3 && CASE_VOWELS.indexOf(s[len - 1]) >= 0) {
            return len - 1;
        }
        return len;
    }

    /** Strips a two-letter possessive ending from words longer than 5 chars. */
    private static int removePossessives(char[] s, int len) {
        if (len > 5 && endsWithAny(s, len, POSSESSIVE_SUFFIXES)) {
            return len - 2;
        }
        return len;
    }

    /**
     * Normalizes the tail of the term. Exactly one rule is applied: the first
     * one that matches, in the order written below.
     */
    private static int normalize(char[] s, int len) {
        // An empty term has no last character; without this guard the
        // switch below would index s[-1] and throw.
        if (len < 1) {
            return len;
        }

        if (endsWith(s, len, "\u010dt")) { // c-caron + t -> ck
            s[len - 2] = 'c';
            s[len - 1] = 'k';
            return len;
        }

        if (endsWith(s, len, "\u0161t")) { // s-caron + t -> sk
            s[len - 2] = 's';
            s[len - 1] = 'k';
            return len;
        }

        switch (s[len - 1]) {
            case 'c':      // c or c-caron -> k
            case '\u010d':
                s[len - 1] = 'k';
                return len;
            case 'z':      // z or z-caron -> h
            case '\u017e':
                s[len - 1] = 'h';
                return len;
            default:
                break;
        }

        if (len > 1 && s[len - 2] == 'e') {
            // Drop an 'e' in second-to-last position: e* -> *
            s[len - 2] = s[len - 1];
            return len - 1;
        }

        if (len > 2 && s[len - 2] == '\u016f') {
            // u-ring in second-to-last position becomes 'o'
            s[len - 2] = 'o';
            return len;
        }

        return len;
    }
}
DutchStemTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("frenchStem", FrenchStemTokenFilterFactory.class); diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CzechStemTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CzechStemTokenFilterFactory.java new file mode 100644 index 00000000000..767e706d1ab --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CzechStemTokenFilterFactory.java @@ -0,0 +1,20 @@ +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cz.CzechStemFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +public class CzechStemTokenFilterFactory extends AbstractTokenFilterFactory { + + @Inject public CzechStemTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name); + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new CzechStemFilter(tokenStream); + } +} diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java index 236d07cee96..5815cfef38c 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java @@ -82,5 +82,14 @@ public class AnalysisModuleTests { // verify aliases analyzer = analysisService.analyzer("alias1").analyzer(); assertThat(analyzer, instanceOf(StandardAnalyzer.class)); + + // verify Czech stemmer + 
analyzer = analysisService.analyzer("czechAnalyzerWithStemmer").analyzer(); + assertThat(analyzer, instanceOf(CustomAnalyzer.class)); + CustomAnalyzer czechstemmeranalyzer = (CustomAnalyzer) analyzer; + assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class)); + assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4)); + assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class)); + } } diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json index 66f24fab1f1..aaf859479f7 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json @@ -37,6 +37,10 @@ "custom2" : { "tokenizer" : "standard", "char_filter" : ["html_strip", "my_html"] + }, + "czechAnalyzerWithStemmer" : { + "tokenizer" : "standard", + "filter" : ["standard", "lowercase", "stop", "czech_stem"] } } } diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml index ec8acfddf4d..f06f489d6dd 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml @@ -27,3 +27,6 @@ index : custom2 : tokenizer : standard char_filter : [html_strip, my_html] + czechAnalyzerWithStemmer : + tokenizer : standard + filter : [standard, lowercase, stop, czech_stem]