added Czech Stemmer, closes #635

2011-01-20 04:14:04 +01:00 · 2011-01-20 04:14:04 +01:00 · e0fa15a365
parent 04f8b55686
commit e0fa15a365
7 changed files with 273 additions and 0 deletions
--- a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
+++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemFilter.java
@ -0,0 +1,53 @@
+package org.apache.lucene.analysis.cz;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link TokenFilter} that applies {@link CzechStemmer} to stem Czech words.
+ *
+ * <p><b>NOTE</b>: Input is expected to be in lowercase,
+ * but with diacritical marks</p>
+ */
+// LUCENE MONITOR
+public final class CzechStemFilter extends TokenFilter {
+  private final CzechStemmer stemmer;
+  private final TermAttribute termAtt;
+
+  public CzechStemFilter(TokenStream input) {
+    super(input);
+    stemmer = new CzechStemmer();
+    termAtt = addAttribute(TermAttribute.class);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+      termAtt.setTermLength(newlen);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
--- a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemmer.java
+++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/cz/CzechStemmer.java
@ -0,0 +1,182 @@
+package org.apache.lucene.analysis.cz;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Light Stemmer for Czech.
+ * <p>
+ * Implements the algorithm described in:  
+ * <i>
+ * Indexing and stemming approaches for the Czech language
+ * </i>
+ * http://portal.acm.org/citation.cfm?id=1598600
+ * </p>
+ */
+// LUCENE MONITOR
+public class CzechStemmer {
+  
+  /**
+   * Stem an input buffer of Czech text.
+   * 
+   * @param s input buffer
+   * @param len length of input buffer
+   * @return length of input buffer after normalization
+   * 
+   * <p><b>NOTE</b>: Input is expected to be in lowercase, 
+   * but with diacritical marks</p>
+   */
+  public int stem(char s[], int len) {
+    len = removeCase(s, len);
+    len = removePossessives(s, len);
+    len = normalize(s, len);
+    return len;
+  }
+
+  private boolean endsWith(char s[], int len, String suffix) {
+     int suffixLen = suffix.length();
+     if (suffixLen > len)
+       return false;
+
+     for (int i = suffixLen - 1; i >= 0; i--)
+       if (s[len - (suffixLen - i)] != suffix.charAt(i))
+         return false;
+
+     return true;
+   }  
+
+  private int removeCase(char s[], int len) {  
+    if (len > 7 && endsWith(s, len, "atech"))
+      return len - 5;
+    
+    if (len > 6 && 
+        (endsWith(s, len,"ětem") ||
+        endsWith(s, len,"etem") ||
+        endsWith(s, len,"atům")))
+      return len - 4;
+        
+    if (len > 5 && 
+        (endsWith(s, len, "ech") ||
+        endsWith(s, len, "ich") ||
+        endsWith(s, len, "ích") ||
+        endsWith(s, len, "ého") ||
+        endsWith(s, len, "ěmi") ||
+        endsWith(s, len, "emi") ||
+        endsWith(s, len, "ému") ||
+        endsWith(s, len, "ěte") ||
+        endsWith(s, len, "ete") ||
+        endsWith(s, len, "ěti") ||
+        endsWith(s, len, "eti") ||
+        endsWith(s, len, "ího") ||
+        endsWith(s, len, "iho") ||
+        endsWith(s, len, "ími") ||
+        endsWith(s, len, "ímu") ||
+        endsWith(s, len, "imu") ||
+        endsWith(s, len, "ách") ||
+        endsWith(s, len, "ata") ||
+        endsWith(s, len, "aty") ||
+        endsWith(s, len, "ých") ||
+        endsWith(s, len, "ama") ||
+        endsWith(s, len, "ami") ||
+        endsWith(s, len, "ové") ||
+        endsWith(s, len, "ovi") ||
+        endsWith(s, len, "ými")))
+      return len - 3;
+    
+    if (len > 4 && 
+        (endsWith(s, len, "em") ||
+        endsWith(s, len, "es") ||
+        endsWith(s, len, "ém") ||
+        endsWith(s, len, "ím") ||
+        endsWith(s, len, "ům") ||
+        endsWith(s, len, "at") ||
+        endsWith(s, len, "ám") ||
+        endsWith(s, len, "os") ||
+        endsWith(s, len, "us") ||
+        endsWith(s, len, "ým") ||
+        endsWith(s, len, "mi") ||
+        endsWith(s, len, "ou")))
+      return len - 2;
+    
+    if (len > 3) {
+      switch (s[len - 1]) {
+        case 'a':
+        case 'e':
+        case 'i':
+        case 'o':
+        case 'u':
+        case 'ů':
+        case 'y':
+        case 'á':
+        case 'é':
+        case 'í':
+        case 'ý':
+        case 'ě':
+          return len - 1;
+      }
+    }
+    
+    return len;
+  }
+  
+  private int removePossessives(char s[], int len) {
+    if (len > 5 &&
+        (endsWith(s, len, "ov") ||
+        endsWith(s, len, "in") ||
+        endsWith(s, len, "ův")))
+      return len - 2;
+
+    return len;
+  }
+  
+  private int normalize(char s[], int len) {
+    if (endsWith(s, len, "čt")) { // čt -> ck
+      s[len - 2] = 'c';
+      s[len - 1] = 'k';
+      return len;
+    }
+    
+    if (endsWith(s, len, "št")) { // št -> sk
+      s[len - 2] = 's';
+      s[len - 1] = 'k';
+      return len;
+    }
+    
+    switch(s[len - 1]) {
+      case 'c': // [cč] -> k
+      case 'č':
+        s[len - 1] = 'k';
+        return len;
+      case 'z': // [zž] -> h
+      case 'ž':
+        s[len - 1] = 'h';
+        return len;
+    }
+    
+    if (len > 1 && s[len - 2] == 'e') {
+      s[len - 2] = s[len - 1]; // e* > *
+      return len - 1;
+    }
+    
+    if (len > 2 && s[len - 2] == 'ů') {
+      s[len - 2] = 'o'; // *ů* -> *o*
+      return len;
+    }
+
+    return len;
+  }
+}
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
@ -328,6 +328,8 @@ public class AnalysisModule extends AbstractModule {
            tokenFiltersBindings.processTokenFilter("arabic_stem", ArabicStemTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("brazilianStem", BrazilianStemTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("brazilian_stem", BrazilianStemTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("czechStem", CzechStemTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("czech_stem", CzechStemTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("dutchStem", DutchStemTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("dutch_stem", DutchStemTokenFilterFactory.class);
            tokenFiltersBindings.processTokenFilter("frenchStem", FrenchStemTokenFilterFactory.class);
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CzechStemTokenFilterFactory.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/CzechStemTokenFilterFactory.java
@ -0,0 +1,20 @@
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cz.CzechStemFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+public class CzechStemTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    @Inject public CzechStemTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name);
+    }
+
+    @Override public TokenStream create(TokenStream tokenStream) {
+        return new CzechStemFilter(tokenStream);
+    }
+}
--- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java
+++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/AnalysisModuleTests.java
@ -82,5 +82,14 @@ public class AnalysisModuleTests {
        // verify aliases
        analyzer = analysisService.analyzer("alias1").analyzer();
        assertThat(analyzer, instanceOf(StandardAnalyzer.class));
+
+        // verify Czech stemmer
+        analyzer = analysisService.analyzer("czechAnalyzerWithStemmer").analyzer();
+        assertThat(analyzer, instanceOf(CustomAnalyzer.class));
+        CustomAnalyzer czechstemmeranalyzer = (CustomAnalyzer) analyzer;
+        assertThat(czechstemmeranalyzer.tokenizerFactory(), instanceOf(StandardTokenizerFactory.class));
+        assertThat(czechstemmeranalyzer.tokenFilters().length, equalTo(4));
+        assertThat(czechstemmeranalyzer.tokenFilters()[3], instanceOf(CzechStemTokenFilterFactory.class));
+
    }
 }
--- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json
+++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.json
@ -37,6 +37,10 @@
                "custom2" : {
                    "tokenizer" : "standard",
                    "char_filter" : ["html_strip", "my_html"]
+                },
+                "czechAnalyzerWithStemmer" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "lowercase", "stop", "czech_stem"]
                }
            }
        }
--- a/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml
+++ b/modules/elasticsearch/src/test/java/org/elasticsearch/index/analysis/test1.yml
@ -27,3 +27,6 @@ index :
      custom2 :
        tokenizer : standard
        char_filter : [html_strip, my_html]
+      czechAnalyzerWithStemmer :
+        tokenizer : standard
+        filter : [standard, lowercase, stop, czech_stem]