LUCENE-6875: New Serbian Filter. (Nikola Smolenski via Robert Muir, Dawid Weiss)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1713720 13f79535-47bb-0310-9956-ffa450edef68
Author: Dawid Weiss
Date:   2015-11-10 19:46:52 +00:00
Parent: a50a31c4bd
Commit: 1c2b43b269

6 changed files with 284 additions and 5 deletions

CHANGES.txt

@@ -96,6 +96,9 @@ Changes in Runtime Behavior
New Features

* LUCENE-6875: New Serbian Filter. (Nikola Smolenski via Robert Muir,
  Dawid Weiss)

* LUCENE-6720: New FunctionRangeQuery wrapper around ValueSourceScorer
  (returned from ValueSource/FunctionValues.getRangeScorer()). (David Smiley)

SerbianNormalizationFilterFactory.java

@@ -17,6 +17,7 @@ package org.apache.lucene.analysis.sr;
* limitations under the License.
*/
import java.util.Arrays;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
@@ -31,15 +32,19 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.SerbianNormalizationFilterFactory"
 *         haircut="bald"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
*/
public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
  final String haircut;

  /** Creates a new SerbianNormalizationFilterFactory */
  public SerbianNormalizationFilterFactory(Map<String,String> args) {
    super(args);
    this.haircut = get(args, "haircut", Arrays.asList("bald", "regular"), "bald");
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@@ -47,7 +52,11 @@ public class SerbianNormalizationFilterFactory extends TokenFilterFactory implem
  @Override
  public TokenStream create(TokenStream input) {
    if( this.haircut.equals( "regular" ) ) {
      return new SerbianNormalizationRegularFilter(input);
    } else {
      return new SerbianNormalizationFilter(input);
    }
  }

  @Override

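For orientation, here is a minimal sketch of what the new haircut argument selects when the factory is built programmatically. The args map mirrors what Solr would pass from schema.xml, and the demo class name is made up; this snippet is not part of the commit.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class SerbianHaircutDemo {
  public static void main(String[] argv) throws Exception {
    // luceneMatchVersion is required by the factory superclass.
    Map<String,String> args = new HashMap<>();
    args.put("luceneMatchVersion", Version.LATEST.toString());
    args.put("haircut", "regular"); // the default, "bald", keeps SerbianNormalizationFilter

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("ђура"));
    try (TokenStream stream = new SerbianNormalizationFilterFactory(args).create(tokenizer)) {
      CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term); // prints "đura" with haircut="regular"
      }
      stream.end();
    }
  }
}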
SerbianNormalizationRegularFilter.java

@@ -0,0 +1,165 @@
package org.apache.lucene.analysis.sr;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Normalizes Serbian Cyrillic to Latin.
 *
 * Note that it expects lowercased input.
 */
public final class SerbianNormalizationRegularFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  public SerbianNormalizationRegularFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char buffer[] = termAtt.buffer();
      int length = termAtt.length();
      for (int i = 0; i < length; i++) {
        final char c = buffer[i];
        switch(c) {
          case 'а':
            buffer[i] = 'a';
            break;
          case 'б':
            buffer[i] = 'b';
            break;
          case 'в':
            buffer[i] = 'v';
            break;
          case 'г':
            buffer[i] = 'g';
            break;
          case 'д':
            buffer[i] = 'd';
            break;
          case 'ђ':
            buffer[i] = 'đ';
            break;
          case 'е':
            buffer[i] = 'e';
            break;
          case 'ж':
            buffer[i] = 'ž';
            break;
          case 'з':
            buffer[i] = 'z';
            break;
          case 'и':
            buffer[i] = 'i';
            break;
          case 'ј':
            buffer[i] = 'j';
            break;
          case 'к':
            buffer[i] = 'k';
            break;
          case 'л':
            buffer[i] = 'l';
            break;
          case 'љ':
            // 'љ' maps to the two-character digraph "lj": grow the term
            // buffer by one, shift the tail right, then write both chars.
            buffer = termAtt.resizeBuffer(1+length);
            if (i < length) {
              System.arraycopy(buffer, i, buffer, i+1, (length-i));
            }
            buffer[i] = 'l';
            buffer[++i] = 'j';
            length++;
            break;
          case 'м':
            buffer[i] = 'm';
            break;
          case 'н':
            buffer[i] = 'n';
            break;
          case 'њ':
            // 'њ' maps to the digraph "nj"; same grow-and-shift expansion.
            buffer = termAtt.resizeBuffer(1+length);
            if (i < length) {
              System.arraycopy(buffer, i, buffer, i+1, (length-i));
            }
            buffer[i] = 'n';
            buffer[++i] = 'j';
            length++;
            break;
          case 'о':
            buffer[i] = 'o';
            break;
          case 'п':
            buffer[i] = 'p';
            break;
          case 'р':
            buffer[i] = 'r';
            break;
          case 'с':
            buffer[i] = 's';
            break;
          case 'т':
            buffer[i] = 't';
            break;
          case 'ћ':
            buffer[i] = 'ć';
            break;
          case 'у':
            buffer[i] = 'u';
            break;
          case 'ф':
            buffer[i] = 'f';
            break;
          case 'х':
            buffer[i] = 'h';
            break;
          case 'ц':
            buffer[i] = 'c';
            break;
          case 'ч':
            buffer[i] = 'č';
            break;
          case 'џ':
            // 'џ' maps to the digraph "dž"; same grow-and-shift expansion.
            buffer = termAtt.resizeBuffer(1+length);
            if (i < length) {
              System.arraycopy(buffer, i, buffer, i+1, (length-i));
            }
            buffer[i] = 'd';
            buffer[++i] = 'ž';
            length++;
            break;
          case 'ш':
            buffer[i] = 'š';
            break;
          default:
            break;
        }
      }
      termAtt.setLength(length);
      return true;
    } else {
      return false;
    }
  }
}
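The three digraph cases above (љ, њ, џ) are the only places where the term changes length. As an illustration only (not part of the commit), a worked trace of the loop for the term "љиљана":

// Worked trace for the lowercased term "љиљана" (length 6):
//   i=0 'љ': resizeBuffer(7); shift buffer[0..6) right by one;
//            buffer[0]='l', buffer[1]='j'; i -> 1; length = 7
//   i=2 'и': in-place map to 'i'
//   i=3 'љ': resizeBuffer(8); shift buffer[3..7) right by one;
//            buffer[3]='l', buffer[4]='j'; i -> 4; length = 8
//   i=5..7 'а','н','а': in-place map to 'a','n','a'
// termAtt.setLength(8) leaves the romanized term "ljiljana".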

TestAllAnalyzersHaveFactories.java

@@ -22,6 +22,7 @@ import java.io.StringReader;
import java.lang.reflect.Modifier;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
@@ -48,6 +49,7 @@ import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.sr.SerbianNormalizationRegularFilter;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
@@ -102,7 +104,13 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
      SetKeywordMarkerFilter.class
    );
  }

  // The following token filters are excused from having their factory.
  private static final Set<Class<?>> tokenFiltersWithoutFactory = new HashSet<>();
  static {
    tokenFiltersWithoutFactory.add(SerbianNormalizationRegularFilter.class);
  }

  private static final ResourceLoader loader = new StringMockResourceLoader("");

  public void test() throws Exception {
@@ -117,12 +125,13 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
          || testComponents.contains(c)
          || crazyComponents.contains(c)
          || oddlyNamedComponents.contains(c)
          || tokenFiltersWithoutFactory.contains(c)
          || c.isAnnotationPresent(Deprecated.class) // deprecated ones are typically back compat hacks
          || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c))
        ) {
        continue;
      }

      Map<String,String> args = new HashMap<>();
      args.put("luceneMatchVersion", Version.LATEST.toString());

TestSerbianNormalizationFilterFactory.java

@@ -36,7 +36,15 @@ public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactor
    stream = tokenFilterFactory("SerbianNormalization").create(stream);
    assertTokenStreamContents(stream, new String[] { "djura" });
  }

  public void testRegularStemming() throws Exception {
    Reader reader = new StringReader("ђура");
    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    ((Tokenizer)stream).setReader(reader);
    stream = tokenFilterFactory("SerbianNormalization", "haircut", "regular").create(stream);
    assertTokenStreamContents(stream, new String[] { "đura" });
  }
  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
@@ -46,4 +54,5 @@ public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactor
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}

TestSerbianNormalizationRegularFilter.java

@@ -0,0 +1,84 @@
package org.apache.lucene.analysis.sr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
 * Tests {@link SerbianNormalizationRegularFilter}
 */
public class TestSerbianNormalizationRegularFilter extends BaseTokenStreamTestCase {

  private Analyzer analyzer;
  @Override
  public void setUp() throws Exception {
    super.setUp();
    analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream = new SerbianNormalizationRegularFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }

  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }

  /**
   * Tests Cyrillic text.
   */
  public void testCyrillic() throws IOException {
    checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgdđežzijklljmnnjoprstćufhcčdžš");
  }

  /**
   * Tests Latin text (already romanized input passes through unchanged).
   */
  public void testLatin() throws IOException {
    checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcčćddžđefghijklljmnnjoprsštuvzž");
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new SerbianNormalizationRegularFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
    a.close();
  }
}