LUCENE-6053: add Serbian analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1638220 13f79535-47bb-0310-9956-ffa450edef68
2014-11-11 17:28:35 +00:00 · 2014-11-11 17:28:35 +00:00 · 67c1aaa9f8
parent 030afc5c7d
commit 67c1aaa9f8
7 changed files with 377 additions and 0 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -77,6 +77,8 @@ New Features
  improved exception handling, and indirect norms encoding for sparse fields.
  (Mike McCandless, Ryan Ernst, Robert Muir)

+* LUCENE-6053: Add Serbian analyzer.  (Nikola Smolenski via Robert Muir, Mike McCandless)
+
 API Changes

 * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilter.java
@ -0,0 +1,174 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Normalizes Serbian Cyrillic and Latin characters to "bald" Latin.
+ *
+ * Cyrillic characters are first converted to Latin; then, Latin characters
+ * have their diacritics removed, with the exception of đ which is converted to
+ * dj.
+ *
+ * Note that it expects lowercased input.
+ */
+public final class SerbianNormalizationFilter extends TokenFilter {
+
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  
+  public SerbianNormalizationFilter(TokenStream input) {
+    super(input);
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char buffer[] = termAtt.buffer();
+      int length = termAtt.length();
+      for (int i = 0; i < length; i++) {
+        final char c = buffer[i];
+        switch(c) {
+        case 'а':
+          buffer[i] = 'a';
+          break;
+        case 'б':
+          buffer[i] = 'b';
+          break;
+        case 'в':
+          buffer[i] = 'v';
+          break;
+        case 'г':
+          buffer[i] = 'g';
+          break;
+        case 'д':
+          buffer[i] = 'd';
+          break;
+        case 'ђ':
+        case 'đ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'd';
+          buffer[++i] = 'j';
+          length++;
+          break;
+        case 'е':
+          buffer[i] = 'e';
+          break;
+        case 'ж':
+        case 'з':
+        case 'ž':
+          buffer[i] = 'z';
+          break;
+        case 'и':
+          buffer[i] = 'i';
+          break;
+        case 'ј':
+          buffer[i] = 'j';
+          break;
+        case 'к':
+          buffer[i] = 'k';
+          break;
+        case 'л':
+          buffer[i] = 'l';
+          break;
+        case 'љ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'l';
+          buffer[++i] = 'j';
+          length++;
+          break;
+        case 'м':
+          buffer[i] = 'm';
+          break;
+        case 'н':
+          buffer[i] = 'n';
+          break;
+        case 'њ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'n';
+          buffer[++i] = 'j';
+          length++;
+          break;
+        case 'о':
+          buffer[i] = 'o';
+          break;
+        case 'п':
+          buffer[i] = 'p';
+          break;
+        case 'р':
+          buffer[i] = 'r';
+          break;
+        case 'с':
+          buffer[i] = 's';
+          break;
+        case 'т':
+          buffer[i] = 't';
+          break;
+        case 'ћ':
+        case 'ц':
+        case 'ч':
+        case 'č':
+        case 'ć':
+          buffer[i] = 'c';
+          break;
+        case 'у':
+          buffer[i] = 'u';
+          break;
+        case 'ф':
+          buffer[i] = 'f';
+          break;
+        case 'х':
+          buffer[i] = 'h';
+          break;
+        case 'џ':
+          buffer = termAtt.resizeBuffer(1+length);
+          if (i < length) {
+            System.arraycopy(buffer, i, buffer, i+1, (length-i));
+          }
+          buffer[i] = 'd';
+          buffer[++i] = 'z';
+          length++;
+          break;
+        case 'ш':
+        case 'š':
+          buffer[i] = 's';
+          break;
+        default:
+          break;
+        }
+      }
+      termAtt.setLength(length);
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+import org.apache.lucene.analysis.util.MultiTermAwareComponent;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+
+/**
+ * Factory for {@link SerbianNormalizationFilter}.
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_srnorm" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+ *     &lt;filter class="solr.SerbianNormalizationFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre> 
+ */
+public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
+
+  /** Creates a new SerbianNormalizationFilterFactory */
+  public SerbianNormalizationFilterFactory(Map<String,String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new SerbianNormalizationFilter(input);
+  }
+
+  @Override
+  public AbstractAnalysisFactory getMultiTermComponent() {
+    return this;
+  }
+
+}
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/package.html
@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Analyzer for Serbian.
+</body>
+</html>
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@ -91,6 +91,7 @@ org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
 org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
 org.apache.lucene.analysis.shingle.ShingleFilterFactory
 org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
+org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
 org.apache.lucene.analysis.standard.ClassicFilterFactory
 org.apache.lucene.analysis.standard.StandardFilterFactory
 org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilter.java
@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+
+/**
+ * Tests {@link SerbianNormalizationFilter}
+ */
+public class TestSerbianNormalizationFilter extends BaseTokenStreamTestCase {
+  private Analyzer analyzer = new Analyzer() {
+    @Override
+    protected TokenStreamComponents createComponents(String field) {
+      final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+      final TokenStream stream = new SerbianNormalizationFilter(tokenizer);
+      return new TokenStreamComponents(tokenizer, stream);
+    }
+  };
+  
+  /**
+   * Tests Cyrillic text.
+   */
+  public void testCyrillic() throws IOException {
+    checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgddjezzijklljmnnjoprstcufhccdzs");
+  }
+
+  /**
+   * Tests Latin text.
+   */
+  public void testLatin() throws IOException {
+    checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcccddzdjefghijklljmnnjoprsstuvzz");
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+  }
+  
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new SerbianNormalizationFilter(tokenizer));
+      }
+    };
+    checkOneTerm(a, "", "");
+  }
+}
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
@ -0,0 +1,49 @@
+package org.apache.lucene.analysis.sr;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the Serbian normalization factory is working.
+ */
+public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("đura");
+    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+    ((Tokenizer)stream).setReader(reader);
+    stream = tokenFilterFactory("SerbianNormalization").create(stream);
+    assertTokenStreamContents(stream, new String[] { "djura" });
+  }
+  
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("SerbianNormalization", "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}