LUCENE-6875: New Serbian Filter. (Nikola Smolenski via Robert Muir, Dawid Weiss)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1713712 13f79535-47bb-0310-9956-ffa450edef68
2015-11-10 18:45:42 +00:00 · 2015-11-10 18:45:42 +00:00 · e1041edfa4
parent f25c890709
commit e1041edfa4
5 changed files with 273 additions and 3 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -96,6 +96,9 @@ Changes in Runtime Behavior
 New Features
 * LUCENE-6875: New Serbian normalization filter. (Nikola Smolenski via 
  Robert Muir, Dawid Weiss)
 * LUCENE-6720: New FunctionRangeQuery wrapper around ValueSourceScorer
  (returned from ValueSource/FunctionValues.getRangeScorer()). (David Smiley)
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java
@ -17,6 +17,7 @@ package org.apache.lucene.analysis.sr;
 * limitations under the License.
 */
 import java.util.Arrays;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
@ -31,15 +32,19 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
- *     &lt;filter class="solr.SerbianNormalizationFilterFactory"/&gt;
+ *     &lt;filter class="solr.SerbianNormalizationFilterFactory"
 *       haircut="bald"/&gt; 
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre> 
 */
 public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
  final String haircut;
  /** Creates a new SerbianNormalizationFilterFactory */
  public SerbianNormalizationFilterFactory(Map<String,String> args) {
    super(args);
 	this.haircut = get(args, "haircut", Arrays.asList( "bald", "regular" ), "bald");
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@ -47,8 +52,12 @@ public class SerbianNormalizationFilterFactory extends TokenFilterFactory implem
  @Override
  public TokenStream create(TokenStream input) {
 	if( this.haircut.equals( "regular" ) ) {
 	    return new SerbianNormalizationRegularFilter(input);
 	} else {
 	    return new SerbianNormalizationFilter(input);
 	}
  }
  @Override
  public AbstractAnalysisFactory getMultiTermComponent() {
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java
@ -0,0 +1,165 @@
 package org.apache.lucene.analysis.sr;
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 /**
 * Normalizes Serbian Cyrillic to Latin.
 *
 * Note that it expects lowercased input.
 */
 public final class SerbianNormalizationRegularFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  public SerbianNormalizationRegularFilter(TokenStream input) {
    super(input);
  }
  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char buffer[] = termAtt.buffer();
      int length = termAtt.length();
      for (int i = 0; i < length; i++) {
        final char c = buffer[i];
        switch(c) {
        case 'а':
          buffer[i] = 'a';
          break;
        case 'б':
          buffer[i] = 'b';
          break;
        case 'в':
          buffer[i] = 'v';
          break;
        case 'г':
          buffer[i] = 'g';
          break;
        case 'д':
          buffer[i] = 'd';
          break;
        case 'ђ':
          buffer[i] = 'đ';
          break;
        case 'е':
          buffer[i] = 'e';
          break;
        case 'ж':
          buffer[i] = 'ž';
          break;
        case 'з':
          buffer[i] = 'z';
          break;
        case 'и':
          buffer[i] = 'i';
          break;
        case 'ј':
          buffer[i] = 'j';
          break;
        case 'к':
          buffer[i] = 'k';
          break;
        case 'л':
          buffer[i] = 'l';
          break;
        case 'љ':
          buffer = termAtt.resizeBuffer(1+length);
          if (i < length) {
            System.arraycopy(buffer, i, buffer, i+1, (length-i));
          }
          buffer[i] = 'l';
          buffer[++i] = 'j';
          length++;
          break;
        case 'м':
          buffer[i] = 'm';
          break;
        case 'н':
          buffer[i] = 'n';
          break;
        case 'њ':
          buffer = termAtt.resizeBuffer(1+length);
          if (i < length) {
            System.arraycopy(buffer, i, buffer, i+1, (length-i));
          }
          buffer[i] = 'n';
          buffer[++i] = 'j';
          length++;
          break;
        case 'о':
          buffer[i] = 'o';
          break;
        case 'п':
          buffer[i] = 'p';
          break;
        case 'р':
          buffer[i] = 'r';
          break;
        case 'с':
          buffer[i] = 's';
          break;
        case 'т':
          buffer[i] = 't';
          break;
        case 'ћ':
          buffer[i] = 'ć';
          break;
        case 'у':
          buffer[i] = 'u';
          break;
        case 'ф':
          buffer[i] = 'f';
          break;
        case 'х':
          buffer[i] = 'h';
          break;
        case 'ц':
          buffer[i] = 'c';
          break;
        case 'ч':
          buffer[i] = 'č';
          break;
        case 'џ':
          buffer = termAtt.resizeBuffer(1+length);
          if (i < length) {
            System.arraycopy(buffer, i, buffer, i+1, (length-i));
          }
          buffer[i] = 'd';
          buffer[++i] = 'ž';
          length++;
          break;
        case 'ш':
          buffer[i] = 'š';
          break;
        default:
          break;
        }
      }
      termAtt.setLength(length);
      return true;
    } else {
      return false;
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java
@ -37,6 +37,14 @@ public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactor
    assertTokenStreamContents(stream, new String[] { "djura" });
  }
  public void testRegularStemming() throws Exception {
    Reader reader = new StringReader("ђура");
    TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    ((Tokenizer)stream).setReader(reader);
    stream = tokenFilterFactory("SerbianNormalization", "haircut", "regular").create(stream);
    assertTokenStreamContents(stream, new String[] { "đura" });
  }
  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
@ -46,4 +54,5 @@ public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactor
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java
@ -0,0 +1,84 @@
 package org.apache.lucene.analysis.sr;
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 /**
 * Tests {@link SerbianNormalizationFilter}
 */
 public class TestSerbianNormalizationRegularFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer;
  @Override
  public void setUp() throws Exception {
    super.setUp();
    analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        final TokenStream stream = new SerbianNormalizationRegularFilter(tokenizer);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
  }
  @Override
  public void tearDown() throws Exception {
    analyzer.close();
    super.tearDown();
  }
  /**
   * Tests Cyrillic text.
   */
  public void testCyrillic() throws IOException {
    checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgdđežzijklljmnnjoprstćufhcčdžš");
  }
  /**
   * Tests Latin text.
   */
  public void testLatin() throws IOException {
    checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcčćddžđefghijklljmnnjoprsštuvzž");
  }
  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
  }
  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new SerbianNormalizationRegularFilter(tokenizer));
      }
    };
    checkOneTerm(a, "", "");
    a.close();
  }
 }