From e1041edfa40b9b9823ec98a909d7d280d91a5620 Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Tue, 10 Nov 2015 18:45:42 +0000 Subject: [PATCH] LUCENE-6875: New Serbian Filter. (Nikola Smolenski via Robert Muir, Dawid Weiss) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1713712 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../sr/SerbianNormalizationFilterFactory.java | 13 +- .../sr/SerbianNormalizationRegularFilter.java | 165 ++++++++++++++++++ ...TestSerbianNormalizationFilterFactory.java | 11 +- ...TestSerbianNormalizationRegularFilter.java | 84 +++++++++ 5 files changed, 273 insertions(+), 3 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7bfe534233a..91e2e75e236 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -96,6 +96,9 @@ Changes in Runtime Behavior New Features +* LUCENE-6875: New Serbian normalization filter. (Nikola Smolenski via + Robert Muir, Dawid Weiss) + * LUCENE-6720: New FunctionRangeQuery wrapper around ValueSourceScorer (returned from ValueSource/FunctionValues.getRangeScorer()). (David Smiley) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java index 8aa62ce47f7..70036321095 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationFilterFactory.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.sr; * limitations under the License. 
*/ +import java.util.Arrays; import java.util.Map; import org.apache.lucene.analysis.TokenStream; @@ -31,15 +32,19 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * <analyzer> * <tokenizer class="solr.StandardTokenizerFactory"/> * <filter class="solr.LowerCaseFilterFactory"/> - * <filter class="solr.SerbianNormalizationFilterFactory"/> + * <filter class="solr.SerbianNormalizationFilterFactory" + * haircut="bald"/> * </analyzer> * </fieldType> */ public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + final String haircut; /** Creates a new SerbianNormalizationFilterFactory */ public SerbianNormalizationFilterFactory(Map args) { super(args); + + this.haircut = get(args, "haircut", Arrays.asList( "bald", "regular" ), "bald"); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } @@ -47,7 +52,11 @@ public class SerbianNormalizationFilterFactory extends TokenFilterFactory implem @Override public TokenStream create(TokenStream input) { - return new SerbianNormalizationFilter(input); + if( this.haircut.equals( "regular" ) ) { + return new SerbianNormalizationRegularFilter(input); + } else { + return new SerbianNormalizationFilter(input); + } } @Override diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java new file mode 100644 index 00000000000..a0800cf7785 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/sr/SerbianNormalizationRegularFilter.java @@ -0,0 +1,165 @@ +package org.apache.lucene.analysis.sr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +/** + * Normalizes Serbian Cyrillic to Latin. + * + * Note that it expects lowercased input. + */ +public final class SerbianNormalizationRegularFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + public SerbianNormalizationRegularFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + char buffer[] = termAtt.buffer(); + int length = termAtt.length(); + for (int i = 0; i < length; i++) { + final char c = buffer[i]; + switch(c) { + case 'а': + buffer[i] = 'a'; + break; + case 'б': + buffer[i] = 'b'; + break; + case 'в': + buffer[i] = 'v'; + break; + case 'г': + buffer[i] = 'g'; + break; + case 'д': + buffer[i] = 'd'; + break; + case 'ђ': + buffer[i] = 'đ'; + break; + case 'е': + buffer[i] = 'e'; + break; + case 'ж': + buffer[i] = 'ž'; + break; + case 'з': + buffer[i] = 'z'; + break; + case 'и': + buffer[i] = 'i'; + break; + case 'ј': + buffer[i] = 'j'; + break; + case 'к': + buffer[i] = 'k'; + break; + case 'л': + buffer[i] = 'l'; + break; + case 'љ': + buffer = termAtt.resizeBuffer(1+length); + if (i < length) { + 
System.arraycopy(buffer, i, buffer, i+1, (length-i)); + } + buffer[i] = 'l'; + buffer[++i] = 'j'; + length++; + break; + case 'м': + buffer[i] = 'm'; + break; + case 'н': + buffer[i] = 'n'; + break; + case 'њ': + buffer = termAtt.resizeBuffer(1+length); + if (i < length) { + System.arraycopy(buffer, i, buffer, i+1, (length-i)); + } + buffer[i] = 'n'; + buffer[++i] = 'j'; + length++; + break; + case 'о': + buffer[i] = 'o'; + break; + case 'п': + buffer[i] = 'p'; + break; + case 'р': + buffer[i] = 'r'; + break; + case 'с': + buffer[i] = 's'; + break; + case 'т': + buffer[i] = 't'; + break; + case 'ћ': + buffer[i] = 'ć'; + break; + case 'у': + buffer[i] = 'u'; + break; + case 'ф': + buffer[i] = 'f'; + break; + case 'х': + buffer[i] = 'h'; + break; + case 'ц': + buffer[i] = 'c'; + break; + case 'ч': + buffer[i] = 'č'; + break; + case 'џ': + buffer = termAtt.resizeBuffer(1+length); + if (i < length) { + System.arraycopy(buffer, i, buffer, i+1, (length-i)); + } + buffer[i] = 'd'; + buffer[++i] = 'ž'; + length++; + break; + case 'ш': + buffer[i] = 'š'; + break; + default: + break; + } + } + termAtt.setLength(length); + return true; + } else { + return false; + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java index 060624961ec..3bea320d6da 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationFilterFactory.java @@ -36,7 +36,15 @@ public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactor stream = tokenFilterFactory("SerbianNormalization").create(stream); assertTokenStreamContents(stream, new String[] { "djura" }); } - + + public void testRegularStemming() throws Exception { + Reader reader = new StringReader("ђура"); 
+ TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false); + ((Tokenizer)stream).setReader(reader); + stream = tokenFilterFactory("SerbianNormalization", "haircut", "regular").create(stream); + assertTokenStreamContents(stream, new String[] { "đura" }); + } + /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { try { @@ -46,4 +54,5 @@ public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactor assertTrue(expected.getMessage().contains("Unknown parameters")); } } + } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java new file mode 100644 index 00000000000..586e867993f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/sr/TestSerbianNormalizationRegularFilter.java @@ -0,0 +1,84 @@ +package org.apache.lucene.analysis.sr; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; + +/** + * Tests {@link SerbianNormalizationRegularFilter} + */ +public class TestSerbianNormalizationRegularFilter extends BaseTokenStreamTestCase { + private Analyzer analyzer; + + @Override + public void setUp() throws Exception { + super.setUp(); + analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + final TokenStream stream = new SerbianNormalizationRegularFilter(tokenizer); + return new TokenStreamComponents(tokenizer, stream); + } + }; + } + + @Override + public void tearDown() throws Exception { + analyzer.close(); + super.tearDown(); + } + + /** + * Tests Cyrillic text. + */ + public void testCyrillic() throws IOException { + checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgdđežzijklljmnnjoprstćufhcčdžš"); + } + + /** + * Tests Latin text. + */ + public void testLatin() throws IOException { + checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcčćddžđefghijklljmnnjoprsštuvzž"); + } + + /** blast some random strings through the analyzer */ + public void testRandomStrings() throws Exception { + checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new SerbianNormalizationRegularFilter(tokenizer)); + } + }; + checkOneTerm(a, "", ""); + a.close(); + } +}