LUCENE-6053: add Serbian analyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1638220 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2014-11-11 17:28:35 +00:00
parent 030afc5c7d
commit 67c1aaa9f8
7 changed files with 377 additions and 0 deletions

View File

@ -77,6 +77,8 @@ New Features
improved exception handling, and indirect norms encoding for sparse fields.
(Mike McCandless, Ryan Ernst, Robert Muir)
* LUCENE-6053: Add Serbian analyzer. (Nikola Smolenski via Robert Muir, Mike McCandless)
API Changes
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and

View File

@ -0,0 +1,174 @@
package org.apache.lucene.analysis.sr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* Normalizes Serbian Cyrillic and Latin characters to "bald" Latin.
*
* Cyrillic characters are first converted to Latin; then, Latin characters
* have their diacritics removed, with the exception of đ which is converted to
* dj.
*
* Note that it expects lowercased input.
*/
public final class SerbianNormalizationFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public SerbianNormalizationFilter(TokenStream input) {
super(input);
}
@Override
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {
char buffer[] = termAtt.buffer();
int length = termAtt.length();
for (int i = 0; i < length; i++) {
final char c = buffer[i];
switch(c) {
case 'а':
buffer[i] = 'a';
break;
case 'б':
buffer[i] = 'b';
break;
case 'в':
buffer[i] = 'v';
break;
case 'г':
buffer[i] = 'g';
break;
case 'д':
buffer[i] = 'd';
break;
case 'ђ':
case 'đ':
buffer = termAtt.resizeBuffer(1+length);
if (i < length) {
System.arraycopy(buffer, i, buffer, i+1, (length-i));
}
buffer[i] = 'd';
buffer[++i] = 'j';
length++;
break;
case 'е':
buffer[i] = 'e';
break;
case 'ж':
case 'з':
case 'ž':
buffer[i] = 'z';
break;
case 'и':
buffer[i] = 'i';
break;
case 'ј':
buffer[i] = 'j';
break;
case 'к':
buffer[i] = 'k';
break;
case 'л':
buffer[i] = 'l';
break;
case 'љ':
buffer = termAtt.resizeBuffer(1+length);
if (i < length) {
System.arraycopy(buffer, i, buffer, i+1, (length-i));
}
buffer[i] = 'l';
buffer[++i] = 'j';
length++;
break;
case 'м':
buffer[i] = 'm';
break;
case 'н':
buffer[i] = 'n';
break;
case 'њ':
buffer = termAtt.resizeBuffer(1+length);
if (i < length) {
System.arraycopy(buffer, i, buffer, i+1, (length-i));
}
buffer[i] = 'n';
buffer[++i] = 'j';
length++;
break;
case 'о':
buffer[i] = 'o';
break;
case 'п':
buffer[i] = 'p';
break;
case 'р':
buffer[i] = 'r';
break;
case 'с':
buffer[i] = 's';
break;
case 'т':
buffer[i] = 't';
break;
case 'ћ':
case 'ц':
case 'ч':
case 'č':
case 'ć':
buffer[i] = 'c';
break;
case 'у':
buffer[i] = 'u';
break;
case 'ф':
buffer[i] = 'f';
break;
case 'х':
buffer[i] = 'h';
break;
case 'џ':
buffer = termAtt.resizeBuffer(1+length);
if (i < length) {
System.arraycopy(buffer, i, buffer, i+1, (length-i));
}
buffer[i] = 'd';
buffer[++i] = 'z';
length++;
break;
case 'ш':
case 'š':
buffer[i] = 's';
break;
default:
break;
}
}
termAtt.setLength(length);
return true;
} else {
return false;
}
}
}

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.sr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.AbstractAnalysisFactory;
import org.apache.lucene.analysis.util.MultiTermAwareComponent;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* Factory for {@link SerbianNormalizationFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_srnorm" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.SerbianNormalizationFilterFactory"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class SerbianNormalizationFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent {
/** Creates a new SerbianNormalizationFilterFactory */
public SerbianNormalizationFilterFactory(Map<String,String> args) {
super(args);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public TokenStream create(TokenStream input) {
return new SerbianNormalizationFilter(input);
}
@Override
public AbstractAnalysisFactory getMultiTermComponent() {
return this;
}
}

View File

@ -0,0 +1,22 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html><head></head>
<body>
Analyzer for Serbian.
</body>
</html>

View File

@ -91,6 +91,7 @@ org.apache.lucene.analysis.reverse.ReverseStringFilterFactory
org.apache.lucene.analysis.ru.RussianLightStemFilterFactory
org.apache.lucene.analysis.shingle.ShingleFilterFactory
org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory
org.apache.lucene.analysis.sr.SerbianNormalizationFilterFactory
org.apache.lucene.analysis.standard.ClassicFilterFactory
org.apache.lucene.analysis.standard.StandardFilterFactory
org.apache.lucene.analysis.sv.SwedishLightStemFilterFactory

View File

@ -0,0 +1,71 @@
package org.apache.lucene.analysis.sr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
/**
* Tests {@link SerbianNormalizationFilter}
*/
public class TestSerbianNormalizationFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String field) {
final Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
final TokenStream stream = new SerbianNormalizationFilter(tokenizer);
return new TokenStreamComponents(tokenizer, stream);
}
};
/**
* Tests Cyrillic text.
*/
public void testCyrillic() throws IOException {
checkOneTerm(analyzer, "абвгдђежзијклљмнњопрстћуфхцчџш", "abvgddjezzijklljmnnjoprstcufhccdzs");
}
/**
* Tests Latin text.
*/
public void testLatin() throws IOException {
checkOneTerm(analyzer, "abcčćddžđefghijklljmnnjoprsštuvzž", "abcccddzdjefghijklljmnnjoprsstuvzz");
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new SerbianNormalizationFilter(tokenizer));
}
};
checkOneTerm(a, "", "");
}
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.analysis.sr;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
* Simple tests to ensure the Serbian normalization factory is working.
*/
public class TestSerbianNormalizationFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testStemming() throws Exception {
Reader reader = new StringReader("đura");
TokenStream stream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
((Tokenizer)stream).setReader(reader);
stream = tokenFilterFactory("SerbianNormalization").create(stream);
assertTokenStreamContents(stream, new String[] { "djura" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("SerbianNormalization", "bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
}