LUCENE-10096: Tamil Analyzer (#292)

Add a Tamil analyzer based on the Snowball stemmer and TamilNLP stopwords
Robert Muir 2021-09-10 21:02:11 -04:00 committed by GitHub
parent 8bce765218
commit 24aa45dc3e
5 changed files with 347 additions and 0 deletions

CHANGES.txt

@@ -19,6 +19,8 @@ New Features
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)
System Requirements
* LUCENE-8738: Move to Java 11 as minimum Java version.

TamilAnalyzer.java

@@ -0,0 +1,133 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ta;
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.tartarus.snowball.ext.TamilStemmer;
/**
* Analyzer for Tamil.
*
* @since 9.0
*/
public final class TamilAnalyzer extends StopwordAnalyzerBase {
private final CharArraySet stemExclusionSet;
/**
* File containing default Tamil stopwords.
*
* <p>Default stopword list is from https://github.com/AshokR/TamilNLP (Apache 2 License)
*/
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
private static final String STOPWORDS_COMMENT = "#";
/**
* Returns an unmodifiable instance of the default stop-words set.
*
* @return an unmodifiable instance of the default stop-words set.
*/
public static CharArraySet getDefaultStopSet() {
return DefaultSetHolder.DEFAULT_STOP_SET;
}
/**
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
* static final set the first time.
*/
private static class DefaultSetHolder {
static final CharArraySet DEFAULT_STOP_SET;
static {
try {
DEFAULT_STOP_SET =
loadStopwordSet(false, TamilAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
} catch (IOException ex) {
// default set should always be present as it is part of the
// distribution (JAR)
throw new UncheckedIOException("Unable to load default stopword set", ex);
}
}
}
/**
* Builds an analyzer with the given stop words
*
* @param stopwords a stopword set
* @param stemExclusionSet a stemming exclusion set
*/
public TamilAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
super(stopwords);
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
}
/**
* Builds an analyzer with the given stop words
*
* @param stopwords a stopword set
*/
public TamilAnalyzer(CharArraySet stopwords) {
this(stopwords, CharArraySet.EMPTY_SET);
}
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
public TamilAnalyzer() {
this(DefaultSetHolder.DEFAULT_STOP_SET);
}
/**
* Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
* the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
* StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
* {@link IndicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
* provided, {@link SnowballFilter}, and Tamil stop words
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
final Tokenizer source = new StandardTokenizer();
TokenStream result = new LowerCaseFilter(source);
result = new DecimalDigitFilter(result);
if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
result = new IndicNormalizationFilter(result);
result = new StopFilter(result, stopwords);
result = new SnowballFilter(result, new TamilStemmer());
return new TokenStreamComponents(source, result);
}
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
TokenStream result = new LowerCaseFilter(in);
result = new DecimalDigitFilter(result);
result = new IndicNormalizationFilter(result);
return result;
}
}
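
For readers less familiar with the Lucene analysis API, here is a minimal usage sketch (not part of this commit) that runs the chain described in the createComponents javadoc and prints the resulting terms. The demo class name, the field name "content", and the sample sentence are illustrative; the expected stem for "நண்பர்கள்" is taken from the testStemming case further down.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ta.TamilAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TamilAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    try (Analyzer analyzer = new TamilAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", "நண்பர்கள் வந்தனர்")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // Stopwords are removed and the remaining terms are stemmed,
        // e.g. "நண்பர்கள்" becomes "நண்" per testStemming below.
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}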

package-info.java

@@ -0,0 +1,19 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Analyzer for Tamil. */
package org.apache.lucene.analysis.ta;

stopwords.txt

@@ -0,0 +1,126 @@
# Tamil stopwords from https://github.com/AshokR/TamilNLP
ஒரு
என்று
மற்றும்
இந்த
இது
என்ற
கொண்டு
என்பது
பல
ஆகும்
அல்லது
அவர்
நான்
உள்ள
அந்த
இவர்
என
முதல்
என்ன
இருந்து
சில
என்
போன்ற
வேண்டும்
வந்து
இதன்
அது
அவன்
தான்
பலரும்
என்னும்
மேலும்
பின்னர்
கொண்ட
இருக்கும்
தனது
உள்ளது
போது
என்றும்
அதன்
தன்
பிறகு
அவர்கள்
வரை
அவள்
நீ
ஆகிய
இருந்தது
உள்ளன
வந்த
இருந்த
மிகவும்
இங்கு
மீது
ஓர்
இவை
இந்தக்
பற்றி
வரும்
வேறு
இரு
இதில்
போல்
இப்போது
அவரது
மட்டும்
இந்தப்
எனும்
மேல்
பின்
சேர்ந்த
ஆகியோர்
எனக்கு
இன்னும்
அந்தப்
அன்று
ஒரே
மிக
அங்கு
பல்வேறு
விட்டு
பெரும்
அதை
பற்றிய
உன்
அதிக
அந்தக்
பேர்
இதனால்
அவை
அதே
ஏன்
முறை
யார்
என்பதை
எல்லாம்
மட்டுமே
இங்கே
அங்கே
இடம்
இடத்தில்
அதில்
நாம்
அதற்கு
எனவே
பிற
சிறு
மற்ற
விட
எந்த
எனவும்
எனப்படும்
எனினும்
அடுத்த
இதனை
இதை
கொள்ள
இந்தத்
இதற்கு
அதனால்
தவிர
போல
வரையில்
சற்று
எனக்

TestTamilAnalyzer.java

@@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ta;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
/** Tests the TamilAnalyzer */
public class TestTamilAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with an NPE when the stopwords file is missing from the classpath */
public void testResourcesAvailable() {
new TamilAnalyzer().close();
}
/** test that the snowball stemmer is hooked in */
public void testStemming() throws Exception {
Analyzer a = new TamilAnalyzer();
// friend
checkOneTerm(a, "நண்பன்", "நண்");
// friends
checkOneTerm(a, "நண்பர்கள்", "நண்");
a.close();
}
public void testExclusionSet() throws Exception {
CharArraySet exclusionSet = new CharArraySet(asSet("நண்பர்கள்"), false);
Analyzer a = new TamilAnalyzer(TamilAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTerm(a, "நண்பர்கள்", "நண்பர்கள்");
a.close();
}
/** test that we fold digits to Latin-1 */
public void testDigits() throws Exception {
TamilAnalyzer a = new TamilAnalyzer();
checkOneTerm(a, "௧௨௩௪", "1234");
a.close();
}
/** Tamil doesn't have case, but test that we case-fold any Latin-1 text, etc. */
public void testLowerCase() throws Exception {
TamilAnalyzer a = new TamilAnalyzer();
checkOneTerm(a, "FIFA", "fifa");
a.close();
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new TamilAnalyzer();
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
analyzer.close();
}
}
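
The tests above exercise the default stop set and the stem exclusion constructor; as a complement, here is a hedged sketch (again not part of the commit) of how an application might build the analyzer with a customized stopword set plus a stem exclusion set. The added stopword and the class name are made up for illustration.

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.ta.TamilAnalyzer;

public class CustomTamilAnalyzerExample {
  public static void main(String[] args) {
    // The bundled TamilNLP stop set is unmodifiable, so copy it before extending it.
    CharArraySet stopwords = CharArraySet.copy(TamilAnalyzer.getDefaultStopSet());
    stopwords.add("நாள்"); // illustrative extra stopword

    // Protect "நண்பர்கள்" from stemming, mirroring testExclusionSet above.
    CharArraySet stemExclusions = new CharArraySet(Arrays.asList("நண்பர்கள்"), false);

    try (Analyzer analyzer = new TamilAnalyzer(stopwords, stemExclusions)) {
      // hand the analyzer to an IndexWriterConfig or query parser as usual
    }
  }
}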