mirror of https://github.com/apache/lucene.git
LUCENE-10096: Tamil Analyzer (#292)
Add Tamil analyzer based on snowball stemmer and TamilNLP stopwords
This commit is contained in:
parent
8bce765218
commit
24aa45dc3e
|
@ -19,6 +19,8 @@ New Features
|
||||||
|
|
||||||
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
|
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
|
||||||
|
|
||||||
|
* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)
|
||||||
|
|
||||||
System Requirements
|
System Requirements
|
||||||
|
|
||||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||||
|
|
|
@ -0,0 +1,133 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ta;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.UncheckedIOException;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||||
|
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.tartarus.snowball.ext.TamilStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Tamil.
|
||||||
|
*
|
||||||
|
* @since 9.0
|
||||||
|
*/
|
||||||
|
public final class TamilAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File containing default Tamil stopwords.
|
||||||
|
*
|
||||||
|
* <p>Default stopword list is from https://github.com/AshokR/TamilNLP (Apache 2 License)
|
||||||
|
*/
|
||||||
|
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
private static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
|
*
|
||||||
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
*/
|
||||||
|
public static CharArraySet getDefaultStopSet() {
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
|
||||||
|
* static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET =
|
||||||
|
loadStopwordSet(false, TamilAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new UncheckedIOException("Unable to load default stopword set", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a stemming exclusion set
|
||||||
|
*/
|
||||||
|
public TamilAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
|
super(stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public TamilAnalyzer(CharArraySet stopwords) {
|
||||||
|
this(stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
|
||||||
|
public TamilAnalyzer() {
|
||||||
|
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
|
||||||
|
* the text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
|
||||||
|
* StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
|
||||||
|
* {@link IndicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
|
||||||
|
* provided, {@link SnowballFilter}, and Tamil Stop words
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
final Tokenizer source = new StandardTokenizer();
|
||||||
|
TokenStream result = new LowerCaseFilter(source);
|
||||||
|
result = new DecimalDigitFilter(result);
|
||||||
|
if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
result = new StopFilter(result, stopwords);
|
||||||
|
result = new SnowballFilter(result, new TamilStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||||
|
TokenStream result = new LowerCaseFilter(in);
|
||||||
|
result = new DecimalDigitFilter(result);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,19 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Analyzer for Tamil. */
|
||||||
|
package org.apache.lucene.analysis.ta;
|
|
@ -0,0 +1,126 @@
|
||||||
|
# Tamil stopwords from https://github.com/AshokR/TamilNLP
|
||||||
|
ஒரு
|
||||||
|
என்று
|
||||||
|
மற்றும்
|
||||||
|
இந்த
|
||||||
|
இது
|
||||||
|
என்ற
|
||||||
|
கொண்டு
|
||||||
|
என்பது
|
||||||
|
பல
|
||||||
|
ஆகும்
|
||||||
|
அல்லது
|
||||||
|
அவர்
|
||||||
|
நான்
|
||||||
|
உள்ள
|
||||||
|
அந்த
|
||||||
|
இவர்
|
||||||
|
என
|
||||||
|
முதல்
|
||||||
|
என்ன
|
||||||
|
இருந்து
|
||||||
|
சில
|
||||||
|
என்
|
||||||
|
போன்ற
|
||||||
|
வேண்டும்
|
||||||
|
வந்து
|
||||||
|
இதன்
|
||||||
|
அது
|
||||||
|
அவன்
|
||||||
|
தான்
|
||||||
|
பலரும்
|
||||||
|
என்னும்
|
||||||
|
மேலும்
|
||||||
|
பின்னர்
|
||||||
|
கொண்ட
|
||||||
|
இருக்கும்
|
||||||
|
தனது
|
||||||
|
உள்ளது
|
||||||
|
போது
|
||||||
|
என்றும்
|
||||||
|
அதன்
|
||||||
|
தன்
|
||||||
|
பிறகு
|
||||||
|
அவர்கள்
|
||||||
|
வரை
|
||||||
|
அவள்
|
||||||
|
நீ
|
||||||
|
ஆகிய
|
||||||
|
இருந்தது
|
||||||
|
உள்ளன
|
||||||
|
வந்த
|
||||||
|
இருந்த
|
||||||
|
மிகவும்
|
||||||
|
இங்கு
|
||||||
|
மீது
|
||||||
|
ஓர்
|
||||||
|
இவை
|
||||||
|
இந்தக்
|
||||||
|
பற்றி
|
||||||
|
வரும்
|
||||||
|
வேறு
|
||||||
|
இரு
|
||||||
|
இதில்
|
||||||
|
போல்
|
||||||
|
இப்போது
|
||||||
|
அவரது
|
||||||
|
மட்டும்
|
||||||
|
இந்தப்
|
||||||
|
எனும்
|
||||||
|
மேல்
|
||||||
|
பின்
|
||||||
|
சேர்ந்த
|
||||||
|
ஆகியோர்
|
||||||
|
எனக்கு
|
||||||
|
இன்னும்
|
||||||
|
அந்தப்
|
||||||
|
அன்று
|
||||||
|
ஒரே
|
||||||
|
மிக
|
||||||
|
அங்கு
|
||||||
|
பல்வேறு
|
||||||
|
விட்டு
|
||||||
|
பெரும்
|
||||||
|
அதை
|
||||||
|
பற்றிய
|
||||||
|
உன்
|
||||||
|
அதிக
|
||||||
|
அந்தக்
|
||||||
|
பேர்
|
||||||
|
இதனால்
|
||||||
|
அவை
|
||||||
|
அதே
|
||||||
|
ஏன்
|
||||||
|
முறை
|
||||||
|
யார்
|
||||||
|
என்பதை
|
||||||
|
எல்லாம்
|
||||||
|
மட்டுமே
|
||||||
|
இங்கே
|
||||||
|
அங்கே
|
||||||
|
இடம்
|
||||||
|
இடத்தில்
|
||||||
|
அதில்
|
||||||
|
நாம்
|
||||||
|
அதற்கு
|
||||||
|
எனவே
|
||||||
|
பிற
|
||||||
|
சிறு
|
||||||
|
மற்ற
|
||||||
|
விட
|
||||||
|
எந்த
|
||||||
|
எனவும்
|
||||||
|
எனப்படும்
|
||||||
|
எனினும்
|
||||||
|
அடுத்த
|
||||||
|
இதனை
|
||||||
|
இதை
|
||||||
|
கொள்ள
|
||||||
|
இந்தத்
|
||||||
|
இதற்கு
|
||||||
|
அதனால்
|
||||||
|
தவிர
|
||||||
|
போல
|
||||||
|
வரையில்
|
||||||
|
சற்று
|
||||||
|
எனக்
|
|
@ -0,0 +1,67 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ta;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
|
||||||
|
/** Tests the TamilAnalyzer */
|
||||||
|
public class TestTamilAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new TamilAnalyzer().close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test that snowball stemmer is hooked in */
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Analyzer a = new TamilAnalyzer();
|
||||||
|
// friend
|
||||||
|
checkOneTerm(a, "நண்பன்", "நண்");
|
||||||
|
// friends
|
||||||
|
checkOneTerm(a, "நண்பர்கள்", "நண்");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testExclusionSet() throws Exception {
|
||||||
|
CharArraySet exclusionSet = new CharArraySet(asSet("நண்பர்கள்"), false);
|
||||||
|
Analyzer a = new TamilAnalyzer(TamilAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTerm(a, "நண்பர்கள்", "நண்பர்கள்");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test we fold digits to latin-1 */
|
||||||
|
public void testDigits() throws Exception {
|
||||||
|
TamilAnalyzer a = new TamilAnalyzer();
|
||||||
|
checkOneTerm(a, "௧௨௩௪", "1234");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** tamil doesn't have case, but test we case-fold any latin-1 etc */
|
||||||
|
public void testLowerCase() throws Exception {
|
||||||
|
TamilAnalyzer a = new TamilAnalyzer();
|
||||||
|
checkOneTerm(a, "FIFA", "fifa");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws Exception {
|
||||||
|
Analyzer analyzer = new TamilAnalyzer();
|
||||||
|
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue