mirror of https://github.com/apache/lucene.git
LUCENE-10096: Tamil Analyzer (#292)
Add Tamil analyzer based on snowball stemmer and TamilNLP stopwords
This commit is contained in:
parent
8bce765218
commit
24aa45dc3e
|
@ -19,6 +19,8 @@ New Features
|
|||
|
||||
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
|
||||
|
||||
* LUCENE-10096: Add TamilAnalyzer based on the snowball stemmer. (Robert Muir)
|
||||
|
||||
System Requirements
|
||||
|
||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ta;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.UncheckedIOException;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.tartarus.snowball.ext.TamilStemmer;
|
||||
|
||||
/**
|
||||
* Analyzer for Tamil.
|
||||
*
|
||||
* @since 9.0
|
||||
*/
|
||||
public final class TamilAnalyzer extends StopwordAnalyzerBase {
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* File containing default Tamil stopwords.
|
||||
*
|
||||
* <p>Default stopword list is from https://github.com/AshokR/TamilNLP (Apache 2 License)
|
||||
*/
|
||||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
private static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
*
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static CharArraySet getDefaultStopSet() {
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
|
||||
* static final set the first time.;
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET =
|
||||
loadStopwordSet(false, TamilAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new UncheckedIOException("Unable to load default stopword set", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a stemming exclusion set
|
||||
*/
|
||||
public TamilAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public TamilAnalyzer(CharArraySet stopwords) {
|
||||
this(stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
|
||||
public TamilAnalyzer() {
|
||||
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
|
||||
* the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
|
||||
* StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
|
||||
* {@link IndicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
|
||||
* provided, {@link SnowballFilter}, and Tamil Stop words
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
final Tokenizer source = new StandardTokenizer();
|
||||
TokenStream result = new LowerCaseFilter(source);
|
||||
result = new DecimalDigitFilter(result);
|
||||
if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new StopFilter(result, stopwords);
|
||||
result = new SnowballFilter(result, new TamilStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new LowerCaseFilter(in);
|
||||
result = new DecimalDigitFilter(result);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Analyzer for Tamil. */
|
||||
package org.apache.lucene.analysis.ta;
|
|
@ -0,0 +1,126 @@
|
|||
# tamil stopwords from https://github.com/AshokR/TamilNLP
|
||||
ஒரு
|
||||
என்று
|
||||
மற்றும்
|
||||
இந்த
|
||||
இது
|
||||
என்ற
|
||||
கொண்டு
|
||||
என்பது
|
||||
பல
|
||||
ஆகும்
|
||||
அல்லது
|
||||
அவர்
|
||||
நான்
|
||||
உள்ள
|
||||
அந்த
|
||||
இவர்
|
||||
என
|
||||
முதல்
|
||||
என்ன
|
||||
இருந்து
|
||||
சில
|
||||
என்
|
||||
போன்ற
|
||||
வேண்டும்
|
||||
வந்து
|
||||
இதன்
|
||||
அது
|
||||
அவன்
|
||||
தான்
|
||||
பலரும்
|
||||
என்னும்
|
||||
மேலும்
|
||||
பின்னர்
|
||||
கொண்ட
|
||||
இருக்கும்
|
||||
தனது
|
||||
உள்ளது
|
||||
போது
|
||||
என்றும்
|
||||
அதன்
|
||||
தன்
|
||||
பிறகு
|
||||
அவர்கள்
|
||||
வரை
|
||||
அவள்
|
||||
நீ
|
||||
ஆகிய
|
||||
இருந்தது
|
||||
உள்ளன
|
||||
வந்த
|
||||
இருந்த
|
||||
மிகவும்
|
||||
இங்கு
|
||||
மீது
|
||||
ஓர்
|
||||
இவை
|
||||
இந்தக்
|
||||
பற்றி
|
||||
வரும்
|
||||
வேறு
|
||||
இரு
|
||||
இதில்
|
||||
போல்
|
||||
இப்போது
|
||||
அவரது
|
||||
மட்டும்
|
||||
இந்தப்
|
||||
எனும்
|
||||
மேல்
|
||||
பின்
|
||||
சேர்ந்த
|
||||
ஆகியோர்
|
||||
எனக்கு
|
||||
இன்னும்
|
||||
அந்தப்
|
||||
அன்று
|
||||
ஒரே
|
||||
மிக
|
||||
அங்கு
|
||||
பல்வேறு
|
||||
விட்டு
|
||||
பெரும்
|
||||
அதை
|
||||
பற்றிய
|
||||
உன்
|
||||
அதிக
|
||||
அந்தக்
|
||||
பேர்
|
||||
இதனால்
|
||||
அவை
|
||||
அதே
|
||||
ஏன்
|
||||
முறை
|
||||
யார்
|
||||
என்பதை
|
||||
எல்லாம்
|
||||
மட்டுமே
|
||||
இங்கே
|
||||
அங்கே
|
||||
இடம்
|
||||
இடத்தில்
|
||||
அதில்
|
||||
நாம்
|
||||
அதற்கு
|
||||
எனவே
|
||||
பிற
|
||||
சிறு
|
||||
மற்ற
|
||||
விட
|
||||
எந்த
|
||||
எனவும்
|
||||
எனப்படும்
|
||||
எனினும்
|
||||
அடுத்த
|
||||
இதனை
|
||||
இதை
|
||||
கொள்ள
|
||||
இந்தத்
|
||||
இதற்கு
|
||||
அதனால்
|
||||
தவிர
|
||||
போல
|
||||
வரையில்
|
||||
சற்று
|
||||
எனக்
|
|
@ -0,0 +1,67 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ta;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
/** Tests the TamilAnalyzer */
|
||||
public class TestTamilAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the stopwords file is missing in classpath */
|
||||
public void testResourcesAvailable() {
|
||||
new TamilAnalyzer().close();
|
||||
}
|
||||
|
||||
/** test that snowball stemmer is hooked in */
|
||||
public void testStemming() throws Exception {
|
||||
Analyzer a = new TamilAnalyzer();
|
||||
// friend
|
||||
checkOneTerm(a, "நண்பன்", "நண்");
|
||||
// friends
|
||||
checkOneTerm(a, "நண்பர்கள்", "நண்");
|
||||
a.close();
|
||||
}
|
||||
|
||||
public void testExclusionSet() throws Exception {
|
||||
CharArraySet exclusionSet = new CharArraySet(asSet("நண்பர்கள்"), false);
|
||||
Analyzer a = new TamilAnalyzer(TamilAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTerm(a, "நண்பர்கள்", "நண்பர்கள்");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** test we fold digits to latin-1 */
|
||||
public void testDigits() throws Exception {
|
||||
TamilAnalyzer a = new TamilAnalyzer();
|
||||
checkOneTerm(a, "௧௨௩௪", "1234");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** tamil doesn't have case, but test we case-fold any latin-1 etc */
|
||||
public void testLowerCase() throws Exception {
|
||||
TamilAnalyzer a = new TamilAnalyzer();
|
||||
checkOneTerm(a, "FIFA", "fifa");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer analyzer = new TamilAnalyzer();
|
||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||
analyzer.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue