LUCENE-10095: Nepali Analyzer (#290)

Add Nepali analyzer based on snowball stemmer and NLTK stopwords
This commit is contained in:
Robert Muir 2021-09-10 20:45:23 -04:00 committed by GitHub
parent cc8c4283dd
commit 8bce765218
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 489 additions and 2 deletions

View File

@ -15,6 +15,10 @@ New Features
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
System Requirements
* LUCENE-8738: Move to Java 11 as minimum Java version.
@ -217,8 +221,6 @@ Improvements
with doc values and points. In this case, there is an assumption that the same data is
stored in these points and doc values (Mayya Sharipova, Jim Ferenczi, Adrien Grand)
* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
* LUCENE-9449: Enhance DocComparator to provide an iterator over competitive
documents when searching with "after". This iterator can quickly position
on the desired "after" document skipping all documents and segments before

View File

@ -0,0 +1,134 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ne;
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.tartarus.snowball.ext.NepaliStemmer;
/**
 * {@link org.apache.lucene.analysis.Analyzer} for Nepali text.
 *
 * <p>Tokens produced by a {@link StandardTokenizer} are lower-cased, have their decimal digits
 * folded, are Indic-normalized, stop-filtered and finally stemmed with the Snowball {@link
 * NepaliStemmer}.
 *
 * @since 9.0
 */
public final class NepaliAnalyzer extends StopwordAnalyzerBase {

  /**
   * Name of the resource containing the default Nepali stopwords.
   *
   * <p>Default stopword list is from the Apache2 python project NLTK:
   * https://github.com/nltk/nltk_data
   */
  public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";

  /** Comment marker handed to the stopword loader together with the stopword file. */
  private static final String STOPWORDS_COMMENT = "#";

  /** Terms marked as keywords before stemming; always an unmodifiable private copy. */
  private final CharArraySet stemExclusionSet;

  /**
   * Returns an unmodifiable instance of the default stop-words set.
   *
   * @return an unmodifiable instance of the default stop-words set.
   */
  public static CharArraySet getDefaultStopSet() {
    return DefaultSetHolder.DEFAULT_STOP_SET;
  }

  /**
   * Holder whose static initialization loads the default stop set lazily, the first time the
   * outer class reads {@link #DEFAULT_STOP_SET}.
   */
  private static class DefaultSetHolder {
    static final CharArraySet DEFAULT_STOP_SET = readDefaultStopSet();

    /**
     * Loads {@link NepaliAnalyzer#DEFAULT_STOPWORD_FILE} relative to {@link NepaliAnalyzer}.
     *
     * @throws UncheckedIOException if the stopword resource cannot be read
     */
    private static CharArraySet readDefaultStopSet() {
      try {
        return loadStopwordSet(
            false, NepaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
      } catch (IOException ex) {
        // default set should always be present as it is part of the
        // distribution (JAR)
        throw new UncheckedIOException("Unable to load default stopword set", ex);
      }
    }
  }

  /**
   * Builds an analyzer with the given stop words and stemming exclusion set.
   *
   * @param stopwords a stopword set
   * @param stemExclusionSet a stemming exclusion set; it is copied, so later changes to the
   *     argument do not affect this analyzer
   */
  public NepaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
    super(stopwords);
    final CharArraySet exclusions = CharArraySet.copy(stemExclusionSet);
    this.stemExclusionSet = CharArraySet.unmodifiableSet(exclusions);
  }

  /**
   * Builds an analyzer with the given stop words and an empty stemming exclusion set.
   *
   * @param stopwords a stopword set
   */
  public NepaliAnalyzer(CharArraySet stopwords) {
    this(stopwords, CharArraySet.EMPTY_SET);
  }

  /** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
  public NepaliAnalyzer() {
    this(getDefaultStopSet());
  }

  /**
   * Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
   * the text in the provided {@link Reader}.
   *
   * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
   *     StandardTokenizer} filtered, in this order, with {@link LowerCaseFilter}, {@link
   *     DecimalDigitFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is provided,
   *     {@link IndicNormalizationFilter}, {@link StopFilter} with the Nepali stop words, and
   *     {@link SnowballFilter}
   */
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new DecimalDigitFilter(new LowerCaseFilter(tokenizer));
    if (!stemExclusionSet.isEmpty()) {
      chain = new SetKeywordMarkerFilter(chain, stemExclusionSet);
    }
    chain = new IndicNormalizationFilter(chain);
    // Stop words are removed before stemming.
    chain = new SnowballFilter(new StopFilter(chain, stopwords), new NepaliStemmer());
    return new TokenStreamComponents(tokenizer, chain);
  }

  /**
   * Wraps {@code in} with the lower-casing, digit-folding and Indic-normalization filters of the
   * analysis chain; no stop-word filter or stemmer is applied here.
   */
  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    return new IndicNormalizationFilter(new DecimalDigitFilter(new LowerCaseFilter(in)));
  }
}

View File

@ -0,0 +1,19 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Analyzer for Nepali. */
package org.apache.lucene.analysis.ne;

View File

@ -0,0 +1,256 @@
# nepali stopwords from https://github.com/nltk/nltk_data
पनि
छन्
लागि
भएको
गरेको
भने
गर्न
गर्ने
हो
तथा
यो
रहेको
उनले
थियो
हुने
गरेका
थिए
गर्दै
तर
नै
को
मा
हुन्
भन्ने
हुन
गरी
हुन्छ
अब
के
रहेका
गरेर
छैन
दिए
भए
यस
ले
गर्नु
औं
सो
त्यो
कि
जुन
यी
का
गरि
ती
छु
छौं
लाई
नि
उप
अक्सर
आदि
कसरी
क्रमशः
चाले
अगाडी
अझै
अनुसार
अन्तर्गत
अन्य
अन्यत्र
अन्यथा
अरु
अरुलाई
अर्को
अर्थात
अर्थात्
अलग
आए
आजको
ओठ
आत्म
आफू
आफूलाई
आफ्नै
आफ्नो
आयो
उदाहरण
उनको
उहालाई
एउटै
एक
एकदम
कतै
कम से कम
कसै
कसैले
कहाँबाट
कहिलेकाहीं
का
किन
किनभने
कुनै
कुरा
कृपया
केही
कोही
गए
गरौं
गर्छ
गर्छु
गर्नुपर्छ
गयौ
गैर
चार
चाहनुहुन्छ
चाहन्छु
चाहिए
छू
जताततै
जब
जबकि
जसको
जसबाट
जसमा
जसलाई
जसले
जस्तै
जस्तो
जस्तोसुकै
जहाँ
जान
जाहिर
जे
जो
ठीक
तत्काल
तदनुसार
तपाईको
तपाई
पर्याप्त
पहिले
पहिलो
पहिल्यै
पाँच
पाँचौं
तल
तापनी
तिनी
तिनीहरू
तिनीहरुको
तिनिहरुलाई
तिमी
तिर
तीन
तुरुन्तै
तेस्रो
तेस्कारण
पूर्व
प्रति
प्रतेक
प्लस
फेरी
बने
त्सपछि
त्सैले
त्यहाँ
थिएन
दिनुभएको
दिनुहुन्छ
दुई
देखि
बरु
बारे
बाहिर
देखिन्छ
देखियो
देखे
देखेको
देखेर
दोस्रो
धेरै
नजिकै
नत्र
नयाँ
निम्ति
बाहेक
बीच
बीचमा
भन
निम्न
निम्नानुसार
निर्दिष्ट
नौ
पक्का
पक्कै
पछि
पछिल्लो
पटक
पर्छ
पर्थ्यो
भन्छन्
भन्
भन्छु
भन्दा
भन्नुभयो
भर
भित्र
भित्री
मलाई
मात्र
माथि
मुख्य
मेरो
यति
यथोचित
यदि
यद्यपि
यसको
यसपछि
यसबाहेक
यसरी
यसो
यस्तो
यहाँ
यहाँसम्म
या
रही
राखे
राख्छ
राम्रो
रूप
लगभग
वरीपरी
वास्तवमा
बिरुद्ध
बिशेष
सायद
शायद
संग
संगै
सक्छ
सट्टा
सधै
सबै
सबैलाई
समय
सम्भव
सम्म
सही
साँच्चै
सात
साथ
साथै
सारा
सोही
स्पष्ट
हरे
हरेक

View File

@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ne;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
/**
 * Tests the NepaliAnalyzer.
 *
 * <p>Analyzers are opened with try-with-resources so they are closed even when an assertion
 * fails part-way through a test.
 */
public class TestNepaliAnalyzer extends BaseTokenStreamTestCase {

  /** This test fails when the default stopwords file cannot be loaded from the classpath */
  public void testResourcesAvailable() {
    new NepaliAnalyzer().close();
  }

  /** test that snowball stemmer is hooked in correctly */
  public void testStemming() throws Exception {
    try (Analyzer a = new NepaliAnalyzer()) {
      // friend
      checkOneTerm(a, "मित्र", "मित्र");
      // friends
      checkOneTerm(a, "मित्रहरु", "मित्र");
    }
  }

  /**
   * test that stopwords are removed; the expected output still contains "सबै" because it is
   * what the analyzer emits for the non-stopword "सबैको"
   */
  public void testStopwords() throws Exception {
    try (Analyzer a = new NepaliAnalyzer()) {
      assertAnalyzesTo(
          a,
          "सबै व्यक्तिहरू जन्मजात स्वतन्त्र हुन् ती सबैको समान अधिकार र महत्व",
          new String[] {"व्यक्ति", "जन्मजात", "स्वतन्त्र", "सबै", "समान", "अधिकार", "महत्व"});
    }
  }

  /** nepali has no case, but any latin-1 etc should be casefolded */
  public void testLowerCase() throws Exception {
    try (Analyzer a = new NepaliAnalyzer()) {
      checkOneTerm(a, "FIFA", "fifa");
    }
  }

  /** test that a term in the stem exclusion set is left unstemmed */
  public void testExclusionSet() throws Exception {
    CharArraySet exclusionSet = new CharArraySet(asSet("मित्रहरु"), false);
    try (Analyzer a = new NepaliAnalyzer(NepaliAnalyzer.getDefaultStopSet(), exclusionSet)) {
      checkOneTerm(a, "मित्रहरु", "मित्रहरु");
    }
  }

  /** test we fold digits to latin-1 */
  public void testDigits() throws Exception {
    try (Analyzer a = new NepaliAnalyzer()) {
      checkOneTerm(a, "१२३४", "1234");
    }
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    try (Analyzer analyzer = new NepaliAnalyzer()) {
      checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
    }
  }
}