mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 05:19:17 +00:00
LUCENE-10095: Nepali Analyzer (#290)
Add Nepali analyzer based on snowball stemmer and NLTK stopwords
This commit is contained in:
parent
cc8c4283dd
commit
8bce765218
@ -15,6 +15,10 @@ New Features
|
|||||||
|
|
||||||
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
||||||
|
|
||||||
|
* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
|
||||||
|
|
||||||
|
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
|
||||||
|
|
||||||
System Requirements
|
System Requirements
|
||||||
|
|
||||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||||
@ -217,8 +221,6 @@ Improvements
|
|||||||
with doc values and points. In this case, there is an assumption that the same data is
|
with doc values and points. In this case, there is an assumption that the same data is
|
||||||
stored in these points and doc values (Mayya Sharipova, Jim Ferenczi, Adrien Grand)
|
stored in these points and doc values (Mayya Sharipova, Jim Ferenczi, Adrien Grand)
|
||||||
|
|
||||||
* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
|
|
||||||
|
|
||||||
* LUCENE-9449: Enhance DocComparator to provide an iterator over competitive
|
* LUCENE-9449: Enhance DocComparator to provide an iterator over competitive
|
||||||
documents when searching with "after". This iterator can quickly position
|
documents when searching with "after". This iterator can quickly position
|
||||||
on the desired "after" document skipping all documents and segments before
|
on the desired "after" document skipping all documents and segments before
|
||||||
|
@ -0,0 +1,134 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ne;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.UncheckedIOException;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||||
|
import org.apache.lucene.analysis.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||||
|
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||||
|
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
|
import org.tartarus.snowball.ext.NepaliStemmer;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Analyzer for Nepali.
|
||||||
|
*
|
||||||
|
* @since 9.0
|
||||||
|
*/
|
||||||
|
public final class NepaliAnalyzer extends StopwordAnalyzerBase {
|
||||||
|
private final CharArraySet stemExclusionSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File containing default Nepali stopwords.
|
||||||
|
*
|
||||||
|
* <p>Default stopword list is from the Apache2 python project NLTK:
|
||||||
|
* https://github.com/nltk/nltk_data
|
||||||
|
*/
|
||||||
|
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||||
|
|
||||||
|
private static final String STOPWORDS_COMMENT = "#";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an unmodifiable instance of the default stop-words set.
|
||||||
|
*
|
||||||
|
* @return an unmodifiable instance of the default stop-words set.
|
||||||
|
*/
|
||||||
|
public static CharArraySet getDefaultStopSet() {
|
||||||
|
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
|
||||||
|
* static final set the first time.;
|
||||||
|
*/
|
||||||
|
private static class DefaultSetHolder {
|
||||||
|
static final CharArraySet DEFAULT_STOP_SET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
DEFAULT_STOP_SET =
|
||||||
|
loadStopwordSet(false, NepaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||||
|
} catch (IOException ex) {
|
||||||
|
// default set should always be present as it is part of the
|
||||||
|
// distribution (JAR)
|
||||||
|
throw new UncheckedIOException("Unable to load default stopword set", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
* @param stemExclusionSet a stemming exclusion set
|
||||||
|
*/
|
||||||
|
public NepaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||||
|
super(stopwords);
|
||||||
|
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words
|
||||||
|
*
|
||||||
|
* @param stopwords a stopword set
|
||||||
|
*/
|
||||||
|
public NepaliAnalyzer(CharArraySet stopwords) {
|
||||||
|
this(stopwords, CharArraySet.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
|
||||||
|
public NepaliAnalyzer() {
|
||||||
|
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
|
||||||
|
* the text in the provided {@link Reader}.
|
||||||
|
*
|
||||||
|
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
|
||||||
|
* StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
|
||||||
|
* {@link IndicNormalizationFilter}, {@link SetKeywordMarkerFilter} if a stem exclusion set is
|
||||||
|
* provided, {@link SnowballFilter}, and Nepali Stop words
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName) {
|
||||||
|
final Tokenizer source = new StandardTokenizer();
|
||||||
|
TokenStream result = new LowerCaseFilter(source);
|
||||||
|
result = new DecimalDigitFilter(result);
|
||||||
|
if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
result = new StopFilter(result, stopwords);
|
||||||
|
result = new SnowballFilter(result, new NepaliStemmer());
|
||||||
|
return new TokenStreamComponents(source, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||||
|
TokenStream result = new LowerCaseFilter(in);
|
||||||
|
result = new DecimalDigitFilter(result);
|
||||||
|
result = new IndicNormalizationFilter(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,19 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Analyzer for Nepali. */
|
||||||
|
package org.apache.lucene.analysis.ne;
|
@ -0,0 +1,256 @@
|
|||||||
|
# nepali stopwords from https://github.com/nltk/nltk_data
|
||||||
|
छ
|
||||||
|
र
|
||||||
|
पनि
|
||||||
|
छन्
|
||||||
|
लागि
|
||||||
|
भएको
|
||||||
|
गरेको
|
||||||
|
भने
|
||||||
|
गर्न
|
||||||
|
गर्ने
|
||||||
|
हो
|
||||||
|
तथा
|
||||||
|
यो
|
||||||
|
रहेको
|
||||||
|
उनले
|
||||||
|
थियो
|
||||||
|
हुने
|
||||||
|
गरेका
|
||||||
|
थिए
|
||||||
|
गर्दै
|
||||||
|
तर
|
||||||
|
नै
|
||||||
|
को
|
||||||
|
मा
|
||||||
|
हुन्
|
||||||
|
भन्ने
|
||||||
|
हुन
|
||||||
|
गरी
|
||||||
|
त
|
||||||
|
हुन्छ
|
||||||
|
अब
|
||||||
|
के
|
||||||
|
रहेका
|
||||||
|
गरेर
|
||||||
|
छैन
|
||||||
|
दिए
|
||||||
|
भए
|
||||||
|
यस
|
||||||
|
ले
|
||||||
|
गर्नु
|
||||||
|
औं
|
||||||
|
सो
|
||||||
|
त्यो
|
||||||
|
कि
|
||||||
|
जुन
|
||||||
|
यी
|
||||||
|
का
|
||||||
|
गरि
|
||||||
|
ती
|
||||||
|
न
|
||||||
|
छु
|
||||||
|
छौं
|
||||||
|
लाई
|
||||||
|
नि
|
||||||
|
उप
|
||||||
|
अक्सर
|
||||||
|
आदि
|
||||||
|
कसरी
|
||||||
|
क्रमशः
|
||||||
|
चाले
|
||||||
|
अगाडी
|
||||||
|
अझै
|
||||||
|
अनुसार
|
||||||
|
अन्तर्गत
|
||||||
|
अन्य
|
||||||
|
अन्यत्र
|
||||||
|
अन्यथा
|
||||||
|
अरु
|
||||||
|
अरुलाई
|
||||||
|
अर्को
|
||||||
|
अर्थात
|
||||||
|
अर्थात्
|
||||||
|
अलग
|
||||||
|
आए
|
||||||
|
आजको
|
||||||
|
ओठ
|
||||||
|
आत्म
|
||||||
|
आफू
|
||||||
|
आफूलाई
|
||||||
|
आफ्नै
|
||||||
|
आफ्नो
|
||||||
|
आयो
|
||||||
|
उदाहरण
|
||||||
|
उनको
|
||||||
|
उहालाई
|
||||||
|
एउटै
|
||||||
|
एक
|
||||||
|
एकदम
|
||||||
|
कतै
|
||||||
|
कम से कम
|
||||||
|
कसै
|
||||||
|
कसैले
|
||||||
|
कहाँबाट
|
||||||
|
कहिलेकाहीं
|
||||||
|
का
|
||||||
|
किन
|
||||||
|
किनभने
|
||||||
|
कुनै
|
||||||
|
कुरा
|
||||||
|
कृपया
|
||||||
|
केही
|
||||||
|
कोही
|
||||||
|
गए
|
||||||
|
गरौं
|
||||||
|
गर्छ
|
||||||
|
गर्छु
|
||||||
|
गर्नुपर्छ
|
||||||
|
गयौ
|
||||||
|
गैर
|
||||||
|
चार
|
||||||
|
चाहनुहुन्छ
|
||||||
|
चाहन्छु
|
||||||
|
चाहिए
|
||||||
|
छू
|
||||||
|
जताततै
|
||||||
|
जब
|
||||||
|
जबकि
|
||||||
|
जसको
|
||||||
|
जसबाट
|
||||||
|
जसमा
|
||||||
|
जसलाई
|
||||||
|
जसले
|
||||||
|
जस्तै
|
||||||
|
जस्तो
|
||||||
|
जस्तोसुकै
|
||||||
|
जहाँ
|
||||||
|
जान
|
||||||
|
जाहिर
|
||||||
|
जे
|
||||||
|
जो
|
||||||
|
ठीक
|
||||||
|
तत्काल
|
||||||
|
तदनुसार
|
||||||
|
तपाईको
|
||||||
|
तपाई
|
||||||
|
पर्याप्त
|
||||||
|
पहिले
|
||||||
|
पहिलो
|
||||||
|
पहिल्यै
|
||||||
|
पाँच
|
||||||
|
पाँचौं
|
||||||
|
तल
|
||||||
|
तापनी
|
||||||
|
तिनी
|
||||||
|
तिनीहरू
|
||||||
|
तिनीहरुको
|
||||||
|
तिनिहरुलाई
|
||||||
|
तिमी
|
||||||
|
तिर
|
||||||
|
तीन
|
||||||
|
तुरुन्तै
|
||||||
|
तेस्रो
|
||||||
|
तेस्कारण
|
||||||
|
पूर्व
|
||||||
|
प्रति
|
||||||
|
प्रतेक
|
||||||
|
प्लस
|
||||||
|
फेरी
|
||||||
|
बने
|
||||||
|
त्सपछि
|
||||||
|
त्सैले
|
||||||
|
त्यहाँ
|
||||||
|
थिएन
|
||||||
|
दिनुभएको
|
||||||
|
दिनुहुन्छ
|
||||||
|
दुई
|
||||||
|
देखि
|
||||||
|
बरु
|
||||||
|
बारे
|
||||||
|
बाहिर
|
||||||
|
देखिन्छ
|
||||||
|
देखियो
|
||||||
|
देखे
|
||||||
|
देखेको
|
||||||
|
देखेर
|
||||||
|
दोस्रो
|
||||||
|
धेरै
|
||||||
|
नजिकै
|
||||||
|
नत्र
|
||||||
|
नयाँ
|
||||||
|
निम्ति
|
||||||
|
बाहेक
|
||||||
|
बीच
|
||||||
|
बीचमा
|
||||||
|
भन
|
||||||
|
निम्न
|
||||||
|
निम्नानुसार
|
||||||
|
निर्दिष्ट
|
||||||
|
नौ
|
||||||
|
पक्का
|
||||||
|
पक्कै
|
||||||
|
पछि
|
||||||
|
पछिल्लो
|
||||||
|
पटक
|
||||||
|
पर्छ
|
||||||
|
पर्थ्यो
|
||||||
|
भन्छन्
|
||||||
|
भन्
|
||||||
|
भन्छु
|
||||||
|
भन्दा
|
||||||
|
भन्नुभयो
|
||||||
|
भर
|
||||||
|
भित्र
|
||||||
|
भित्री
|
||||||
|
म
|
||||||
|
मलाई
|
||||||
|
मात्र
|
||||||
|
माथि
|
||||||
|
मुख्य
|
||||||
|
मेरो
|
||||||
|
यति
|
||||||
|
यथोचित
|
||||||
|
यदि
|
||||||
|
यद्यपि
|
||||||
|
यसको
|
||||||
|
यसपछि
|
||||||
|
यसबाहेक
|
||||||
|
यसरी
|
||||||
|
यसो
|
||||||
|
यस्तो
|
||||||
|
यहाँ
|
||||||
|
यहाँसम्म
|
||||||
|
या
|
||||||
|
रही
|
||||||
|
राखे
|
||||||
|
राख्छ
|
||||||
|
राम्रो
|
||||||
|
रूप
|
||||||
|
लगभग
|
||||||
|
वरीपरी
|
||||||
|
वास्तवमा
|
||||||
|
बिरुद्ध
|
||||||
|
बिशेष
|
||||||
|
सायद
|
||||||
|
शायद
|
||||||
|
संग
|
||||||
|
संगै
|
||||||
|
सक्छ
|
||||||
|
सट्टा
|
||||||
|
सधै
|
||||||
|
सबै
|
||||||
|
सबैलाई
|
||||||
|
समय
|
||||||
|
सम्भव
|
||||||
|
सम्म
|
||||||
|
सही
|
||||||
|
साँच्चै
|
||||||
|
सात
|
||||||
|
साथ
|
||||||
|
साथै
|
||||||
|
सारा
|
||||||
|
सोही
|
||||||
|
स्पष्ट
|
||||||
|
हरे
|
||||||
|
हरेक
|
@ -0,0 +1,76 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.ne;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.CharArraySet;
|
||||||
|
|
||||||
|
/** Tests the NepaliAnalyzer */
|
||||||
|
public class TestNepaliAnalyzer extends BaseTokenStreamTestCase {
|
||||||
|
/** This test fails with NPE when the stopwords file is missing in classpath */
|
||||||
|
public void testResourcesAvailable() {
|
||||||
|
new NepaliAnalyzer().close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test that snowball stemmer is hooked in correctly */
|
||||||
|
public void testStemming() throws Exception {
|
||||||
|
Analyzer a = new NepaliAnalyzer();
|
||||||
|
// friend
|
||||||
|
checkOneTerm(a, "मित्र", "मित्र");
|
||||||
|
// friends
|
||||||
|
checkOneTerm(a, "मित्रहरु", "मित्र");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testStopwords() throws Exception {
|
||||||
|
Analyzer a = new NepaliAnalyzer();
|
||||||
|
assertAnalyzesTo(
|
||||||
|
a,
|
||||||
|
"सबै व्यक्तिहरू जन्मजात स्वतन्त्र हुन् ती सबैको समान अधिकार र महत्व",
|
||||||
|
new String[] {"व्यक्ति", "जन्मजात", "स्वतन्त्र", "सबै", "समान", "अधिकार", "महत्व"});
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** nepali has no case, but any latin-1 etc should be casefolded */
|
||||||
|
public void testLowerCase() throws Exception {
|
||||||
|
Analyzer a = new NepaliAnalyzer();
|
||||||
|
checkOneTerm(a, "FIFA", "fifa");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testExclusionSet() throws Exception {
|
||||||
|
CharArraySet exclusionSet = new CharArraySet(asSet("मित्रहरु"), false);
|
||||||
|
Analyzer a = new NepaliAnalyzer(NepaliAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||||
|
checkOneTerm(a, "मित्रहरु", "मित्रहरु");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** test we fold digits to latin-1 */
|
||||||
|
public void testDigits() throws Exception {
|
||||||
|
NepaliAnalyzer a = new NepaliAnalyzer();
|
||||||
|
checkOneTerm(a, "१२३४", "1234");
|
||||||
|
a.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** blast some random strings through the analyzer */
|
||||||
|
public void testRandomStrings() throws Exception {
|
||||||
|
Analyzer analyzer = new NepaliAnalyzer();
|
||||||
|
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||||
|
analyzer.close();
|
||||||
|
}
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user