mirror of
https://github.com/apache/lucene.git
synced 2025-02-28 05:19:17 +00:00
LUCENE-10095: Nepali Analyzer (#290)
Add Nepali analyzer based on snowball stemmer and NLTK stopwords
This commit is contained in:
parent
cc8c4283dd
commit
8bce765218
@ -15,6 +15,10 @@ New Features
|
||||
|
||||
* LUCENE-9589: Swedish Minimal Stemmer (janhoy)
|
||||
|
||||
* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
|
||||
|
||||
* LUCENE-10095: Add NepaliAnalyzer based on the snowball stemmer. (Robert Muir)
|
||||
|
||||
System Requirements
|
||||
|
||||
* LUCENE-8738: Move to Java 11 as minimum Java version.
|
||||
@ -217,8 +221,6 @@ Improvements
|
||||
with doc values and points. In this case, there is an assumption that the same data is
|
||||
stored in these points and doc values (Mayya Sharipova, Jim Ferenczi, Adrien Grand)
|
||||
|
||||
* LUCENE-9313: Add SerbianAnalyzer based on the snowball stemmer. (Dragan Ivanovic)
|
||||
|
||||
* LUCENE-9449: Enhance DocComparator to provide an iterator over competitive
|
||||
documents when searching with "after". This iterator can quickly position
|
||||
on the desired "after" document skipping all documents and segments before
|
||||
|
@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ne;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.UncheckedIOException;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.StopFilter;
|
||||
import org.apache.lucene.analysis.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.tartarus.snowball.ext.NepaliStemmer;
|
||||
|
||||
/**
|
||||
* Analyzer for Nepali.
|
||||
*
|
||||
* @since 9.0
|
||||
*/
|
||||
public final class NepaliAnalyzer extends StopwordAnalyzerBase {
|
||||
private final CharArraySet stemExclusionSet;
|
||||
|
||||
/**
|
||||
* File containing default Nepali stopwords.
|
||||
*
|
||||
* <p>Default stopword list is from the Apache2 python project NLTK:
|
||||
* https://github.com/nltk/nltk_data
|
||||
*/
|
||||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
|
||||
|
||||
private static final String STOPWORDS_COMMENT = "#";
|
||||
|
||||
/**
|
||||
* Returns an unmodifiable instance of the default stop-words set.
|
||||
*
|
||||
* @return an unmodifiable instance of the default stop-words set.
|
||||
*/
|
||||
public static CharArraySet getDefaultStopSet() {
|
||||
return DefaultSetHolder.DEFAULT_STOP_SET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class accesses the
|
||||
* static final set the first time.
|
||||
*/
|
||||
private static class DefaultSetHolder {
|
||||
static final CharArraySet DEFAULT_STOP_SET;
|
||||
|
||||
static {
|
||||
try {
|
||||
DEFAULT_STOP_SET =
|
||||
loadStopwordSet(false, NepaliAnalyzer.class, DEFAULT_STOPWORD_FILE, STOPWORDS_COMMENT);
|
||||
} catch (IOException ex) {
|
||||
// default set should always be present as it is part of the
|
||||
// distribution (JAR)
|
||||
throw new UncheckedIOException("Unable to load default stopword set", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words and stemming exclusion set.
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
* @param stemExclusionSet a stemming exclusion set
|
||||
*/
|
||||
public NepaliAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) {
|
||||
super(stopwords);
|
||||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet));
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words and an empty stemming exclusion set.
|
||||
*
|
||||
* @param stopwords a stopword set
|
||||
*/
|
||||
public NepaliAnalyzer(CharArraySet stopwords) {
|
||||
this(stopwords, CharArraySet.EMPTY_SET);
|
||||
}
|
||||
|
||||
/** Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. */
|
||||
public NepaliAnalyzer() {
|
||||
this(DefaultSetHolder.DEFAULT_STOP_SET);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} used to tokenize all
|
||||
* the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built from a {@link
|
||||
* StandardTokenizer} filtered with {@link LowerCaseFilter}, {@link DecimalDigitFilter},
|
||||
* {@link SetKeywordMarkerFilter} if a stem exclusion set is provided, {@link
|
||||
* IndicNormalizationFilter}, {@link StopFilter} with Nepali stop words, and {@link SnowballFilter}
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName) {
|
||||
final Tokenizer source = new StandardTokenizer();
|
||||
TokenStream result = new LowerCaseFilter(source);
|
||||
result = new DecimalDigitFilter(result);
|
||||
if (!stemExclusionSet.isEmpty()) result = new SetKeywordMarkerFilter(result, stemExclusionSet);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
result = new StopFilter(result, stopwords);
|
||||
result = new SnowballFilter(result, new NepaliStemmer());
|
||||
return new TokenStreamComponents(source, result);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStream normalize(String fieldName, TokenStream in) {
|
||||
TokenStream result = new LowerCaseFilter(in);
|
||||
result = new DecimalDigitFilter(result);
|
||||
result = new IndicNormalizationFilter(result);
|
||||
return result;
|
||||
}
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** Analyzer for Nepali. */
|
||||
package org.apache.lucene.analysis.ne;
|
@ -0,0 +1,256 @@
|
||||
# nepali stopwords from https://github.com/nltk/nltk_data
|
||||
छ
|
||||
र
|
||||
पनि
|
||||
छन्
|
||||
लागि
|
||||
भएको
|
||||
गरेको
|
||||
भने
|
||||
गर्न
|
||||
गर्ने
|
||||
हो
|
||||
तथा
|
||||
यो
|
||||
रहेको
|
||||
उनले
|
||||
थियो
|
||||
हुने
|
||||
गरेका
|
||||
थिए
|
||||
गर्दै
|
||||
तर
|
||||
नै
|
||||
को
|
||||
मा
|
||||
हुन्
|
||||
भन्ने
|
||||
हुन
|
||||
गरी
|
||||
त
|
||||
हुन्छ
|
||||
अब
|
||||
के
|
||||
रहेका
|
||||
गरेर
|
||||
छैन
|
||||
दिए
|
||||
भए
|
||||
यस
|
||||
ले
|
||||
गर्नु
|
||||
औं
|
||||
सो
|
||||
त्यो
|
||||
कि
|
||||
जुन
|
||||
यी
|
||||
का
|
||||
गरि
|
||||
ती
|
||||
न
|
||||
छु
|
||||
छौं
|
||||
लाई
|
||||
नि
|
||||
उप
|
||||
अक्सर
|
||||
आदि
|
||||
कसरी
|
||||
क्रमशः
|
||||
चाले
|
||||
अगाडी
|
||||
अझै
|
||||
अनुसार
|
||||
अन्तर्गत
|
||||
अन्य
|
||||
अन्यत्र
|
||||
अन्यथा
|
||||
अरु
|
||||
अरुलाई
|
||||
अर्को
|
||||
अर्थात
|
||||
अर्थात्
|
||||
अलग
|
||||
आए
|
||||
आजको
|
||||
ओठ
|
||||
आत्म
|
||||
आफू
|
||||
आफूलाई
|
||||
आफ्नै
|
||||
आफ्नो
|
||||
आयो
|
||||
उदाहरण
|
||||
उनको
|
||||
उहालाई
|
||||
एउटै
|
||||
एक
|
||||
एकदम
|
||||
कतै
|
||||
कम से कम
|
||||
कसै
|
||||
कसैले
|
||||
कहाँबाट
|
||||
कहिलेकाहीं
|
||||
का
|
||||
किन
|
||||
किनभने
|
||||
कुनै
|
||||
कुरा
|
||||
कृपया
|
||||
केही
|
||||
कोही
|
||||
गए
|
||||
गरौं
|
||||
गर्छ
|
||||
गर्छु
|
||||
गर्नुपर्छ
|
||||
गयौ
|
||||
गैर
|
||||
चार
|
||||
चाहनुहुन्छ
|
||||
चाहन्छु
|
||||
चाहिए
|
||||
छू
|
||||
जताततै
|
||||
जब
|
||||
जबकि
|
||||
जसको
|
||||
जसबाट
|
||||
जसमा
|
||||
जसलाई
|
||||
जसले
|
||||
जस्तै
|
||||
जस्तो
|
||||
जस्तोसुकै
|
||||
जहाँ
|
||||
जान
|
||||
जाहिर
|
||||
जे
|
||||
जो
|
||||
ठीक
|
||||
तत्काल
|
||||
तदनुसार
|
||||
तपाईको
|
||||
तपाई
|
||||
पर्याप्त
|
||||
पहिले
|
||||
पहिलो
|
||||
पहिल्यै
|
||||
पाँच
|
||||
पाँचौं
|
||||
तल
|
||||
तापनी
|
||||
तिनी
|
||||
तिनीहरू
|
||||
तिनीहरुको
|
||||
तिनिहरुलाई
|
||||
तिमी
|
||||
तिर
|
||||
तीन
|
||||
तुरुन्तै
|
||||
तेस्रो
|
||||
तेस्कारण
|
||||
पूर्व
|
||||
प्रति
|
||||
प्रतेक
|
||||
प्लस
|
||||
फेरी
|
||||
बने
|
||||
त्सपछि
|
||||
त्सैले
|
||||
त्यहाँ
|
||||
थिएन
|
||||
दिनुभएको
|
||||
दिनुहुन्छ
|
||||
दुई
|
||||
देखि
|
||||
बरु
|
||||
बारे
|
||||
बाहिर
|
||||
देखिन्छ
|
||||
देखियो
|
||||
देखे
|
||||
देखेको
|
||||
देखेर
|
||||
दोस्रो
|
||||
धेरै
|
||||
नजिकै
|
||||
नत्र
|
||||
नयाँ
|
||||
निम्ति
|
||||
बाहेक
|
||||
बीच
|
||||
बीचमा
|
||||
भन
|
||||
निम्न
|
||||
निम्नानुसार
|
||||
निर्दिष्ट
|
||||
नौ
|
||||
पक्का
|
||||
पक्कै
|
||||
पछि
|
||||
पछिल्लो
|
||||
पटक
|
||||
पर्छ
|
||||
पर्थ्यो
|
||||
भन्छन्
|
||||
भन्
|
||||
भन्छु
|
||||
भन्दा
|
||||
भन्नुभयो
|
||||
भर
|
||||
भित्र
|
||||
भित्री
|
||||
म
|
||||
मलाई
|
||||
मात्र
|
||||
माथि
|
||||
मुख्य
|
||||
मेरो
|
||||
यति
|
||||
यथोचित
|
||||
यदि
|
||||
यद्यपि
|
||||
यसको
|
||||
यसपछि
|
||||
यसबाहेक
|
||||
यसरी
|
||||
यसो
|
||||
यस्तो
|
||||
यहाँ
|
||||
यहाँसम्म
|
||||
या
|
||||
रही
|
||||
राखे
|
||||
राख्छ
|
||||
राम्रो
|
||||
रूप
|
||||
लगभग
|
||||
वरीपरी
|
||||
वास्तवमा
|
||||
बिरुद्ध
|
||||
बिशेष
|
||||
सायद
|
||||
शायद
|
||||
संग
|
||||
संगै
|
||||
सक्छ
|
||||
सट्टा
|
||||
सधै
|
||||
सबै
|
||||
सबैलाई
|
||||
समय
|
||||
सम्भव
|
||||
सम्म
|
||||
सही
|
||||
साँच्चै
|
||||
सात
|
||||
साथ
|
||||
साथै
|
||||
सारा
|
||||
सोही
|
||||
स्पष्ट
|
||||
हरे
|
||||
हरेक
|
@ -0,0 +1,76 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.ne;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
|
||||
/** Tests the NepaliAnalyzer */
|
||||
public class TestNepaliAnalyzer extends BaseTokenStreamTestCase {
|
||||
/** This test fails with NPE when the stopwords file is missing from the classpath. */
|
||||
public void testResourcesAvailable() {
|
||||
new NepaliAnalyzer().close();
|
||||
}
|
||||
|
||||
/** test that snowball stemmer is hooked in correctly */
|
||||
public void testStemming() throws Exception {
|
||||
Analyzer a = new NepaliAnalyzer();
|
||||
// friend
|
||||
checkOneTerm(a, "मित्र", "मित्र");
|
||||
// friends
|
||||
checkOneTerm(a, "मित्रहरु", "मित्र");
|
||||
a.close();
|
||||
}
|
||||
|
||||
public void testStopwords() throws Exception {
|
||||
Analyzer a = new NepaliAnalyzer();
|
||||
assertAnalyzesTo(
|
||||
a,
|
||||
"सबै व्यक्तिहरू जन्मजात स्वतन्त्र हुन् ती सबैको समान अधिकार र महत्व",
|
||||
new String[] {"व्यक्ति", "जन्मजात", "स्वतन्त्र", "सबै", "समान", "अधिकार", "महत्व"});
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** Nepali has no case, but any Latin-1 etc. text should be case-folded. */
|
||||
public void testLowerCase() throws Exception {
|
||||
Analyzer a = new NepaliAnalyzer();
|
||||
checkOneTerm(a, "FIFA", "fifa");
|
||||
a.close();
|
||||
}
|
||||
|
||||
public void testExclusionSet() throws Exception {
|
||||
CharArraySet exclusionSet = new CharArraySet(asSet("मित्रहरु"), false);
|
||||
Analyzer a = new NepaliAnalyzer(NepaliAnalyzer.getDefaultStopSet(), exclusionSet);
|
||||
checkOneTerm(a, "मित्रहरु", "मित्रहरु");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** Test that Devanagari digits are folded to their Latin-1 (ASCII) equivalents. */
|
||||
public void testDigits() throws Exception {
|
||||
NepaliAnalyzer a = new NepaliAnalyzer();
|
||||
checkOneTerm(a, "१२३४", "1234");
|
||||
a.close();
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomStrings() throws Exception {
|
||||
Analyzer analyzer = new NepaliAnalyzer();
|
||||
checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER);
|
||||
analyzer.close();
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user