Added Bengali Analyzer to Elasticsearch with respect to the Lucene update (PR #238)
This commit is contained in:
parent
a978ddf37b
commit
a40c474e10
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic, Persian, Romanian, Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.logging.log4j.Logger;
|
|||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
|
||||
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
|
||||
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
|
||||
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
|
||||
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
|
||||
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
|
||||
|
@ -119,6 +120,7 @@ public class Analysis {
|
|||
namedStopWords.put("_arabic_", ArabicAnalyzer.getDefaultStopSet());
|
||||
namedStopWords.put("_armenian_", ArmenianAnalyzer.getDefaultStopSet());
|
||||
namedStopWords.put("_basque_", BasqueAnalyzer.getDefaultStopSet());
|
||||
namedStopWords.put("_bengali_", BengaliAnalyzer.getDefaultStopSet());
|
||||
namedStopWords.put("_brazilian_", BrazilianAnalyzer.getDefaultStopSet());
|
||||
namedStopWords.put("_bulgarian_", BulgarianAnalyzer.getDefaultStopSet());
|
||||
namedStopWords.put("_catalan_", CatalanAnalyzer.getDefaultStopSet());
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
||||
public class BengaliAnalyzerProvider extends AbstractIndexAnalyzerProvider<BengaliAnalyzer> {
|
||||
|
||||
private final BengaliAnalyzer analyzer;
|
||||
|
||||
public BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
analyzer = new BengaliAnalyzer(
|
||||
Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
|
||||
Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
|
||||
);
|
||||
analyzer.setVersion(version);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BengaliAnalyzer get() {
|
||||
return this.analyzer;
|
||||
}
|
||||
}
|
|
@ -32,6 +32,7 @@ import org.elasticsearch.index.analysis.AnalyzerProvider;
|
|||
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.BengaliAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
|
||||
|
@ -270,6 +271,7 @@ public final class AnalysisModule {
|
|||
analyzers.register("arabic", ArabicAnalyzerProvider::new);
|
||||
analyzers.register("armenian", ArmenianAnalyzerProvider::new);
|
||||
analyzers.register("basque", BasqueAnalyzerProvider::new);
|
||||
analyzers.register("bengali", BengaliAnalyzerProvider::new);
|
||||
analyzers.register("brazilian", BrazilianAnalyzerProvider::new);
|
||||
analyzers.register("bulgarian", BulgarianAnalyzerProvider::new);
|
||||
analyzers.register("catalan", CatalanAnalyzerProvider::new);
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.ar.ArabicAnalyzer;
|
||||
import org.apache.lucene.analysis.bg.BulgarianAnalyzer;
|
||||
import org.apache.lucene.analysis.bn.BengaliAnalyzer;
|
||||
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
|
||||
import org.apache.lucene.analysis.ca.CatalanAnalyzer;
|
||||
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
|
||||
|
@ -183,6 +184,15 @@ public enum PreBuiltAnalyzers {
|
|||
}
|
||||
},
|
||||
|
||||
BENGALI {
|
||||
@Override
|
||||
protected Analyzer create(Version version) {
|
||||
Analyzer a = new BengaliAnalyzer();
|
||||
a.setVersion(version.luceneVersion);
|
||||
return a;
|
||||
}
|
||||
},
|
||||
|
||||
BRAZILIAN {
|
||||
@Override
|
||||
protected Analyzer create(Version version) {
|
||||
|
|
|
@ -6,6 +6,7 @@ following types are supported:
|
|||
<<arabic-analyzer,`arabic`>>,
|
||||
<<armenian-analyzer,`armenian`>>,
|
||||
<<basque-analyzer,`basque`>>,
|
||||
<<bengali-analyzer,`bengali`>>,
|
||||
<<brazilian-analyzer,`brazilian`>>,
|
||||
<<bulgarian-analyzer,`bulgarian`>>,
|
||||
<<catalan-analyzer,`catalan`>>,
|
||||
|
@ -55,7 +56,7 @@ functionality is implemented by adding the
|
|||
with the `keywords` set to the value of the `stem_exclusion` parameter.
|
||||
|
||||
The following analyzers support setting custom `stem_exclusion` list:
|
||||
`arabic`, `armenian`, `basque`, `bulgarian`, `catalan`, `czech`,
|
||||
`arabic`, `armenian`, `basque`, `bengali`, `bulgarian`, `catalan`, `czech`,
|
||||
`dutch`, `english`, `finnish`, `french`, `galician`,
|
||||
`german`, `hindi`, `hungarian`, `indonesian`, `irish`, `italian`, `latvian`,
|
||||
`lithuanian`, `norwegian`, `portuguese`, `romanian`, `russian`, `sorani`,
|
||||
|
@ -209,6 +210,54 @@ PUT /armenian_example
|
|||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[bengali-analyzer]]
|
||||
===== `bengali` analyzer
|
||||
|
||||
The `bengali` analyzer could be reimplemented as a `custom` analyzer as follows:
|
||||
|
||||
[source,js]
|
||||
----------------------------------------------------
|
||||
PUT /bengali_example
|
||||
{
|
||||
"settings": {
|
||||
"analysis": {
|
||||
"filter": {
|
||||
"bengali_stop": {
|
||||
"type": "stop",
|
||||
"stopwords": "_bengali_" <1>
|
||||
},
|
||||
"bengali_keywords": {
|
||||
"type": "keyword_marker",
|
||||
"keywords": ["উদাহরণ"] <2>
|
||||
},
|
||||
"bengali_stemmer": {
|
||||
"type": "stemmer",
|
||||
"language": "bengali"
|
||||
}
|
||||
},
|
||||
"analyzer": {
|
||||
"bengali": {
|
||||
"tokenizer": "standard",
|
||||
"filter": [
|
||||
"lowercase",
|
||||
"indic_normalization",
|
||||
"bengali_normalization",
|
||||
"bengali_stop",
|
||||
"bengali_keywords",
|
||||
"bengali_stemmer"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
----------------------------------------------------
|
||||
// CONSOLE
|
||||
<1> The default stopwords can be overridden with the `stopwords`
|
||||
or `stopwords_path` parameters.
|
||||
<2> This filter should be removed unless there are words which should
|
||||
be excluded from stemming.
|
||||
|
||||
[[brazilian-analyzer]]
|
||||
===== `brazilian` analyzer
|
||||
|
||||
|
|
|
@ -44,6 +44,10 @@ Basque::
|
|||
|
||||
http://snowball.tartarus.org/algorithms/basque/stemmer.html[*`basque`*]
|
||||
|
||||
Bengali::
|
||||
http://www.tandfonline.com/doi/abs/10.1080/02564602.1993.11437284[*`bengali`*]
|
||||
http://members.unine.ch/jacques.savoy/clef/BengaliStemmerLight.java.txt[*`light_bengali`*]
|
||||
|
||||
Brazilian Portuguese::
|
||||
|
||||
http://lucene.apache.org/core/4_9_0/analyzers-common/org/apache/lucene/analysis/br/BrazilianStemmer.html[*`brazilian`*]
|
||||
|
|
|
@ -71,7 +71,7 @@ PUT /my_index
|
|||
|
||||
Elasticsearch provides the following predefined list of languages:
|
||||
|
||||
`_arabic_`, `_armenian_`, `_basque_`, `_brazilian_`, `_bulgarian_`,
|
||||
`_arabic_`, `_armenian_`, `_basque_`, `_bengali_`, `_brazilian_`, `_bulgarian_`,
|
||||
`_catalan_`, `_czech_`, `_danish_`, `_dutch_`, `_english_`, `_finnish_`,
|
||||
`_french_`, `_galician_`, `_german_`, `_greek_`, `_hindi_`, `_hungarian_`,
|
||||
`_indonesian_`, `_irish_`, `_italian_`, `_latvian_`, `_norwegian_`, `_persian_`,
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||
|
||||
/**
|
||||
* Factory for {@link BengaliNormalizationFilter}
|
||||
*/
|
||||
public class BengaliNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
BengaliNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new BengaliNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.StopFilter;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||
import org.apache.lucene.analysis.bn.BengaliNormalizationFilter;
|
||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
|
||||
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
|
||||
|
@ -94,6 +95,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
|||
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
|
||||
filters.put("arabic_stem", ArabicStemTokenFilterFactory::new);
|
||||
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
|
||||
filters.put("bengali_normalization", BengaliNormalizationFilterFactory::new);
|
||||
filters.put("brazilian_stem", BrazilianStemTokenFilterFactory::new);
|
||||
filters.put("cjk_bigram", CJKBigramFilterFactory::new);
|
||||
filters.put("cjk_width", CJKWidthFilterFactory::new);
|
||||
|
@ -180,6 +182,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
|
|||
filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
|
||||
filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.analysis.common;
|
|||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.ar.ArabicStemFilter;
|
||||
import org.apache.lucene.analysis.bg.BulgarianStemFilter;
|
||||
import org.apache.lucene.analysis.bn.BengaliStemFilter;
|
||||
import org.apache.lucene.analysis.br.BrazilianStemFilter;
|
||||
import org.apache.lucene.analysis.ckb.SoraniStemFilter;
|
||||
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
||||
|
@ -102,6 +103,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
return new SnowballFilter(tokenStream, new ArmenianStemmer());
|
||||
} else if ("basque".equalsIgnoreCase(language)) {
|
||||
return new SnowballFilter(tokenStream, new BasqueStemmer());
|
||||
} else if ("bengali".equalsIgnoreCase(language)) {
|
||||
return new BengaliStemFilter(tokenStream);
|
||||
} else if ("brazilian".equalsIgnoreCase(language)) {
|
||||
return new BrazilianStemFilter(tokenStream);
|
||||
} else if ("bulgarian".equalsIgnoreCase(language)) {
|
||||
|
|
|
@ -67,6 +67,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
|||
filters.put("uppercase", UpperCaseTokenFilterFactory.class);
|
||||
filters.put("ngram", NGramTokenFilterFactory.class);
|
||||
filters.put("edgengram", EdgeNGramTokenFilterFactory.class);
|
||||
filters.put("bengalistem", StemmerTokenFilterFactory.class);
|
||||
filters.put("bulgarianstem", StemmerTokenFilterFactory.class);
|
||||
filters.put("englishminimalstem", StemmerTokenFilterFactory.class);
|
||||
filters.put("englishpossessive", StemmerTokenFilterFactory.class);
|
||||
|
@ -106,6 +107,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
|||
filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
|
||||
filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
|
||||
filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
|
||||
filters.put("bengalinormalization", BengaliNormalizationFilterFactory.class);
|
||||
filters.put("germannormalization", GermanNormalizationFilterFactory.class);
|
||||
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
|
||||
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
|
||||
|
@ -159,6 +161,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
|||
filters.put("arabic_normalization", null);
|
||||
filters.put("arabic_stem", null);
|
||||
filters.put("asciifolding", null);
|
||||
filters.put("bengali_normalization", null);
|
||||
filters.put("brazilian_stem", null);
|
||||
filters.put("cjk_bigram", null);
|
||||
filters.put("cjk_width", null);
|
||||
|
|
|
@ -695,6 +695,37 @@
|
|||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: اجن }
|
||||
|
||||
---
|
||||
"bengali_normalization":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test
|
||||
body:
|
||||
settings:
|
||||
analysis:
|
||||
filter:
|
||||
my_bengali_normalization:
|
||||
type: bengali_normalization
|
||||
- do:
|
||||
indices.analyze:
|
||||
index: test
|
||||
body:
|
||||
text: চাঁদ
|
||||
tokenizer: keyword
|
||||
filter: [my_bengali_normalization]
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: চাদ }
|
||||
|
||||
# Test pre-configured token filter too:
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: চাঁদ
|
||||
tokenizer: keyword
|
||||
filter: [bengali_normalization]
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: চাদ }
|
||||
|
||||
---
|
||||
"german_normalization":
|
||||
- do:
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -54,13 +54,14 @@ The KStem stemmer in
|
|||
was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
|
||||
under the BSD-license.
|
||||
|
||||
The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
|
||||
The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default
|
||||
stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt,
|
||||
analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt
|
||||
See http://members.unine.ch/jacques.savoy/clef/index.html.
|
||||
|
||||
The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
|
||||
|
|
|
@ -112,6 +112,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
|
|||
.put("arabicnormalization", MovedToAnalysisCommon.class)
|
||||
.put("arabicstem", MovedToAnalysisCommon.class)
|
||||
.put("asciifolding", MovedToAnalysisCommon.class)
|
||||
.put("bengalinormalization", MovedToAnalysisCommon.class)
|
||||
.put("bengalistem", MovedToAnalysisCommon.class)
|
||||
.put("brazilianstem", MovedToAnalysisCommon.class)
|
||||
.put("bulgarianstem", MovedToAnalysisCommon.class)
|
||||
.put("cjkbigram", MovedToAnalysisCommon.class)
|
||||
|
|
Loading…
Reference in New Issue