Move more token filters to analysis-common module

The following token filters were moved: arabic_normalization, german_normalization, hindi_normalization, indic_normalization, persian_normalization, scandinavian_normalization, serbian_normalization, sorani_normalization, cjk_width and cjk_bigram
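These factories are no longer registered in AnalysisModule itself; the analysis-common module now contributes them through the AnalysisPlugin#getTokenFilters extension point, as the CommonAnalysisPlugin hunk below shows. For reference, a minimal sketch of that extension point from a third-party plugin's perspective (MyAnalysisPlugin, MyWidthFilterFactory and the "my_width" name are illustrative, not part of this change):

package org.example.analysis; // hypothetical plugin, for illustration only

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {

    // Same shape as the moved factories below: a package-private constructor
    // matching AnalysisProvider#get, and create() wrapping the TokenStream.
    static class MyWidthFilterFactory extends AbstractTokenFilterFactory {
        MyWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
            super(indexSettings, name, settings);
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new CJKWidthFilter(tokenStream);
        }
    }

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        filters.put("my_width", MyWidthFilterFactory::new); // name -> constructor reference
        return filters;
    }
}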

Relates to #23658
Martijn van Groningen 2017-07-13 21:54:31 +02:00
parent e9dfb2a215
commit 8003171a0c
17 changed files with 413 additions and 67 deletions

AnalysisModule.java

@@ -31,15 +31,12 @@ import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
import org.elasticsearch.index.analysis.CJKBigramFilterFactory;
import org.elasticsearch.index.analysis.CJKWidthFilterFactory;
import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
@@ -62,14 +59,11 @@ import org.elasticsearch.index.analysis.FrenchAnalyzerProvider;
import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory;
import org.elasticsearch.index.analysis.GalicianAnalyzerProvider;
import org.elasticsearch.index.analysis.GermanAnalyzerProvider;
import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
import org.elasticsearch.index.analysis.GreekAnalyzerProvider;
import org.elasticsearch.index.analysis.HindiAnalyzerProvider;
import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
import org.elasticsearch.index.analysis.HungarianAnalyzerProvider;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
@@ -88,7 +82,6 @@ import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
import org.elasticsearch.index.analysis.PatternAnalyzerProvider;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -97,13 +90,10 @@ import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
import org.elasticsearch.index.analysis.SoraniAnalyzerProvider;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SpanishAnalyzerProvider;
import org.elasticsearch.index.analysis.StandardAnalyzerProvider;
import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
@@ -202,20 +192,10 @@ public final class AnalysisModule {
tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);
tokenFilters.register("indic_normalization", IndicNormalizationFilterFactory::new);
tokenFilters.register("sorani_normalization", SoraniNormalizationFilterFactory::new);
tokenFilters.register("persian_normalization", PersianNormalizationFilterFactory::new);
tokenFilters.register("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
tokenFilters.register("serbian_normalization", SerbianNormalizationFilterFactory::new);
tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory
(indexSettings, name, settings, hunspellService)));
tokenFilters.register("cjk_bigram", CJKBigramFilterFactory::new);
tokenFilters.register("cjk_width", CJKWidthFilterFactory::new);
tokenFilters.register("apostrophe", ApostropheFilterFactory::new);
tokenFilters.register("classic", ClassicFilterFactory::new);

ArabicNormalizationFilterFactory.java

@@ -16,17 +16,19 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

CJKBigramFilterFactory.java

@@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import java.util.Arrays;
import java.util.HashSet;
@@ -49,7 +50,7 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
private final int flags;
private final boolean outputUnigrams;
public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
outputUnigrams = settings.getAsBooleanLenientForPreEs6Indices(
indexSettings.getIndexVersionCreated(), "output_unigrams", false, deprecationLogger);

CJKWidthFilterFactory.java

@@ -17,17 +17,19 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}

CommonAnalysisPlugin.java

@@ -71,6 +71,7 @@ import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
@@ -118,6 +119,16 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
filters.put("german_normalization", GermanNormalizationFilterFactory::new);
filters.put("hindi_normalization", HindiNormalizationFilterFactory::new);
filters.put("indic_normalization", IndicNormalizationFilterFactory::new);
filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new);
filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
filters.put("cjk_width", CJKWidthFilterFactory::new);
filters.put("cjk_bigram", CJKBigramFilterFactory::new);
return filters;
}
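The contrast in this hunk is deliberate: common_grams, pattern_replace and pattern_capture are wrapped in requriesAnalysisSettings (the misspelling is the helper's actual name in this API) because they are unusable without per-index settings, while the moved normalization filters take no settings and are registered bare. A sketch of what such a wrapper plausibly does, assuming it mirrors the static helper of the same name on AnalysisPlugin:

import java.io.IOException;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;

final class ProviderWrappers {
    // Sketch: the only behavioral difference from the bare provider is that
    // requiresAnalysisSettings() answers true, so the registry refuses to build
    // the filter when it is referenced without explicit per-index settings.
    static <T> AnalysisProvider<T> requriesAnalysisSettings(AnalysisProvider<T> provider) {
        return new AnalysisProvider<T>() {
            @Override
            public T get(IndexSettings indexSettings, Environment environment,
                         String name, Settings settings) throws IOException {
                return provider.get(indexSettings, environment, name, settings);
            }

            @Override
            public boolean requiresAnalysisSettings() {
                return true;
            }
        };
    }
}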

GermanNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
/**
* Factory for {@link GermanNormalizationFilter}
*/
public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

HindiNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
/**
* Factory for {@link HindiNormalizationFilter}
*/
public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

IndicNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.in.IndicNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
/**
* Factory for {@link IndicNormalizationFilter}
*/
public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

PersianNormalizationFilterFactory.java

@@ -16,17 +16,19 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

ScandinavianNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
/**
* Factory for {@link ScandinavianNormalizationFilter}
*/
public class ScandinavianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public ScandinavianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
ScandinavianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

SerbianNormalizationFilterFactory.java

@@ -17,17 +17,19 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.sr.SerbianNormalizationFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

CJKFilterFactoryTests.java

@@ -17,23 +17,32 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.junit.Before;
import java.io.IOException;
import java.io.StringReader;
public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
private static final String RESOURCE = "/org/elasticsearch/index/analysis/cjk_analysis.json";
private static final String RESOURCE = "/org/elasticsearch/analysis/common/cjk_analysis.json";
private ESTestCase.TestAnalysis analysis;
@Before
public void setup() throws IOException {
analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE, new CommonAnalysisPlugin());
}
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
String source = "多くの学生が試験に落ちた。";
String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
@@ -43,7 +52,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
}
public void testNoFlags() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
String source = "多くの学生が試験に落ちた。";
String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
@@ -53,7 +61,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
}
public void testHanOnly() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
String source = "多くの学生が試験に落ちた。";
String[] expected = new String[]{"", "", "", "学生", "", "試験", "", "", "", "" };
@@ -63,7 +70,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
}
public void testHanUnigramOnly() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
String source = "多くの学生が試験に落ちた。";
String[] expected = new String[]{"", "", "", "", "学生", "", "", "", "試験", "", "", "", "", "" };
@@ -73,7 +79,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
}
public void testDisableGraph() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");

CommonAnalysisFactoryTests.java

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
@@ -106,6 +107,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
filters.put("commongramsquery", CommonGramsTokenFilterFactory.class);
filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
filters.put("germannormalization", GermanNormalizationFilterFactory.class);
filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
filters.put("persiannormalization", PersianNormalizationFilterFactory.class);
filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class);
filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);
filters.put("cjkwidth", CJKWidthFilterFactory.class);
filters.put("cjkbigram", CJKBigramFilterFactory.class);
return filters;
}

40_token_filters.yml

@@ -663,3 +663,333 @@
- match: { tokens.1.token: a_b }
- match: { tokens.2.token: b }
- match: { tokens.3.token: c }
---
"arabic_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_arabic_normalization:
type: arabic_normalization
- do:
indices.analyze:
index: test
body:
text: آجن
tokenizer: keyword
filter: [my_arabic_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: اجن }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: آجن
tokenizer: keyword
filter: [arabic_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: اجن }
---
"german_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_german_normalization:
type: german_normalization
- do:
indices.analyze:
index: test
body:
text: weißbier
tokenizer: keyword
filter: [my_german_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: weissbier }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: weißbier
tokenizer: keyword
filter: [german_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: weissbier }
---
"hindi_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_hindi_normalization:
type: hindi_normalization
- do:
indices.analyze:
index: test
body:
text: अँगरेज़ी
tokenizer: keyword
filter: [my_hindi_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: अंगरेजि }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: अँगरेज़ी
tokenizer: keyword
filter: [hindi_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: अंगरेजि }
---
"indic_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_indic_normalization:
type: indic_normalization
- do:
indices.analyze:
index: test
body:
text: ত্‍
tokenizer: keyword
filter: [my_indic_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: ৎ }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: ত্‍
tokenizer: keyword
filter: [indic_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: ৎ }
---
"persian_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_persian_normalization:
type: persian_normalization
- do:
indices.analyze:
index: test
body:
text: های
tokenizer: keyword
filter: [my_persian_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: هاي }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: های
tokenizer: keyword
filter: [persian_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: هاي }
---
"scandinavian_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_scandinavian_normalization:
type: scandinavian_normalization
- do:
indices.analyze:
index: test
body:
text: ö
tokenizer: keyword
filter: [my_scandinavian_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: ø }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: ö
tokenizer: keyword
filter: [scandinavian_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: ø }
---
"serbian_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_serbian_normalization:
type: serbian_normalization
- do:
indices.analyze:
index: test
body:
text: абвгдђежзијклљмнњопрстћуфхцчџш
tokenizer: keyword
filter: [my_serbian_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: abvgddjezzijklljmnnjoprstcufhccdzs }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: абвгдђежзијклљмнњопрстћуфхцчџш
tokenizer: keyword
filter: [serbian_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: abvgddjezzijklljmnnjoprstcufhccdzs }
---
"sorani_normalization":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_sorani_normalization:
type: sorani_normalization
- do:
indices.analyze:
index: test
body:
text: ي
tokenizer: keyword
filter: [my_sorani_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: ی }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: ي
tokenizer: keyword
filter: [sorani_normalization]
- length: { tokens: 1 }
- match: { tokens.0.token: ی }
---
"cjk_width":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_cjk_width:
type: cjk_width
- do:
indices.analyze:
index: test
body:
text: カタカナ
tokenizer: keyword
filter: [my_cjk_width]
- length: { tokens: 1 }
- match: { tokens.0.token: カタカナ }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: カタカナ
tokenizer: keyword
filter: [cjk_width]
- length: { tokens: 1 }
- match: { tokens.0.token: カタカナ }
---
"cjk_bigram":
- do:
indices.create:
index: test
body:
settings:
analysis:
filter:
my_cjk_bigram:
type: cjk_bigram
- do:
indices.analyze:
index: test
body:
text: 多くの学生が試験に落ちた
tokenizer: standard
filter: [my_cjk_bigram]
- length: { tokens: 11 }
- match: { tokens.0.token: 多く }
- match: { tokens.1.token: くの }
- match: { tokens.2.token: の学 }
- match: { tokens.3.token: 学生 }
- match: { tokens.4.token: 生が }
- match: { tokens.5.token: が試 }
- match: { tokens.6.token: 試験 }
- match: { tokens.7.token: 験に }
- match: { tokens.8.token: に落 }
- match: { tokens.9.token: 落ち }
- match: { tokens.10.token: ちた }
# Test pre-configured token filter too:
- do:
indices.analyze:
body:
text: 多くの学生が試験に落ちた
tokenizer: standard
filter: [cjk_bigram]
- length: { tokens: 11 }
- match: { tokens.0.token: 多く }
- match: { tokens.1.token: くの }
- match: { tokens.2.token: の学 }
- match: { tokens.3.token: 学生 }
- match: { tokens.4.token: 生が }
- match: { tokens.5.token: が試 }
- match: { tokens.6.token: 試験 }
- match: { tokens.7.token: 験に }
- match: { tokens.8.token: に落 }
- match: { tokens.9.token: 落ち }
- match: { tokens.10.token: ちた }

AnalysisTestsHelper.java

@@ -35,13 +35,15 @@ import java.util.Arrays;
public class AnalysisTestsHelper {
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(final Path baseDir, final String resource) throws IOException {
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(final Path baseDir,
final String resource,
final AnalysisPlugin... plugins) throws IOException {
final Settings settings = Settings.builder()
.loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
.put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
.build();
return createTestAnalysisFromSettings(settings);
return createTestAnalysisFromSettings(settings, plugins);
}
public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(

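This new varargs parameter is what lets CJKFilterFactoryTests above hand CommonAnalysisPlugin to the test analysis registry. A sketch of the resulting call pattern in a module test (the class name and assertion are illustrative; the resource path and filter name match the tests above):

package org.elasticsearch.analysis.common;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;

public class CjkWidthFactoryUsageTests extends ESTokenStreamTestCase {

    public void testHalfWidthToFullWidth() throws IOException {
        // Plugins passed here flow through to createTestAnalysisFromSettings,
        // so the module's filters resolve without a running node.
        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(
            createTempDir(), "/org/elasticsearch/analysis/common/cjk_analysis.json",
            new CommonAnalysisPlugin());
        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_width");
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("カタカナ"));
        // Half-width katakana should normalize to full-width.
        assertTokenStreamContents(tokenFilter.create(tokenizer), new String[]{"カタカナ"});
    }
}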
AnalysisFactoryTestCase.java

@@ -24,22 +24,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
import org.elasticsearch.index.analysis.CJKBigramFilterFactory;
import org.elasticsearch.index.analysis.CJKWidthFilterFactory;
import org.elasticsearch.index.analysis.ClassicFilterFactory;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
@@ -50,15 +44,11 @@ import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
@@ -131,13 +121,13 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
// exposed in ES
.put("apostrophe", ApostropheFilterFactory.class)
.put("arabicnormalization", ArabicNormalizationFilterFactory.class)
.put("arabicnormalization", MovedToAnalysisCommon.class)
.put("arabicstem", ArabicStemTokenFilterFactory.class)
.put("asciifolding", MovedToAnalysisCommon.class)
.put("brazilianstem", BrazilianStemTokenFilterFactory.class)
.put("bulgarianstem", MovedToAnalysisCommon.class)
.put("cjkbigram", CJKBigramFilterFactory.class)
.put("cjkwidth", CJKWidthFilterFactory.class)
.put("cjkbigram", MovedToAnalysisCommon.class)
.put("cjkwidth", MovedToAnalysisCommon.class)
.put("classic", ClassicFilterFactory.class)
.put("commongrams", MovedToAnalysisCommon.class)
.put("commongramsquery", MovedToAnalysisCommon.class)
@@ -157,15 +147,15 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("germanstem", GermanStemTokenFilterFactory.class)
.put("germanlightstem", MovedToAnalysisCommon.class)
.put("germanminimalstem", MovedToAnalysisCommon.class)
.put("germannormalization", GermanNormalizationFilterFactory.class)
.put("germannormalization", MovedToAnalysisCommon.class)
.put("greeklowercase", MovedToAnalysisCommon.class)
.put("greekstem", MovedToAnalysisCommon.class)
.put("hindinormalization", HindiNormalizationFilterFactory.class)
.put("hindinormalization", MovedToAnalysisCommon.class)
.put("hindistem", MovedToAnalysisCommon.class)
.put("hungarianlightstem", MovedToAnalysisCommon.class)
.put("hunspellstem", HunspellTokenFilterFactory.class)
.put("hyphenationcompoundword", MovedToAnalysisCommon.class)
.put("indicnormalization", IndicNormalizationFilterFactory.class)
.put("indicnormalization", MovedToAnalysisCommon.class)
.put("irishlowercase", MovedToAnalysisCommon.class)
.put("indonesianstem", MovedToAnalysisCommon.class)
.put("italianlightstem", MovedToAnalysisCommon.class)
@@ -181,7 +171,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("norwegianminimalstem", MovedToAnalysisCommon.class)
.put("patterncapturegroup", MovedToAnalysisCommon.class)
.put("patternreplace", MovedToAnalysisCommon.class)
.put("persiannormalization", PersianNormalizationFilterFactory.class)
.put("persiannormalization", MovedToAnalysisCommon.class)
.put("porterstem", MovedToAnalysisCommon.class)
.put("portuguesestem", MovedToAnalysisCommon.class)
.put("portugueselightstem", MovedToAnalysisCommon.class)
@@ -189,12 +179,12 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
.put("reversestring", MovedToAnalysisCommon.class)
.put("russianlightstem", MovedToAnalysisCommon.class)
.put("scandinavianfolding", ScandinavianFoldingFilterFactory.class)
.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class)
.put("serbiannormalization", SerbianNormalizationFilterFactory.class)
.put("scandinaviannormalization", MovedToAnalysisCommon.class)
.put("serbiannormalization", MovedToAnalysisCommon.class)
.put("shingle", ShingleTokenFilterFactory.class)
.put("minhash", MinHashTokenFilterFactory.class)
.put("snowballporter", MovedToAnalysisCommon.class)
.put("soraninormalization", SoraniNormalizationFilterFactory.class)
.put("soraninormalization", MovedToAnalysisCommon.class)
.put("soranistem", MovedToAnalysisCommon.class)
.put("spanishlightstem", MovedToAnalysisCommon.class)
.put("standard", StandardTokenFilterFactory.class)