diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index bcdebc737a5..5eb33bd64c3 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -31,15 +31,12 @@ import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicAnalyzerProvider; -import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider; import org.elasticsearch.index.analysis.BasqueAnalyzerProvider; import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider; import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory; import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider; -import org.elasticsearch.index.analysis.CJKBigramFilterFactory; -import org.elasticsearch.index.analysis.CJKWidthFilterFactory; import org.elasticsearch.index.analysis.CatalanAnalyzerProvider; import org.elasticsearch.index.analysis.CharFilterFactory; import org.elasticsearch.index.analysis.ChineseAnalyzerProvider; @@ -62,14 +59,11 @@ import org.elasticsearch.index.analysis.FrenchAnalyzerProvider; import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory; import org.elasticsearch.index.analysis.GalicianAnalyzerProvider; import org.elasticsearch.index.analysis.GermanAnalyzerProvider; -import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory; import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory; import org.elasticsearch.index.analysis.GreekAnalyzerProvider; import org.elasticsearch.index.analysis.HindiAnalyzerProvider; -import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory; import org.elasticsearch.index.analysis.HungarianAnalyzerProvider; import org.elasticsearch.index.analysis.HunspellTokenFilterFactory; -import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory; import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider; import org.elasticsearch.index.analysis.IrishAnalyzerProvider; import org.elasticsearch.index.analysis.ItalianAnalyzerProvider; @@ -88,7 +82,6 @@ import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; import org.elasticsearch.index.analysis.PatternAnalyzerProvider; import org.elasticsearch.index.analysis.PatternTokenizerFactory; import org.elasticsearch.index.analysis.PersianAnalyzerProvider; -import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; @@ -97,13 +90,10 @@ import org.elasticsearch.index.analysis.RomanianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianAnalyzerProvider; import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory; import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; -import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; -import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; import org.elasticsearch.index.analysis.SimpleAnalyzerProvider; import org.elasticsearch.index.analysis.SnowballAnalyzerProvider; import org.elasticsearch.index.analysis.SoraniAnalyzerProvider; -import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.SpanishAnalyzerProvider; import org.elasticsearch.index.analysis.StandardAnalyzerProvider; import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider; @@ -202,20 +192,10 @@ public final class AnalysisModule { tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new); tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new); tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new); - tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new); - tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new); - tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new); - tokenFilters.register("indic_normalization", IndicNormalizationFilterFactory::new); - tokenFilters.register("sorani_normalization", SoraniNormalizationFilterFactory::new); - tokenFilters.register("persian_normalization", PersianNormalizationFilterFactory::new); - tokenFilters.register("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new); tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new); - tokenFilters.register("serbian_normalization", SerbianNormalizationFilterFactory::new); tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory (indexSettings, name, settings, hunspellService))); - tokenFilters.register("cjk_bigram", CJKBigramFilterFactory::new); - tokenFilters.register("cjk_width", CJKWidthFilterFactory::new); tokenFilters.register("apostrophe", ApostropheFilterFactory::new); tokenFilters.register("classic", ClassicFilterFactory::new); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ArabicNormalizationFilterFactory.java similarity index 82% rename from core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ArabicNormalizationFilterFactory.java index 15ed250e00c..f4e9e2cec34 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ArabicNormalizationFilterFactory.java @@ -16,17 +16,19 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java similarity index 94% rename from core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java index 368dc3f36d2..75323eac107 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/CJKBigramFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKBigramFilter; @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import java.util.Arrays; import java.util.HashSet; @@ -49,7 +50,7 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory { private final int flags; private final boolean outputUnigrams; - public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); outputUnigrams = settings.getAsBooleanLenientForPreEs6Indices( indexSettings.getIndexVersionCreated(), "output_unigrams", false, deprecationLogger); diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKWidthFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKWidthFilterFactory.java index 44cd11c81ed..02578a05f8a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKWidthFilterFactory.java @@ -17,17 +17,19 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cjk.CJKWidthFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java index d6a8ee2a006..5b4f76d3127 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -71,6 +71,7 @@ import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; +import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.analysis.TokenizerFactory; import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; @@ -118,6 +119,16 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new)); filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new)); + filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new); + filters.put("german_normalization", GermanNormalizationFilterFactory::new); + filters.put("hindi_normalization", HindiNormalizationFilterFactory::new); + filters.put("indic_normalization", IndicNormalizationFilterFactory::new); + filters.put("persian_normalization", PersianNormalizationFilterFactory::new); + filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new); + filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new); + filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new); + filters.put("cjk_width", CJKWidthFilterFactory::new); + filters.put("cjk_bigram", CJKBigramFilterFactory::new); return filters; } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/GermanNormalizationFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/GermanNormalizationFilterFactory.java index dcdcb4882e7..1af1e5faaa8 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/GermanNormalizationFilterFactory.java @@ -16,20 +16,22 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.de.GermanNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link GermanNormalizationFilter} */ public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HindiNormalizationFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HindiNormalizationFilterFactory.java index a957c3dd15c..b996d5971e5 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/HindiNormalizationFilterFactory.java @@ -16,20 +16,22 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.hi.HindiNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link HindiNormalizationFilter} */ public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/IndicNormalizationFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/IndicNormalizationFilterFactory.java index 67b51f02dba..f65c3897e6a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/IndicNormalizationFilterFactory.java @@ -16,20 +16,22 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.in.IndicNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link IndicNormalizationFilter} */ public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianNormalizationFilterFactory.java similarity index 82% rename from core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianNormalizationFilterFactory.java index 1a9644d611f..17239b52d97 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/PersianNormalizationFilterFactory.java @@ -16,17 +16,19 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.fa.PersianNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianNormalizationFilterFactory.java similarity index 83% rename from core/src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianNormalizationFilterFactory.java index 72c96dd0c29..332dd505e5d 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ScandinavianNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ScandinavianNormalizationFilterFactory.java @@ -16,20 +16,22 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; /** * Factory for {@link ScandinavianNormalizationFilter} */ public class ScandinavianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public ScandinavianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + ScandinavianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SerbianNormalizationFilterFactory.java similarity index 82% rename from core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SerbianNormalizationFilterFactory.java index d839a822cab..f6c3a4f55de 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/SerbianNormalizationFilterFactory.java @@ -17,17 +17,19 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.sr.SerbianNormalizationFilter; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CJKFilterFactoryTests.java similarity index 85% rename from core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CJKFilterFactoryTests.java index 846f70addaf..b89994bdc6e 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/CJKFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CJKFilterFactoryTests.java @@ -17,23 +17,32 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute; import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; +import org.junit.Before; import java.io.IOException; import java.io.StringReader; public class CJKFilterFactoryTests extends ESTokenStreamTestCase { - private static final String RESOURCE = "/org/elasticsearch/index/analysis/cjk_analysis.json"; + private static final String RESOURCE = "/org/elasticsearch/analysis/common/cjk_analysis.json"; + + private ESTestCase.TestAnalysis analysis; + + @Before + public void setup() throws IOException { + analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE, new CommonAnalysisPlugin()); + } public void testDefault() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram"); String source = "多くの学生が試験に落ちた。"; String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" }; @@ -43,7 +52,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase { } public void testNoFlags() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags"); String source = "多くの学生が試験に落ちた。"; String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" }; @@ -53,7 +61,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase { } public void testHanOnly() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only"); String source = "多くの学生が試験に落ちた。"; String[] expected = new String[]{"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" }; @@ -63,7 +70,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase { } public void testHanUnigramOnly() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only"); String source = "多くの学生が試験に落ちた。"; String[] expected = new String[]{"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" }; @@ -73,7 +79,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase { } public void testDisableGraph() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE); TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags"); TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only"); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java index e016219826b..47f37f3a0e5 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -26,6 +26,7 @@ import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.SynonymTokenFilterFactory; import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; @@ -106,6 +107,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { filters.put("commongramsquery", CommonGramsTokenFilterFactory.class); filters.put("patternreplace", PatternReplaceTokenFilterFactory.class); filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class); + filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class); + filters.put("germannormalization", GermanNormalizationFilterFactory.class); + filters.put("hindinormalization", HindiNormalizationFilterFactory.class); + filters.put("indicnormalization", IndicNormalizationFilterFactory.class); + filters.put("persiannormalization", PersianNormalizationFilterFactory.class); + filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class); + filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class); + filters.put("soraninormalization", SoraniNormalizationFilterFactory.class); + filters.put("cjkwidth", CJKWidthFilterFactory.class); + filters.put("cjkbigram", CJKBigramFilterFactory.class); return filters; } diff --git a/core/src/test/resources/org/elasticsearch/index/analysis/cjk_analysis.json b/modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/cjk_analysis.json similarity index 100% rename from core/src/test/resources/org/elasticsearch/index/analysis/cjk_analysis.json rename to modules/analysis-common/src/test/resources/org/elasticsearch/analysis/common/cjk_analysis.json diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml index 16c644338cb..1ec5060a257 100644 --- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yml @@ -663,3 +663,333 @@ - match: { tokens.1.token: a_b } - match: { tokens.2.token: b } - match: { tokens.3.token: c } + +--- +"arabic_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_arabic_normalization: + type: arabic_normalization + - do: + indices.analyze: + index: test + body: + text: آجن + tokenizer: keyword + filter: [my_arabic_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: اجن } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: آجن + tokenizer: keyword + filter: [arabic_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: اجن } + +--- +"german_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_german_normalization: + type: german_normalization + - do: + indices.analyze: + index: test + body: + text: weißbier + tokenizer: keyword + filter: [my_german_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: weissbier } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: weißbier + tokenizer: keyword + filter: [german_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: weissbier } + +--- +"hindi_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_hindi_normalization: + type: hindi_normalization + - do: + indices.analyze: + index: test + body: + text: अँगरेज़ी + tokenizer: keyword + filter: [my_hindi_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: अंगरेजि } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: अँगरेज़ी + tokenizer: keyword + filter: [hindi_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: अंगरेजि } + +--- +"indic_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_indic_normalization: + type: indic_normalization + - do: + indices.analyze: + index: test + body: + text: ত্‍ + tokenizer: keyword + filter: [my_indic_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: ৎ } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: ত্‍ + tokenizer: keyword + filter: [indic_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: ৎ } + +--- +"persian_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_persian_normalization: + type: persian_normalization + - do: + indices.analyze: + index: test + body: + text: های + tokenizer: keyword + filter: [my_persian_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: هاي } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: های + tokenizer: keyword + filter: [persian_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: هاي } + +--- +"scandinavian_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_scandinavian_normalization: + type: scandinavian_normalization + - do: + indices.analyze: + index: test + body: + text: ö + tokenizer: keyword + filter: [my_scandinavian_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: ø } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: ö + tokenizer: keyword + filter: [scandinavian_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: ø } + +--- +"serbian_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_serbian_normalization: + type: serbian_normalization + - do: + indices.analyze: + index: test + body: + text: абвгдђежзијклљмнњопрстћуфхцчџш + tokenizer: keyword + filter: [my_serbian_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: abvgddjezzijklljmnnjoprstcufhccdzs } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: абвгдђежзијклљмнњопрстћуфхцчџш + tokenizer: keyword + filter: [serbian_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: abvgddjezzijklljmnnjoprstcufhccdzs } + +--- +"sorani_normalization": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_sorani_normalization: + type: sorani_normalization + - do: + indices.analyze: + index: test + body: + text: ي + tokenizer: keyword + filter: [my_sorani_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: ی } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: ي + tokenizer: keyword + filter: [sorani_normalization] + - length: { tokens: 1 } + - match: { tokens.0.token: ی } + +--- +"cjk_width": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_cjk_width: + type: cjk_width + - do: + indices.analyze: + index: test + body: + text: カタカナ + tokenizer: keyword + filter: [my_cjk_width] + - length: { tokens: 1 } + - match: { tokens.0.token: カタカナ } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: カタカナ + tokenizer: keyword + filter: [cjk_width] + - length: { tokens: 1 } + - match: { tokens.0.token: カタカナ } + +--- +"cjk_bigram": + - do: + indices.create: + index: test + body: + settings: + analysis: + filter: + my_cjk_bigram: + type: cjk_bigram + - do: + indices.analyze: + index: test + body: + text: 多くの学生が試験に落ちた + tokenizer: standard + filter: [my_cjk_bigram] + - length: { tokens: 11 } + - match: { tokens.0.token: 多く } + - match: { tokens.1.token: くの } + - match: { tokens.2.token: の学 } + - match: { tokens.3.token: 学生 } + - match: { tokens.4.token: 生が } + - match: { tokens.5.token: が試 } + - match: { tokens.6.token: 試験 } + - match: { tokens.7.token: 験に } + - match: { tokens.8.token: に落 } + - match: { tokens.9.token: 落ち } + - match: { tokens.10.token: ちた } + + # Test pre-configured token filter too: + - do: + indices.analyze: + body: + text: 多くの学生が試験に落ちた + tokenizer: standard + filter: [cjk_bigram] + - length: { tokens: 11 } + - match: { tokens.0.token: 多く } + - match: { tokens.1.token: くの } + - match: { tokens.2.token: の学 } + - match: { tokens.3.token: 学生 } + - match: { tokens.4.token: 生が } + - match: { tokens.5.token: が試 } + - match: { tokens.6.token: 試験 } + - match: { tokens.7.token: 験に } + - match: { tokens.8.token: に落 } + - match: { tokens.9.token: 落ち } + - match: { tokens.10.token: ちた } diff --git a/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java b/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java index 5b99aed66b4..146f99ed17b 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java +++ b/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java @@ -35,13 +35,15 @@ import java.util.Arrays; public class AnalysisTestsHelper { - public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(final Path baseDir, final String resource) throws IOException { + public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(final Path baseDir, + final String resource, + final AnalysisPlugin... plugins) throws IOException { final Settings settings = Settings.builder() .loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource)) .put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString()) .build(); - return createTestAnalysisFromSettings(settings); + return createTestAnalysisFromSettings(settings, plugins); } public static ESTestCase.TestAnalysis createTestAnalysisFromSettings( diff --git a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java index 3d4d39795da..c9c214560a3 100644 --- a/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/indices/analysis/AnalysisFactoryTestCase.java @@ -24,22 +24,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.index.analysis.ApostropheFilterFactory; -import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory; -import org.elasticsearch.index.analysis.CJKBigramFilterFactory; -import org.elasticsearch.index.analysis.CJKWidthFilterFactory; import org.elasticsearch.index.analysis.ClassicFilterFactory; import org.elasticsearch.index.analysis.ClassicTokenizerFactory; import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory; import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; -import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory; import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory; -import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory; import org.elasticsearch.index.analysis.HunspellTokenFilterFactory; -import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory; import org.elasticsearch.index.analysis.KeepTypesFilterFactory; import org.elasticsearch.index.analysis.KeepWordFilterFactory; import org.elasticsearch.index.analysis.KeywordTokenizerFactory; @@ -50,15 +44,11 @@ import org.elasticsearch.index.analysis.MultiTermAwareComponent; import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; import org.elasticsearch.index.analysis.PatternTokenizerFactory; -import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; import org.elasticsearch.index.analysis.PreConfiguredCharFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenFilter; import org.elasticsearch.index.analysis.PreConfiguredTokenizer; import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; -import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; -import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; -import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; import org.elasticsearch.index.analysis.StandardTokenFilterFactory; import org.elasticsearch.index.analysis.StandardTokenizerFactory; import org.elasticsearch.index.analysis.StopTokenFilterFactory; @@ -131,13 +121,13 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { static final Map> KNOWN_TOKENFILTERS = new MapBuilder>() // exposed in ES .put("apostrophe", ApostropheFilterFactory.class) - .put("arabicnormalization", ArabicNormalizationFilterFactory.class) + .put("arabicnormalization", MovedToAnalysisCommon.class) .put("arabicstem", ArabicStemTokenFilterFactory.class) .put("asciifolding", MovedToAnalysisCommon.class) .put("brazilianstem", BrazilianStemTokenFilterFactory.class) .put("bulgarianstem", MovedToAnalysisCommon.class) - .put("cjkbigram", CJKBigramFilterFactory.class) - .put("cjkwidth", CJKWidthFilterFactory.class) + .put("cjkbigram", MovedToAnalysisCommon.class) + .put("cjkwidth", MovedToAnalysisCommon.class) .put("classic", ClassicFilterFactory.class) .put("commongrams", MovedToAnalysisCommon.class) .put("commongramsquery", MovedToAnalysisCommon.class) @@ -157,15 +147,15 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("germanstem", GermanStemTokenFilterFactory.class) .put("germanlightstem", MovedToAnalysisCommon.class) .put("germanminimalstem", MovedToAnalysisCommon.class) - .put("germannormalization", GermanNormalizationFilterFactory.class) + .put("germannormalization", MovedToAnalysisCommon.class) .put("greeklowercase", MovedToAnalysisCommon.class) .put("greekstem", MovedToAnalysisCommon.class) - .put("hindinormalization", HindiNormalizationFilterFactory.class) + .put("hindinormalization", MovedToAnalysisCommon.class) .put("hindistem", MovedToAnalysisCommon.class) .put("hungarianlightstem", MovedToAnalysisCommon.class) .put("hunspellstem", HunspellTokenFilterFactory.class) .put("hyphenationcompoundword", MovedToAnalysisCommon.class) - .put("indicnormalization", IndicNormalizationFilterFactory.class) + .put("indicnormalization", MovedToAnalysisCommon.class) .put("irishlowercase", MovedToAnalysisCommon.class) .put("indonesianstem", MovedToAnalysisCommon.class) .put("italianlightstem", MovedToAnalysisCommon.class) @@ -181,7 +171,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("norwegianminimalstem", MovedToAnalysisCommon.class) .put("patterncapturegroup", MovedToAnalysisCommon.class) .put("patternreplace", MovedToAnalysisCommon.class) - .put("persiannormalization", PersianNormalizationFilterFactory.class) + .put("persiannormalization", MovedToAnalysisCommon.class) .put("porterstem", MovedToAnalysisCommon.class) .put("portuguesestem", MovedToAnalysisCommon.class) .put("portugueselightstem", MovedToAnalysisCommon.class) @@ -189,12 +179,12 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase { .put("reversestring", MovedToAnalysisCommon.class) .put("russianlightstem", MovedToAnalysisCommon.class) .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class) - .put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class) - .put("serbiannormalization", SerbianNormalizationFilterFactory.class) + .put("scandinaviannormalization", MovedToAnalysisCommon.class) + .put("serbiannormalization", MovedToAnalysisCommon.class) .put("shingle", ShingleTokenFilterFactory.class) .put("minhash", MinHashTokenFilterFactory.class) .put("snowballporter", MovedToAnalysisCommon.class) - .put("soraninormalization", SoraniNormalizationFilterFactory.class) + .put("soraninormalization", MovedToAnalysisCommon.class) .put("soranistem", MovedToAnalysisCommon.class) .put("spanishlightstem", MovedToAnalysisCommon.class) .put("standard", StandardTokenFilterFactory.class)