From 7ba5bceebe40779961c1c56700bac01b1b0538fb Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 22 Jun 2016 18:31:39 +0200 Subject: [PATCH] Add a MultiTermAwareComponent marker interface to analysis factories. #19028 This is the same as what Lucene does for its analysis factories, and we have tests that make sure that the elasticsearch factories are in sync with Lucene's. This is a first step to move forward on #9978 and #18064. --- .../ASCIIFoldingTokenFilterFactory.java | 7 +- .../ArabicNormalizationFilterFactory.java | 7 +- .../index/analysis/CJKWidthFilterFactory.java | 7 +- .../analysis/DecimalDigitFilterFactory.java | 7 +- .../analysis/ElisionTokenFilterFactory.java | 7 +- .../GermanNormalizationFilterFactory.java | 6 +- .../HindiNormalizationFilterFactory.java | 6 +- .../IndicNormalizationFilterFactory.java | 6 +- .../analysis/LowerCaseTokenFilterFactory.java | 7 +- .../analysis/LowerCaseTokenizerFactory.java | 7 +- .../analysis/MappingCharFilterFactory.java | 7 +- .../analysis/MultiTermAwareComponent.java | 30 ++ .../PersianNormalizationFilterFactory.java | 6 +- .../SerbianNormalizationFilterFactory.java | 7 +- .../SoraniNormalizationFilterFactory.java | 7 +- .../analysis/UpperCaseTokenFilterFactory.java | 7 +- .../index/analysis/AnalysisFactoryTests.java | 190 +--------- .../IcuFoldingTokenFilterFactory.java | 7 +- .../IcuNormalizerCharFilterFactory.java | 7 +- .../IcuNormalizerTokenFilterFactory.java | 7 +- .../IcuTransformTokenFilterFactory.java | 7 +- .../analysis/AnalysisICUFactoryTests.java | 52 +++ ...uromojiIterationMarkCharFilterFactory.java | 7 +- .../AnalysisKuromojiFactoryTests.java | 54 +++ .../AnalysisPhoneticFactoryTests.java | 37 ++ .../AnalysisSmartChineseFactoryTests.java | 36 ++ .../analysis/AnalysisPolishFactoryTests.java | 37 ++ .../AnalysisFactoryTestCase.java | 325 ++++++++++++++++++ 28 files changed, 690 insertions(+), 207 deletions(-) create mode 100644 
core/src/main/java/org/elasticsearch/index/analysis/MultiTermAwareComponent.java create mode 100644 plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java create mode 100644 plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java create mode 100644 plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java create mode 100644 plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java create mode 100644 plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java create mode 100644 test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java index 2e6ba6377a9..b7417b26374 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java @@ -29,7 +29,7 @@ import org.elasticsearch.index.IndexSettings; /** * Factory for ASCIIFoldingFilter. 
*/ -public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory { +public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original"); public static boolean DEFAULT_PRESERVE_ORIGINAL = false; @@ -44,4 +44,9 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return new ASCIIFoldingFilter(tokenStream, preserveOriginal); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java index 7c274a4ef8c..265e050efee 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/ArabicNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * */ -public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -37,4 +37,9 @@ public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory public TokenStream create(TokenStream tokenStream) { return new ArabicNormalizationFilter(tokenStream); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java index d7b6ab02511..44cd11c81ed 100644 --- 
a/core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/CJKWidthFilterFactory.java @@ -25,7 +25,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; -public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory { +public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); @@ -36,4 +36,9 @@ public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory { return new CJKWidthFilter(tokenStream); } + @Override + public Object getMultiTermComponent() { + return this; + } + } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java index a9ca009e373..b4806ab7073 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/DecimalDigitFilterFactory.java @@ -28,7 +28,7 @@ import org.elasticsearch.index.IndexSettings; /** * Factory for {@link DecimalDigitFilter} */ -public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory { +public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public DecimalDigitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,9 @@ public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory public TokenStream create(TokenStream tokenStream) { return new DecimalDigitFilter(tokenStream); } + + @Override + public Object getMultiTermComponent() { + 
return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java index 4ca0e399f18..2291e199b3a 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/ElisionTokenFilterFactory.java @@ -29,7 +29,7 @@ import org.elasticsearch.index.IndexSettings; /** * */ -public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory { +public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { private final CharArraySet articles; @@ -42,4 +42,9 @@ public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return new ElisionFilter(tokenStream, articles); } + + @Override + public Object getMultiTermComponent() { + return this; + } } \ No newline at end of file diff --git a/core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java index 840a70ba265..dcdcb4882e7 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/GermanNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * Factory for {@link GermanNormalizationFilter} */ -public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,8 @@ public class GermanNormalizationFilterFactory 
extends AbstractTokenFilterFactory return new GermanNormalizationFilter(tokenStream); } + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java index 80c772c5010..a957c3dd15c 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/HindiNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * Factory for {@link HindiNormalizationFilter} */ -public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,8 @@ public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory return new HindiNormalizationFilter(tokenStream); } + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java index f0c82500ecc..67b51f02dba 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/IndicNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * Factory for {@link IndicNormalizationFilter} */ -public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory 
implements MultiTermAwareComponent { public IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,8 @@ public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory return new IndicNormalizationFilter(tokenStream); } + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java index e608e53c089..dcad6960ba4 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenFilterFactory.java @@ -37,7 +37,7 @@ import org.elasticsearch.index.IndexSettings; *
  • turkish: {@link TurkishLowerCaseFilter} * */ -public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory { +public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { private final String lang; @@ -60,6 +60,11 @@ public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory { throw new IllegalArgumentException("language [" + lang + "] not support for lower case"); } } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenizerFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenizerFactory.java index 3de1c893141..961307f7015 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenizerFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/LowerCaseTokenizerFactory.java @@ -28,7 +28,7 @@ import org.elasticsearch.index.IndexSettings; /** * */ -public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory { +public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent { public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,9 @@ public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory { public Tokenizer create() { return new LowerCaseTokenizer(); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/MappingCharFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/MappingCharFilterFactory.java index 4efce1935e1..c5a4e4bbdcc 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/MappingCharFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/MappingCharFilterFactory.java @@ -30,7 +30,7 @@ 
import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; -public class MappingCharFilterFactory extends AbstractCharFilterFactory { +public class MappingCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { private final NormalizeCharMap normMap; @@ -114,4 +114,9 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory { } return new String(out, 0, writePos); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/MultiTermAwareComponent.java b/core/src/main/java/org/elasticsearch/index/analysis/MultiTermAwareComponent.java new file mode 100644 index 00000000000..0662352e79c --- /dev/null +++ b/core/src/main/java/org/elasticsearch/index/analysis/MultiTermAwareComponent.java @@ -0,0 +1,30 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +/** Elasticsearch counterpart of {@link org.apache.lucene.analysis.util.MultiTermAwareComponent}. */ +public interface MultiTermAwareComponent { + + /** Returns an analysis component to handle analysis of multi-term queries. 
+ * The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory. + */ + public Object getMultiTermComponent(); + +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java index 94538260000..4f5751d985d 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/PersianNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * */ -public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,8 @@ public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactor return new PersianNormalizationFilter(tokenStream); } + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java index 41f571922dd..8fc6052247c 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/SerbianNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * */ -public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public 
SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -37,4 +37,9 @@ public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactor public TokenStream create(TokenStream tokenStream) { return new SerbianNormalizationFilter(tokenStream); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java index e5a67f241ee..afe2f51ddd8 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/SoraniNormalizationFilterFactory.java @@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings; /** * Factory for {@link SoraniNormalizationFilter} */ -public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory { +public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public SoraniNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,4 +38,9 @@ public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory return new SoraniNormalizationFilter(tokenStream); } + @Override + public Object getMultiTermComponent() { + return this; + } + } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java index 0784c3b5db5..c2074cb9393 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/UpperCaseTokenFilterFactory.java @@ -28,7 
+28,7 @@ import org.elasticsearch.index.IndexSettings; /** * */ -public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory { +public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { public UpperCaseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { super(indexSettings, name, settings); @@ -38,6 +38,11 @@ public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return new UpperCaseFilter(tokenStream); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java index afb34cda8a0..6893fda75b8 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java @@ -19,192 +19,8 @@ package org.elasticsearch.index.analysis; -import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; -import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; -import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.AnalysisFactoryTestCase; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -/** - * Alerts us if new analyzers are added to lucene, so we don't miss them. - *

    - * If we don't want to expose one for a specific reason, just map it to Void - */ -public class AnalysisFactoryTests extends ESTestCase { - - static final Map> KNOWN_TOKENIZERS = new HashMap>() {{ - // deprecated ones, we dont care about these - put("arabicletter", Deprecated.class); - put("chinese", Deprecated.class); - put("cjk", Deprecated.class); - put("russianletter", Deprecated.class); - - // exposed in ES - put("classic", ClassicTokenizerFactory.class); - put("edgengram", EdgeNGramTokenizerFactory.class); - put("keyword", KeywordTokenizerFactory.class); - put("letter", LetterTokenizerFactory.class); - put("lowercase", LowerCaseTokenizerFactory.class); - put("ngram", NGramTokenizerFactory.class); - put("pathhierarchy", PathHierarchyTokenizerFactory.class); - put("pattern", PatternTokenizerFactory.class); - put("standard", StandardTokenizerFactory.class); - put("thai", ThaiTokenizerFactory.class); - put("uax29urlemail", UAX29URLEmailTokenizerFactory.class); - put("whitespace", WhitespaceTokenizerFactory.class); - - // this one "seems to mess up offsets". probably shouldn't be a tokenizer... 
- put("wikipedia", Void.class); - }}; - - public void testTokenizers() { - Set missing = new TreeSet(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers()); - missing.removeAll(KNOWN_TOKENIZERS.keySet()); - assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty()); - } - - static final Map> KNOWN_TOKENFILTERS = new HashMap>() {{ - // deprecated ones, we dont care about these - put("chinese", Deprecated.class); - put("collationkey", Deprecated.class); - put("position", Deprecated.class); - put("thaiword", Deprecated.class); - - - // exposed in ES - put("apostrophe", ApostropheFilterFactory.class); - put("arabicnormalization", ArabicNormalizationFilterFactory.class); - put("arabicstem", ArabicStemTokenFilterFactory.class); - put("asciifolding", ASCIIFoldingTokenFilterFactory.class); - put("brazilianstem", BrazilianStemTokenFilterFactory.class); - put("bulgarianstem", StemmerTokenFilterFactory.class); - put("cjkbigram", CJKBigramFilterFactory.class); - put("cjkwidth", CJKWidthFilterFactory.class); - put("classic", ClassicFilterFactory.class); - put("commongrams", CommonGramsTokenFilterFactory.class); - put("commongramsquery", CommonGramsTokenFilterFactory.class); - put("czechstem", CzechStemTokenFilterFactory.class); - put("decimaldigit", DecimalDigitFilterFactory.class); - put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class); - put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class); - put("edgengram", EdgeNGramTokenFilterFactory.class); - put("elision", ElisionTokenFilterFactory.class); - put("englishminimalstem", StemmerTokenFilterFactory.class); - put("englishpossessive", StemmerTokenFilterFactory.class); - put("finnishlightstem", StemmerTokenFilterFactory.class); - put("frenchlightstem", StemmerTokenFilterFactory.class); - put("frenchminimalstem", StemmerTokenFilterFactory.class); - put("galicianminimalstem", StemmerTokenFilterFactory.class); - 
put("galicianstem", StemmerTokenFilterFactory.class); - put("germanstem", GermanStemTokenFilterFactory.class); - put("germanlightstem", StemmerTokenFilterFactory.class); - put("germanminimalstem", StemmerTokenFilterFactory.class); - put("germannormalization", GermanNormalizationFilterFactory.class); - put("greeklowercase", LowerCaseTokenFilterFactory.class); - put("greekstem", StemmerTokenFilterFactory.class); - put("hindinormalization", HindiNormalizationFilterFactory.class); - put("hindistem", StemmerTokenFilterFactory.class); - put("hungarianlightstem", StemmerTokenFilterFactory.class); - put("hunspellstem", HunspellTokenFilterFactory.class); - put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class); - put("indicnormalization", IndicNormalizationFilterFactory.class); - put("irishlowercase", LowerCaseTokenFilterFactory.class); - put("indonesianstem", StemmerTokenFilterFactory.class); - put("italianlightstem", StemmerTokenFilterFactory.class); - put("keepword", KeepWordFilterFactory.class); - put("keywordmarker", KeywordMarkerTokenFilterFactory.class); - put("kstem", KStemTokenFilterFactory.class); - put("latvianstem", StemmerTokenFilterFactory.class); - put("length", LengthTokenFilterFactory.class); - put("limittokencount", LimitTokenCountFilterFactory.class); - put("lowercase", LowerCaseTokenFilterFactory.class); - put("ngram", NGramTokenFilterFactory.class); - put("norwegianlightstem", StemmerTokenFilterFactory.class); - put("norwegianminimalstem", StemmerTokenFilterFactory.class); - put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class); - put("patternreplace", PatternReplaceTokenFilterFactory.class); - put("persiannormalization", PersianNormalizationFilterFactory.class); - put("porterstem", PorterStemTokenFilterFactory.class); - put("portuguesestem", StemmerTokenFilterFactory.class); - put("portugueselightstem", StemmerTokenFilterFactory.class); - put("portugueseminimalstem", StemmerTokenFilterFactory.class); - 
put("reversestring", ReverseTokenFilterFactory.class); - put("russianlightstem", StemmerTokenFilterFactory.class); - put("scandinavianfolding", ScandinavianFoldingFilterFactory.class); - put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class); - put("serbiannormalization", SerbianNormalizationFilterFactory.class); - put("shingle", ShingleTokenFilterFactory.class); - put("snowballporter", SnowballTokenFilterFactory.class); - put("soraninormalization", SoraniNormalizationFilterFactory.class); - put("soranistem", StemmerTokenFilterFactory.class); - put("spanishlightstem", StemmerTokenFilterFactory.class); - put("standard", StandardTokenFilterFactory.class); - put("stemmeroverride", StemmerOverrideTokenFilterFactory.class); - put("stop", StopTokenFilterFactory.class); - put("swedishlightstem", StemmerTokenFilterFactory.class); - put("synonym", SynonymTokenFilterFactory.class); - put("trim", TrimTokenFilterFactory.class); - put("truncate", TruncateTokenFilterFactory.class); - put("turkishlowercase", LowerCaseTokenFilterFactory.class); - put("type", KeepTypesFilterFactory.class); - put("uppercase", UpperCaseTokenFilterFactory.class); - put("worddelimiter", WordDelimiterTokenFilterFactory.class); - - // TODO: these tokenfilters are not yet exposed: useful? - - // suggest stop - put("suggeststop", Void.class); - // capitalizes tokens - put("capitalization", Void.class); - // like length filter (but codepoints) - put("codepointcount", Void.class); - // puts hyphenated words back together - put("hyphenatedwords", Void.class); - // repeats anything marked as keyword - put("keywordrepeat", Void.class); - // like limittokencount, but by offset - put("limittokenoffset", Void.class); - // like limittokencount, but by position - put("limittokenposition", Void.class); - // ??? - put("numericpayload", Void.class); - // removes duplicates at the same position (this should be used by the existing factory) - put("removeduplicates", Void.class); - // ??? 
- put("tokenoffsetpayload", Void.class); - // puts the type into the payload - put("typeaspayload", Void.class); - // fingerprint - put("fingerprint", Void.class); - // for tee-sinks - put("daterecognizer", Void.class); - }}; - - public void testTokenFilters() { - Set missing = new TreeSet(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters()); - missing.removeAll(KNOWN_TOKENFILTERS.keySet()); - assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(), missing.isEmpty()); - } - - static final Map> KNOWN_CHARFILTERS = new HashMap>() {{ - // exposed in ES - put("htmlstrip", HtmlStripCharFilterFactory.class); - put("mapping", MappingCharFilterFactory.class); - put("patternreplace", PatternReplaceCharFilterFactory.class); - - // TODO: these charfilters are not yet exposed: useful? - // handling of zwnj for persian - put("persian", Void.class); - }}; - - public void testCharFilters() { - Set missing = new TreeSet(org.apache.lucene.analysis.util.CharFilterFactory.availableCharFilters()); - missing.removeAll(KNOWN_CHARFILTERS.keySet()); - assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty()); - } - - +public class AnalysisFactoryTests extends AnalysisFactoryTestCase { + // tests are inherited } diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java index b31502cdd7d..5fd3199e99a 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuFoldingTokenFilterFactory.java @@ -40,7 +40,7 @@ import org.elasticsearch.index.IndexSettings; * * @author kimchy (shay.banon) */ -public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { +public class 
IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { private final String unicodeSetFilter; public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { @@ -67,4 +67,9 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory { return new ICUFoldingFilter(tokenStream); } } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java index 02f9b5a3371..72bc45a0232 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerCharFilterFactory.java @@ -34,7 +34,7 @@ import java.io.Reader; *

    The name can be used to provide the type of normalization to perform.

    *

    The mode can be used to provide 'compose' or 'decompose'. Default is compose.

    */ -public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory { +public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { private final String name; @@ -55,4 +55,9 @@ public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory { public Reader create(Reader reader) { return new ICUNormalizer2CharFilter(reader, normalizer); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java index 4833e887153..2632958d203 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuNormalizerTokenFilterFactory.java @@ -32,7 +32,7 @@ import org.elasticsearch.index.IndexSettings; * * */ -public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory { +public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { private final String name; @@ -45,4 +45,9 @@ public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory public TokenStream create(TokenStream tokenStream) { return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE)); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java index f145ad4ae30..9cc42e726a5 100644 --- 
a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/analysis/IcuTransformTokenFilterFactory.java @@ -29,7 +29,7 @@ import org.elasticsearch.index.IndexSettings; /** */ -public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { +public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { private final String id; private final int dir; @@ -47,4 +47,9 @@ public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { public TokenStream create(TokenStream tokenStream) { return new ICUTransformFilter(tokenStream, transliterator); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java new file mode 100644 index 00000000000..704ca61985a --- /dev/null +++ b/plugins/analysis-icu/src/test/java/org/elasticsearch/index/analysis/AnalysisICUFactoryTests.java @@ -0,0 +1,52 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.AnalysisFactoryTestCase; + +import java.util.HashMap; +import java.util.Map; + +public class AnalysisICUFactoryTests extends AnalysisFactoryTestCase { + + @Override + protected Map> getTokenizers() { + Map> tokenizers = new HashMap<>(super.getTokenizers()); + tokenizers.put("icu", IcuTokenizerFactory.class); + return tokenizers; + } + + @Override + protected Map> getTokenFilters() { + Map> filters = new HashMap<>(super.getTokenFilters()); + filters.put("icufolding", IcuFoldingTokenFilterFactory.class); + filters.put("icunormalizer2", IcuNormalizerTokenFilterFactory.class); + filters.put("icutransform", IcuTransformTokenFilterFactory.class); + return filters; + } + + @Override + protected Map> getCharFilters() { + Map> filters = new HashMap<>(super.getCharFilters()); + filters.put("icunormalizer2", IcuNormalizerCharFilterFactory.class); + return filters; + } + +} diff --git a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java index a1220dba2be..836dbbdfae2 100644 --- a/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java +++ b/plugins/analysis-kuromoji/src/main/java/org/elasticsearch/index/analysis/KuromojiIterationMarkCharFilterFactory.java @@ -26,7 +26,7 @@ import org.elasticsearch.index.IndexSettings; import java.io.Reader; -public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory { +public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { private final boolean normalizeKanji; private final boolean normalizeKana; @@ -41,4 +41,9 @@ public class 
KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFa public Reader create(Reader reader) { return new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana); } + + @Override + public Object getMultiTermComponent() { + return this; + } } diff --git a/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java new file mode 100644 index 00000000000..9db7def101e --- /dev/null +++ b/plugins/analysis-kuromoji/src/test/java/org/elasticsearch/index/analysis/AnalysisKuromojiFactoryTests.java @@ -0,0 +1,54 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory; +import org.elasticsearch.AnalysisFactoryTestCase; + +import java.util.HashMap; +import java.util.Map; + +public class AnalysisKuromojiFactoryTests extends AnalysisFactoryTestCase { + + @Override + protected Map> getTokenizers() { + Map> tokenizers = new HashMap<>(super.getTokenizers()); + tokenizers.put("japanese", JapaneseTokenizerFactory.class); + return tokenizers; + } + + @Override + protected Map> getTokenFilters() { + Map> filters = new HashMap<>(super.getTokenFilters()); + filters.put("japanesebaseform", KuromojiBaseFormFilterFactory.class); + filters.put("japanesepartofspeechstop", KuromojiPartOfSpeechFilterFactory.class); + filters.put("japanesereadingform", KuromojiReadingFormFilterFactory.class); + filters.put("japanesekatakanastem", KuromojiKatakanaStemmerFactory.class); + filters.put("japanesenumber", KuromojiNumberFilterFactory.class); + return filters; + } + + @Override + protected Map> getCharFilters() { + Map> filters = new HashMap<>(super.getCharFilters()); + filters.put("japaneseiterationmark", KuromojiIterationMarkCharFilterFactory.class); + return filters; + } +} diff --git a/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java new file mode 100644 index 00000000000..0546fb468c9 --- /dev/null +++ b/plugins/analysis-phonetic/src/test/java/org/elasticsearch/index/analysis/AnalysisPhoneticFactoryTests.java @@ -0,0 +1,37 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.AnalysisFactoryTestCase; + +import java.util.HashMap; +import java.util.Map; + +public class AnalysisPhoneticFactoryTests extends AnalysisFactoryTestCase { + + @Override + protected Map> getTokenFilters() { + Map> filters = new HashMap<>(super.getTokenFilters()); + filters.put("beidermorse", PhoneticTokenFilterFactory.class); + filters.put("doublemetaphone", PhoneticTokenFilterFactory.class); + filters.put("phonetic", PhoneticTokenFilterFactory.class); + return filters; + } +} diff --git a/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java b/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java new file mode 100644 index 00000000000..d8aad322dcb --- /dev/null +++ b/plugins/analysis-smartcn/src/test/java/org/elasticsearch/index/analysis/AnalysisSmartChineseFactoryTests.java @@ -0,0 +1,36 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.AnalysisFactoryTestCase; + +import java.util.HashMap; +import java.util.Map; + +public class AnalysisSmartChineseFactoryTests extends AnalysisFactoryTestCase { + + @Override + protected Map> getTokenizers() { + Map> tokenizers = new HashMap<>(super.getTokenizers()); + tokenizers.put("hmmchinese", SmartChineseTokenizerTokenizerFactory.class); + return tokenizers; + } + +} diff --git a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java new file mode 100644 index 00000000000..abf739d010a --- /dev/null +++ b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java @@ -0,0 +1,37 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.elasticsearch.AnalysisFactoryTestCase; +import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory; + +import java.util.HashMap; +import java.util.Map; + +public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase { + + @Override + protected Map> getTokenFilters() { + Map> filters = new HashMap<>(super.getTokenFilters()); + filters.put("stempelpolishstem", PolishStemTokenFilterFactory.class); + return filters; + } + +} diff --git a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java new file mode 100644 index 00000000000..35ef0868eba --- /dev/null +++ b/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java @@ -0,0 +1,325 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch; + +import org.elasticsearch.common.collect.MapBuilder; +import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory; +import org.elasticsearch.index.analysis.ApostropheFilterFactory; +import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; +import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; +import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory; +import org.elasticsearch.index.analysis.CJKBigramFilterFactory; +import org.elasticsearch.index.analysis.CJKWidthFilterFactory; +import org.elasticsearch.index.analysis.ClassicFilterFactory; +import org.elasticsearch.index.analysis.ClassicTokenizerFactory; +import org.elasticsearch.index.analysis.CommonGramsTokenFilterFactory; +import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory; +import org.elasticsearch.index.analysis.DecimalDigitFilterFactory; +import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory; +import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory; +import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; +import org.elasticsearch.index.analysis.ElisionTokenFilterFactory; +import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory; +import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory; +import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory; +import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory; +import org.elasticsearch.index.analysis.HunspellTokenFilterFactory; +import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory; +import org.elasticsearch.index.analysis.KStemTokenFilterFactory; +import org.elasticsearch.index.analysis.KeepTypesFilterFactory; +import org.elasticsearch.index.analysis.KeepWordFilterFactory; +import org.elasticsearch.index.analysis.KeywordMarkerTokenFilterFactory; +import org.elasticsearch.index.analysis.KeywordTokenizerFactory; +import 
org.elasticsearch.index.analysis.LengthTokenFilterFactory; +import org.elasticsearch.index.analysis.LetterTokenizerFactory; +import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory; +import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory; +import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory; +import org.elasticsearch.index.analysis.MappingCharFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; +import org.elasticsearch.index.analysis.NGramTokenFilterFactory; +import org.elasticsearch.index.analysis.NGramTokenizerFactory; +import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory; +import org.elasticsearch.index.analysis.PatternCaptureGroupTokenFilterFactory; +import org.elasticsearch.index.analysis.PatternReplaceCharFilterFactory; +import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory; +import org.elasticsearch.index.analysis.PatternTokenizerFactory; +import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory; +import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory; +import org.elasticsearch.index.analysis.ReverseTokenFilterFactory; +import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory; +import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory; +import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory; +import org.elasticsearch.index.analysis.ShingleTokenFilterFactory; +import org.elasticsearch.index.analysis.SnowballTokenFilterFactory; +import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory; +import org.elasticsearch.index.analysis.StandardTokenFilterFactory; +import org.elasticsearch.index.analysis.StandardTokenizerFactory; +import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory; +import org.elasticsearch.index.analysis.StemmerTokenFilterFactory; +import org.elasticsearch.index.analysis.StopTokenFilterFactory; +import 
org.elasticsearch.index.analysis.SynonymTokenFilterFactory; +import org.elasticsearch.index.analysis.ThaiTokenizerFactory; +import org.elasticsearch.index.analysis.TrimTokenFilterFactory; +import org.elasticsearch.index.analysis.TruncateTokenFilterFactory; +import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory; +import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; +import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; +import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; +import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; +import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; +import org.elasticsearch.test.ESTestCase; + +import java.util.Collection; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +/** + * Alerts us if new analyzers are added to lucene, so we don't miss them. + *

    + * If we don't want to expose one for a specific reason, just map it to Void. + * The deprecated ones can be mapped to Deprecated.class. + */ +public class AnalysisFactoryTestCase extends ESTestCase { + + static final Map> KNOWN_TOKENIZERS = new MapBuilder>() + // exposed in ES + .put("classic", ClassicTokenizerFactory.class) + .put("edgengram", EdgeNGramTokenizerFactory.class) + .put("keyword", KeywordTokenizerFactory.class) + .put("letter", LetterTokenizerFactory.class) + .put("lowercase", LowerCaseTokenizerFactory.class) + .put("ngram", NGramTokenizerFactory.class) + .put("pathhierarchy", PathHierarchyTokenizerFactory.class) + .put("pattern", PatternTokenizerFactory.class) + .put("standard", StandardTokenizerFactory.class) + .put("thai", ThaiTokenizerFactory.class) + .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class) + .put("whitespace", WhitespaceTokenizerFactory.class) + + // this one "seems to mess up offsets". probably shouldn't be a tokenizer... + .put("wikipedia", Void.class) + .immutableMap(); + + static final Map> KNOWN_TOKENFILTERS = new MapBuilder>() + // exposed in ES + .put("apostrophe", ApostropheFilterFactory.class) + .put("arabicnormalization", ArabicNormalizationFilterFactory.class) + .put("arabicstem", ArabicStemTokenFilterFactory.class) + .put("asciifolding", ASCIIFoldingTokenFilterFactory.class) + .put("brazilianstem", BrazilianStemTokenFilterFactory.class) + .put("bulgarianstem", StemmerTokenFilterFactory.class) + .put("cjkbigram", CJKBigramFilterFactory.class) + .put("cjkwidth", CJKWidthFilterFactory.class) + .put("classic", ClassicFilterFactory.class) + .put("commongrams", CommonGramsTokenFilterFactory.class) + .put("commongramsquery", CommonGramsTokenFilterFactory.class) + .put("czechstem", CzechStemTokenFilterFactory.class) + .put("decimaldigit", DecimalDigitFilterFactory.class) + .put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class) + .put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class) 
+ .put("edgengram", EdgeNGramTokenFilterFactory.class) + .put("elision", ElisionTokenFilterFactory.class) + .put("englishminimalstem", StemmerTokenFilterFactory.class) + .put("englishpossessive", StemmerTokenFilterFactory.class) + .put("finnishlightstem", StemmerTokenFilterFactory.class) + .put("frenchlightstem", StemmerTokenFilterFactory.class) + .put("frenchminimalstem", StemmerTokenFilterFactory.class) + .put("galicianminimalstem", StemmerTokenFilterFactory.class) + .put("galicianstem", StemmerTokenFilterFactory.class) + .put("germanstem", GermanStemTokenFilterFactory.class) + .put("germanlightstem", StemmerTokenFilterFactory.class) + .put("germanminimalstem", StemmerTokenFilterFactory.class) + .put("germannormalization", GermanNormalizationFilterFactory.class) + .put("greeklowercase", LowerCaseTokenFilterFactory.class) + .put("greekstem", StemmerTokenFilterFactory.class) + .put("hindinormalization", HindiNormalizationFilterFactory.class) + .put("hindistem", StemmerTokenFilterFactory.class) + .put("hungarianlightstem", StemmerTokenFilterFactory.class) + .put("hunspellstem", HunspellTokenFilterFactory.class) + .put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class) + .put("indicnormalization", IndicNormalizationFilterFactory.class) + .put("irishlowercase", LowerCaseTokenFilterFactory.class) + .put("indonesianstem", StemmerTokenFilterFactory.class) + .put("italianlightstem", StemmerTokenFilterFactory.class) + .put("keepword", KeepWordFilterFactory.class) + .put("keywordmarker", KeywordMarkerTokenFilterFactory.class) + .put("kstem", KStemTokenFilterFactory.class) + .put("latvianstem", StemmerTokenFilterFactory.class) + .put("length", LengthTokenFilterFactory.class) + .put("limittokencount", LimitTokenCountFilterFactory.class) + .put("lowercase", LowerCaseTokenFilterFactory.class) + .put("ngram", NGramTokenFilterFactory.class) + .put("norwegianlightstem", StemmerTokenFilterFactory.class) + .put("norwegianminimalstem", 
StemmerTokenFilterFactory.class) + .put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class) + .put("patternreplace", PatternReplaceTokenFilterFactory.class) + .put("persiannormalization", PersianNormalizationFilterFactory.class) + .put("porterstem", PorterStemTokenFilterFactory.class) + .put("portuguesestem", StemmerTokenFilterFactory.class) + .put("portugueselightstem", StemmerTokenFilterFactory.class) + .put("portugueseminimalstem", StemmerTokenFilterFactory.class) + .put("reversestring", ReverseTokenFilterFactory.class) + .put("russianlightstem", StemmerTokenFilterFactory.class) + .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class) + .put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class) + .put("serbiannormalization", SerbianNormalizationFilterFactory.class) + .put("shingle", ShingleTokenFilterFactory.class) + .put("snowballporter", SnowballTokenFilterFactory.class) + .put("soraninormalization", SoraniNormalizationFilterFactory.class) + .put("soranistem", StemmerTokenFilterFactory.class) + .put("spanishlightstem", StemmerTokenFilterFactory.class) + .put("standard", StandardTokenFilterFactory.class) + .put("stemmeroverride", StemmerOverrideTokenFilterFactory.class) + .put("stop", StopTokenFilterFactory.class) + .put("swedishlightstem", StemmerTokenFilterFactory.class) + .put("synonym", SynonymTokenFilterFactory.class) + .put("trim", TrimTokenFilterFactory.class) + .put("truncate", TruncateTokenFilterFactory.class) + .put("turkishlowercase", LowerCaseTokenFilterFactory.class) + .put("type", KeepTypesFilterFactory.class) + .put("uppercase", UpperCaseTokenFilterFactory.class) + .put("worddelimiter", WordDelimiterTokenFilterFactory.class) + + // TODO: these tokenfilters are not yet exposed: useful? 
+ + // suggest stop + .put("suggeststop", Void.class) + // capitalizes tokens + .put("capitalization", Void.class) + // like length filter (but codepoints) + .put("codepointcount", Void.class) + // puts hyphenated words back together + .put("hyphenatedwords", Void.class) + // repeats anything marked as keyword + .put("keywordrepeat", Void.class) + // like limittokencount, but by offset + .put("limittokenoffset", Void.class) + // like limittokencount, but by position + .put("limittokenposition", Void.class) + // ??? + .put("numericpayload", Void.class) + // removes duplicates at the same position (this should be used by the existing factory) + .put("removeduplicates", Void.class) + // ??? + .put("tokenoffsetpayload", Void.class) + // puts the type into the payload + .put("typeaspayload", Void.class) + // fingerprint + .put("fingerprint", Void.class) + // for tee-sinks + .put("daterecognizer", Void.class) + .immutableMap(); + + static final Map> KNOWN_CHARFILTERS = new MapBuilder>() + // exposed in ES + .put("htmlstrip", HtmlStripCharFilterFactory.class) + .put("mapping", MappingCharFilterFactory.class) + .put("patternreplace", PatternReplaceCharFilterFactory.class) + + // TODO: these charfilters are not yet exposed: useful? 
+ // handling of zwnj for persian + .put("persian", Void.class) + .immutableMap(); + + protected Map> getTokenizers() { + return KNOWN_TOKENIZERS; + } + + protected Map> getTokenFilters() { + return KNOWN_TOKENFILTERS; + } + + protected Map> getCharFilters() { + return KNOWN_CHARFILTERS; + } + + public void testTokenizers() { + Set missing = new TreeSet(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers()); + missing.removeAll(getTokenizers().keySet()); + assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty()); + } + + public void testCharFilters() { + Set missing = new TreeSet(org.apache.lucene.analysis.util.CharFilterFactory.availableCharFilters()); + missing.removeAll(getCharFilters().keySet()); + assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty()); + } + + public void testTokenFilters() { + Set missing = new TreeSet(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters()); + missing.removeAll(getTokenFilters().keySet()); + assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(), missing.isEmpty()); + } + + public void testMultiTermAware() { + Collection> expected = new HashSet<>(); + for (Map.Entry> entry : getTokenizers().entrySet()) { + if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom( + org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(entry.getKey()))) { + expected.add(entry.getValue()); + } + } + for (Map.Entry> entry : getTokenFilters().entrySet()) { + if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom( + org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass(entry.getKey()))) { + expected.add(entry.getValue()); + } + } + for (Map.Entry> entry : getCharFilters().entrySet()) { + if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom( + 
org.apache.lucene.analysis.util.CharFilterFactory.lookupClass(entry.getKey()))) { + expected.add(entry.getValue()); + } + } + expected.remove(Void.class); + expected.remove(Deprecated.class); + + Collection> actual = new HashSet<>(); + for (Class clazz : getTokenizers().values()) { + if (MultiTermAwareComponent.class.isAssignableFrom(clazz)) { + actual.add(clazz); + } + } + for (Class clazz : getTokenFilters().values()) { + if (MultiTermAwareComponent.class.isAssignableFrom(clazz)) { + actual.add(clazz); + } + } + for (Class clazz : getCharFilters().values()) { + if (MultiTermAwareComponent.class.isAssignableFrom(clazz)) { + actual.add(clazz); + } + } + + Set> classesMissingMultiTermSupport = new HashSet<>(expected); + classesMissingMultiTermSupport.removeAll(actual); + assertTrue("Classes are missing multi-term support: " + classesMissingMultiTermSupport, + classesMissingMultiTermSupport.isEmpty()); + + Set> classesThatShouldNotHaveMultiTermSupport = new HashSet<>(actual); + classesThatShouldNotHaveMultiTermSupport.removeAll(expected); + assertTrue("Classes should not have multi-term support: " + classesThatShouldNotHaveMultiTermSupport, + classesThatShouldNotHaveMultiTermSupport.isEmpty()); + } + +}