Pre-built analysis factories do not implement MultiTermAware correctly. (#21981)
We had tests for the regular factories, but not for the pre-built ones, which ship by default without requiring users to define them in the analysis settings.
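For context: a factory advertises that it can safely be applied to partial terms (wildcard, prefix, regex) by implementing `MultiTermAwareComponent`, whose `getMultiTermComponent()` returns the component to use at query time. A minimal sketch of how a consumer discovers that support; the `MultiTermSupport` helper is illustrative, not an actual Elasticsearch call site:

import java.util.ArrayList;
import java.util.List;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;

// Hypothetical helper: collect the components that are safe to apply
// to wildcard/prefix query terms.
class MultiTermSupport {
    static List<Object> queryTimeComponents(List<TokenFilterFactory> chain) {
        List<Object> components = new ArrayList<>();
        for (TokenFilterFactory factory : chain) {
            if (factory instanceof MultiTermAwareComponent) {
                // getMultiTermComponent() is declared to return Object in this API.
                components.add(((MultiTermAwareComponent) factory).getMultiTermComponent());
            }
        }
        return components;
    }
}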
commit c746854e03 (parent 33b8d7a19d)
PreBuiltTokenFilters.java

@@ -65,6 +65,7 @@ import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.Version;
 import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.tartarus.snowball.ext.DutchStemmer;
@@ -112,6 +113,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ASCIIFoldingFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     LENGTH(CachingStrategy.LUCENE) {
@@ -133,6 +138,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new LowerCaseFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     UPPERCASE(CachingStrategy.LUCENE) {
@@ -140,6 +149,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new UpperCaseFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     KSTEM(CachingStrategy.ONE) {
@@ -218,6 +231,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     ARABIC_STEM(CachingStrategy.ONE) {
@@ -281,6 +298,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ArabicNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
@@ -288,6 +309,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new PersianNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
@@ -309,6 +334,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new GermanNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     HINDI_NORMALIZATION(CachingStrategy.ONE) {
@@ -316,6 +345,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new HindiNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     INDIC_NORMALIZATION(CachingStrategy.ONE) {
@@ -323,6 +356,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new IndicNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     SORANI_NORMALIZATION(CachingStrategy.ONE) {
@@ -330,6 +367,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new SoraniNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
@@ -337,6 +378,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ScandinavianNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
@@ -344,6 +389,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ScandinavianFoldingFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     APOSTROPHE(CachingStrategy.ONE) {
@@ -358,6 +407,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new CJKWidthFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     DECIMAL_DIGIT(CachingStrategy.ONE) {
@@ -365,6 +418,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new DecimalDigitFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },

     CJK_BIGRAM(CachingStrategy.ONE) {
@@ -390,7 +447,11 @@ public enum PreBuiltTokenFilters {

     ;

+    protected boolean isMultiTermAware() {
+        return false;
+    }
+
     public abstract TokenStream create(TokenStream tokenStream, Version version);

     protected final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;

@@ -399,21 +460,42 @@ public enum PreBuiltTokenFilters {
         cache = PreBuiltCacheFactory.getCache(cachingStrategy);
     }

+    private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
+
     public synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
         TokenFilterFactory factory = cache.get(version);
         if (factory == null) {
-            final String finalName = name();
-            factory = new TokenFilterFactory() {
-                @Override
-                public String name() {
-                    return finalName.toLowerCase(Locale.ROOT);
-                }
-
-                @Override
-                public TokenStream create(TokenStream tokenStream) {
-                    return valueOf(finalName).create(tokenStream, version);
-                }
-            };
+            final String finalName = name().toLowerCase(Locale.ROOT);
+            if (isMultiTermAware()) {
+                factory = new MultiTermAwareTokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return PreBuiltTokenFilters.this.create(tokenStream, version);
+                    }
+
+                    @Override
+                    public Object getMultiTermComponent() {
+                        return this;
+                    }
+                };
+            } else {
+                factory = new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return PreBuiltTokenFilters.this.create(tokenStream, version);
+                    }
+                };
+            }
             cache.put(version, factory);
         }

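One subtlety in the hunk above: `finalName` is now lowercased once, up front, so the anonymous factory can no longer re-resolve its constant with `valueOf(finalName)`; it delegates through the enclosing constant (`PreBuiltTokenFilters.this.create(...)`) instead. A stand-alone sketch of the pitfall this avoids; the `Sample` enum and `ValueOfPitfall` class are hypothetical:

import java.util.Locale;

enum Sample { LOWERCASE }

class ValueOfPitfall {
    public static void main(String[] args) {
        // Mirrors the new code: the display name is lowercased immediately.
        String finalName = Sample.LOWERCASE.name().toLowerCase(Locale.ROOT); // "lowercase"
        try {
            Sample.valueOf(finalName); // enum constant names are upper-case, so this throws
        } catch (IllegalArgumentException e) {
            System.out.println("valueOf cannot resolve lowercased names: " + e.getMessage());
        }
        // The diff sidesteps the lookup entirely by capturing the enclosing
        // constant (PreBuiltTokenFilters.this) in the anonymous factory.
    }
}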
PreBuiltTokenizers.java

@@ -33,6 +33,8 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

@@ -87,6 +89,10 @@ public enum PreBuiltTokenizers {
         protected Tokenizer create(Version version) {
             return new LowerCaseTokenizer();
         }
+        @Override
+        protected TokenFilterFactory getMultiTermComponent(Version version) {
+            return PreBuiltTokenFilters.LOWERCASE.getTokenFilterFactory(version);
+        }
     },

     WHITESPACE(CachingStrategy.LUCENE) {
@@ -128,6 +134,10 @@ public enum PreBuiltTokenizers {

     protected abstract Tokenizer create(Version version);

+    protected TokenFilterFactory getMultiTermComponent(Version version) {
+        return null;
+    }
+
     protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;


@@ -135,22 +145,42 @@ public enum PreBuiltTokenizers {
         cache = PreBuiltCacheFactory.getCache(cachingStrategy);
     }

+    private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
+
     public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
         TokenizerFactory tokenizerFactory = cache.get(version);
         if (tokenizerFactory == null) {
-            final String finalName = name();
-
-            tokenizerFactory = new TokenizerFactory() {
-                @Override
-                public String name() {
-                    return finalName.toLowerCase(Locale.ROOT);
-                }
-
-                @Override
-                public Tokenizer create() {
-                    return valueOf(finalName).create(version);
-                }
-            };
+            final String finalName = name().toLowerCase(Locale.ROOT);
+            if (getMultiTermComponent(version) != null) {
+                tokenizerFactory = new MultiTermAwareTokenizerFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+
+                    @Override
+                    public Tokenizer create() {
+                        return PreBuiltTokenizers.this.create(version);
+                    }
+
+                    @Override
+                    public Object getMultiTermComponent() {
+                        return PreBuiltTokenizers.this.getMultiTermComponent(version);
+                    }
+                };
+            } else {
+                tokenizerFactory = new TokenizerFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+
+                    @Override
+                    public Tokenizer create() {
+                        return PreBuiltTokenizers.this.create(version);
+                    }
+                };
+            }
             cache.put(version, tokenizerFactory);
         }

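A note on the LOWERCASE case above: a tokenizer cannot itself be run over a single query term, so its multi-term component is the equivalent token filter, and the diff wires it to `PreBuiltTokenFilters.LOWERCASE`. A hedged usage sketch; the `LowercaseTokenizerDemo` wrapper and the use of `Version.CURRENT` are illustrative:

import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

// Sketch of what a caller observes for the pre-built LOWERCASE tokenizer after this change.
class LowercaseTokenizerDemo {
    static Object queryTimeComponent() {
        TokenizerFactory lowercase = PreBuiltTokenizers.LOWERCASE.getTokenizerFactory(Version.CURRENT);
        if (lowercase instanceof MultiTermAwareComponent) {
            // Hands back PreBuiltTokenFilters.LOWERCASE's factory: the query-time
            // stand-in for lowercasing a whole term without tokenizing it.
            return ((MultiTermAwareComponent) lowercase).getMultiTermComponent();
        }
        return null;
    }
}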
AnalysisFactoryTestCase.java

@@ -19,6 +19,12 @@

 package org.elasticsearch;

+import org.apache.lucene.analysis.en.PorterStemFilterFactory;
+import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
+import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@@ -86,13 +92,19 @@ import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
 import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
+import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;
+import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
 import org.elasticsearch.test.ESTestCase;

 import java.util.Collection;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;

 /**
  * Alerts us if new analyzers are added to lucene, so we don't miss them.
@@ -102,6 +114,19 @@ import java.util.TreeSet;
  */
 public class AnalysisFactoryTestCase extends ESTestCase {

+    private static final Pattern UNDERSCORE_THEN_ANYTHING = Pattern.compile("_(.)");
+
+    private static String toCamelCase(String s) {
+        Matcher m = UNDERSCORE_THEN_ANYTHING.matcher(s);
+        StringBuffer sb = new StringBuffer();
+        while (m.find()) {
+            m.appendReplacement(sb, m.group(1).toUpperCase());
+        }
+        m.appendTail(sb);
+        sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
+        return sb.toString();
+    }
+
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("classic", ClassicTokenizerFactory.class)
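For intuition, the `toCamelCase` helper added above maps ES-style snake_case names to the CamelCase form used for the Lucene SPI lookups below. Illustrative expectations, not part of the commit:

// These would hold inside this class:
assert toCamelCase("keyword_marker").equals("KeywordMarker");
assert toCamelCase("cjk_width").equals("CjkWidth");
assert toCamelCase("lowercase").equals("Lowercase");
// The result feeds the lookupClass(...) calls; Lucene's SPI name matching is
// case-insensitive, which is why "CjkWidth" can still resolve CJKWidthFilterFactory.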
@@ -121,6 +146,26 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("wikipedia", Void.class)
         .immutableMap();

+    static final Map<PreBuiltTokenizers, Class<?>> PREBUILT_TOKENIZERS;
+    static {
+        PREBUILT_TOKENIZERS = new HashMap<>();
+        for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
+            Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            case UAX_URL_EMAIL:
+                luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
+                break;
+            case PATH_HIERARCHY:
+                luceneFactoryClazz = Void.class;
+                break;
+            default:
+                luceneFactoryClazz = org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(
+                        toCamelCase(tokenizer.getTokenizerFactory(Version.CURRENT).name()));
+            }
+            PREBUILT_TOKENIZERS.put(tokenizer, luceneFactoryClazz);
+        }
+    }
+
     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("apostrophe", ApostropheFilterFactory.class)
@@ -233,6 +278,41 @@ public class AnalysisFactoryTestCase extends ESTestCase {

         .immutableMap();

+    static final Map<PreBuiltTokenFilters, Class<?>> PREBUILT_TOKENFILTERS;
+    static {
+        PREBUILT_TOKENFILTERS = new HashMap<>();
+        for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) {
+            Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            case REVERSE:
+                luceneFactoryClazz = ReverseStringFilterFactory.class;
+                break;
+            case UNIQUE:
+                luceneFactoryClazz = Void.class;
+                break;
+            case SNOWBALL:
+            case DUTCH_STEM:
+            case FRENCH_STEM:
+            case RUSSIAN_STEM:
+                luceneFactoryClazz = SnowballPorterFilterFactory.class;
+                break;
+            case STEMMER:
+                luceneFactoryClazz = PorterStemFilterFactory.class;
+                break;
+            case DELIMITED_PAYLOAD_FILTER:
+                luceneFactoryClazz = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
+                break;
+            case LIMIT:
+                luceneFactoryClazz = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class;
+                break;
+            default:
+                luceneFactoryClazz = org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass(
+                        toCamelCase(tokenizer.getTokenFilterFactory(Version.CURRENT).name()));
+            }
+            PREBUILT_TOKENFILTERS.put(tokenizer, luceneFactoryClazz);
+        }
+    }
+
     static final Map<String,Class<?>> KNOWN_CHARFILTERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("htmlstrip", HtmlStripCharFilterFactory.class)
@@ -244,6 +324,20 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("persian", Void.class)
         .immutableMap();

+    static final Map<PreBuiltCharFilters, Class<?>> PREBUILT_CHARFILTERS;
+    static {
+        PREBUILT_CHARFILTERS = new HashMap<>();
+        for (PreBuiltCharFilters tokenizer : PreBuiltCharFilters.values()) {
+            Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            default:
+                luceneFactoryClazz = org.apache.lucene.analysis.util.CharFilterFactory.lookupClass(
+                        toCamelCase(tokenizer.getCharFilterFactory(Version.CURRENT).name()));
+            }
+            PREBUILT_CHARFILTERS.put(tokenizer, luceneFactoryClazz);
+        }
+    }
+
     protected Map<String, Class<?>> getTokenizers() {
         return KNOWN_TOKENIZERS;
     }
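Throughout these maps, `Void.class` serves as a sentinel meaning "no Lucene factory to compare against" (e.g. `path_hierarchy`, `unique`), and the test below skips such entries. The convention in miniature, with hypothetical map contents:

// Hypothetical miniature of the Void.class sentinel convention:
static void crossCheck(Map<String, Class<?>> prebuilt) {
    for (Map.Entry<String, Class<?>> e : prebuilt.entrySet()) {
        if (e.getValue() == Void.class) {
            continue; // ES-only component, no Lucene factory to compare against
        }
        // ... compare multi-term awareness against the Lucene factory class ...
    }
}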
@@ -325,4 +419,62 @@ public class AnalysisFactoryTestCase extends ESTestCase {
                 classesThatShouldNotHaveMultiTermSupport.isEmpty());
     }

+    public void testPreBuiltMultiTermAware() {
+        Collection<Object> expected = new HashSet<>();
+        Collection<Object> actual = new HashSet<>();
+
+        for (Map.Entry<PreBuiltTokenizers, Class<?>> entry : PREBUILT_TOKENIZERS.entrySet()) {
+            PreBuiltTokenizers tokenizer = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            assertTrue(TokenizerFactory.class.isAssignableFrom(luceneFactory));
+            if (tokenizer.getTokenizerFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
+                actual.add(tokenizer);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(tokenizer);
+            }
+        }
+        for (Map.Entry<PreBuiltTokenFilters, Class<?>> entry : PREBUILT_TOKENFILTERS.entrySet()) {
+            PreBuiltTokenFilters tokenFilter = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory));
+            if (tokenFilter.getTokenFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
+                actual.add(tokenFilter);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(tokenFilter);
+            }
+        }
+        for (Map.Entry<PreBuiltCharFilters, Class<?>> entry : PREBUILT_CHARFILTERS.entrySet()) {
+            PreBuiltCharFilters charFilter = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            assertTrue(CharFilterFactory.class.isAssignableFrom(luceneFactory));
+            if (charFilter.getCharFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
+                actual.add(charFilter);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(charFilter);
+            }
+        }
+
+        Set<Object> classesMissingMultiTermSupport = new HashSet<>(expected);
+        classesMissingMultiTermSupport.removeAll(actual);
+        assertTrue("Pre-built components are missing multi-term support: " + classesMissingMultiTermSupport,
+                classesMissingMultiTermSupport.isEmpty());
+
+        Set<Object> classesThatShouldNotHaveMultiTermSupport = new HashSet<>(actual);
+        classesThatShouldNotHaveMultiTermSupport.removeAll(expected);
+        assertTrue("Pre-built components should not have multi-term support: " + classesThatShouldNotHaveMultiTermSupport,
+                classesThatShouldNotHaveMultiTermSupport.isEmpty());
+    }
+
 }
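The closing assertions compare `expected` (what the Lucene factories advertise) and `actual` (what the ES wrappers advertise) by taking the set difference in both directions, which amounts to a set-equality check with a readable failure message for each side. The same pattern in miniature, with illustrative data:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

class TwoWayDiff {
    public static void main(String[] args) {
        Set<String> expected = new HashSet<>(Arrays.asList("lowercase", "ascii_folding"));
        Set<String> actual = new HashSet<>(Arrays.asList("lowercase"));

        Set<String> missing = new HashSet<>(expected);
        missing.removeAll(actual);      // should be multi-term aware, but is not

        Set<String> unexpected = new HashSet<>(actual);
        unexpected.removeAll(expected); // claims support the Lucene factory lacks

        // Both sets must be empty for expected and actual to be equal.
        System.out.println("missing=" + missing + " unexpected=" + unexpected);
    }
}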