Pre-built analysis factories do not implement MultiTermAware correctly. (#21981)

We had tests for the regular factories, but not for the pre-built ones, which ship
by default and do not require users to define them in the analysis settings.
Adrien Grand 2016-12-07 10:32:25 +01:00 committed by GitHub
parent 33b8d7a19d
commit c746854e03
3 changed files with 287 additions and 23 deletions
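
In practice, "implementing MultiTermAware correctly" means that a pre-built factory which only normalizes tokens (lowercasing, ASCII folding, and so on) should also implement MultiTermAwareComponent, so that wildcard, prefix and fuzzy queries get the same normalization at query time. A rough probe of the new behavior, not part of this commit (the class name and main method are invented for illustration; the enum and interface are the ones touched in the diff below):

import java.util.Locale;

import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;

// Hypothetical helper, not part of this commit: list which pre-built token
// filters now advertise multi-term support, mirroring the check used by the new test.
public class ListMultiTermAwarePreBuiltFilters {
    public static void main(String[] args) {
        for (PreBuiltTokenFilters filter : PreBuiltTokenFilters.values()) {
            boolean multiTermAware =
                filter.getTokenFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent;
            System.out.println(filter.name().toLowerCase(Locale.ROOT) + " -> " + multiTermAware);
        }
    }
}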

File: PreBuiltTokenFilters.java

@@ -65,6 +65,7 @@ import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.Version;
 import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 import org.tartarus.snowball.ext.DutchStemmer;
@@ -112,6 +113,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ASCIIFoldingFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     LENGTH(CachingStrategy.LUCENE) {
@@ -133,6 +138,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new LowerCaseFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     UPPERCASE(CachingStrategy.LUCENE) {
@@ -140,6 +149,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new UpperCaseFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     KSTEM(CachingStrategy.ONE) {
@@ -218,6 +231,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     ARABIC_STEM(CachingStrategy.ONE) {
@@ -281,6 +298,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ArabicNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     PERSIAN_NORMALIZATION(CachingStrategy.ONE) {
@@ -288,6 +309,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new PersianNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     TYPE_AS_PAYLOAD(CachingStrategy.ONE) {
@@ -309,6 +334,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new GermanNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     HINDI_NORMALIZATION(CachingStrategy.ONE) {
@@ -316,6 +345,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new HindiNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     INDIC_NORMALIZATION(CachingStrategy.ONE) {
@@ -323,6 +356,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new IndicNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     SORANI_NORMALIZATION(CachingStrategy.ONE) {
@@ -330,6 +367,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new SoraniNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
@@ -337,6 +378,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ScandinavianNormalizationFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
@@ -344,6 +389,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ScandinavianFoldingFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     APOSTROPHE(CachingStrategy.ONE) {
@@ -358,6 +407,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new CJKWidthFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     DECIMAL_DIGIT(CachingStrategy.ONE) {
@@ -365,6 +418,10 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new DecimalDigitFilter(tokenStream);
         }
+        @Override
+        protected boolean isMultiTermAware() {
+            return true;
+        }
     },
     CJK_BIGRAM(CachingStrategy.ONE) {
@@ -390,6 +447,10 @@ public enum PreBuiltTokenFilters {
     ;
+    protected boolean isMultiTermAware() {
+        return false;
+    }
     public abstract TokenStream create(TokenStream tokenStream, Version version);
     protected final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
@@ -399,21 +460,42 @@ public enum PreBuiltTokenFilters {
         cache = PreBuiltCacheFactory.getCache(cachingStrategy);
     }
+    private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}
     public synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
         TokenFilterFactory factory = cache.get(version);
         if (factory == null) {
-            final String finalName = name();
+            final String finalName = name().toLowerCase(Locale.ROOT);
-            factory = new TokenFilterFactory() {
+            if (isMultiTermAware()) {
+                factory = new MultiTermAwareTokenFilterFactory() {
                     @Override
                     public String name() {
-                        return finalName.toLowerCase(Locale.ROOT);
+                        return finalName;
                     }
                     @Override
                     public TokenStream create(TokenStream tokenStream) {
-                        return valueOf(finalName).create(tokenStream, version);
+                        return PreBuiltTokenFilters.this.create(tokenStream, version);
+                    }
+                    @Override
+                    public Object getMultiTermComponent() {
+                        return this;
                     }
                 };
+            } else {
+                factory = new TokenFilterFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+                    @Override
+                    public TokenStream create(TokenStream tokenStream) {
+                        return PreBuiltTokenFilters.this.create(tokenStream, version);
+                    }
+                };
+            }
             cache.put(version, factory);
         }
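
The factory returned above is an ordinary TokenFilterFactory, so it can be exercised directly. A small, hypothetical usage sketch, not part of the commit (the demo class and the sample input are made up; WhitespaceTokenizer is plain Lucene):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;

// Hypothetical demo, not part of this commit: run the cached pre-built LOWERCASE
// token filter factory over a tiny token stream.
public class PreBuiltLowercaseDemo {
    public static void main(String[] args) throws Exception {
        TokenFilterFactory lowercase = PreBuiltTokenFilters.LOWERCASE.getTokenFilterFactory(Version.CURRENT);
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("QUICK Brown FOX"));
        try (TokenStream stream = lowercase.create(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // prints: quick, brown, fox
            }
            stream.end();
        }
    }
}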

File: PreBuiltTokenizers.java

@@ -33,6 +33,8 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
 import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.regex.Regex;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
@@ -87,6 +89,10 @@ public enum PreBuiltTokenizers {
         protected Tokenizer create(Version version) {
             return new LowerCaseTokenizer();
         }
+        @Override
+        protected TokenFilterFactory getMultiTermComponent(Version version) {
+            return PreBuiltTokenFilters.LOWERCASE.getTokenFilterFactory(version);
+        }
     },
     WHITESPACE(CachingStrategy.LUCENE) {
@@ -128,6 +134,10 @@ public enum PreBuiltTokenizers {
     protected abstract Tokenizer create(Version version);
+    protected TokenFilterFactory getMultiTermComponent(Version version) {
+        return null;
+    }
     protected final PreBuiltCacheFactory.PreBuiltCache<TokenizerFactory> cache;
@@ -135,22 +145,42 @@ public enum PreBuiltTokenizers {
         cache = PreBuiltCacheFactory.getCache(cachingStrategy);
     }
+    private interface MultiTermAwareTokenizerFactory extends TokenizerFactory, MultiTermAwareComponent {}
     public synchronized TokenizerFactory getTokenizerFactory(final Version version) {
         TokenizerFactory tokenizerFactory = cache.get(version);
         if (tokenizerFactory == null) {
-            final String finalName = name();
+            final String finalName = name().toLowerCase(Locale.ROOT);
+            if (getMultiTermComponent(version) != null) {
-            tokenizerFactory = new TokenizerFactory() {
+                tokenizerFactory = new MultiTermAwareTokenizerFactory() {
                     @Override
                     public String name() {
-                        return finalName.toLowerCase(Locale.ROOT);
+                        return finalName;
                     }
                     @Override
                     public Tokenizer create() {
-                        return valueOf(finalName).create(version);
+                        return PreBuiltTokenizers.this.create(version);
+                    }
+                    @Override
+                    public Object getMultiTermComponent() {
+                        return PreBuiltTokenizers.this.getMultiTermComponent(version);
                     }
                 };
+            } else {
+                tokenizerFactory = new TokenizerFactory() {
+                    @Override
+                    public String name() {
+                        return finalName;
+                    }
+                    @Override
+                    public Tokenizer create() {
+                        return PreBuiltTokenizers.this.create(version);
+                    }
+                };
+            }
             cache.put(version, tokenizerFactory);
         }
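
For tokenizers, the multi-term component is a separate object: per the hunk above, the lowercase tokenizer hands back the pre-built LOWERCASE token filter factory. A hypothetical check, not part of the commit (it assumes the enum constant for this tokenizer is named LOWERCASE, which the hunk does not show):

import org.elasticsearch.Version;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;
import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

// Hypothetical check, not part of this commit.
public class PreBuiltTokenizerMultiTermCheck {
    public static void main(String[] args) {
        TokenizerFactory lowercaseTokenizer = PreBuiltTokenizers.LOWERCASE.getTokenizerFactory(Version.CURRENT);
        Object multiTerm = ((MultiTermAwareComponent) lowercaseTokenizer).getMultiTermComponent();
        // Factories are cached per version, so this should be the exact same instance
        // that PreBuiltTokenFilters.LOWERCASE hands out for Version.CURRENT.
        System.out.println(multiTerm == PreBuiltTokenFilters.LOWERCASE.getTokenFilterFactory(Version.CURRENT));
    }
}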

File: AnalysisFactoryTestCase.java

@@ -19,6 +19,12 @@
 package org.elasticsearch;
+import org.apache.lucene.analysis.en.PorterStemFilterFactory;
+import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
+import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
+import org.apache.lucene.analysis.util.CharFilterFactory;
+import org.apache.lucene.analysis.util.TokenFilterFactory;
+import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@@ -86,13 +92,19 @@ import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
 import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
+import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
+import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;
+import org.elasticsearch.indices.analysis.PreBuiltTokenizers;
 import org.elasticsearch.test.ESTestCase;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 /**
  * Alerts us if new analyzers are added to lucene, so we don't miss them.
@@ -102,6 +114,19 @@ import java.util.TreeSet;
  */
 public class AnalysisFactoryTestCase extends ESTestCase {
+    private static final Pattern UNDERSCORE_THEN_ANYTHING = Pattern.compile("_(.)");
+    private static String toCamelCase(String s) {
+        Matcher m = UNDERSCORE_THEN_ANYTHING.matcher(s);
+        StringBuffer sb = new StringBuffer();
+        while (m.find()) {
+            m.appendReplacement(sb, m.group(1).toUpperCase());
+        }
+        m.appendTail(sb);
+        sb.setCharAt(0, Character.toUpperCase(sb.charAt(0)));
+        return sb.toString();
+    }
     static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("classic", ClassicTokenizerFactory.class)
@@ -121,6 +146,26 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("wikipedia", Void.class)
         .immutableMap();
+    static final Map<PreBuiltTokenizers, Class<?>> PREBUILT_TOKENIZERS;
+    static {
+        PREBUILT_TOKENIZERS = new HashMap<>();
+        for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
+            Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            case UAX_URL_EMAIL:
+                luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
+                break;
+            case PATH_HIERARCHY:
+                luceneFactoryClazz = Void.class;
+                break;
+            default:
+                luceneFactoryClazz = org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(
+                        toCamelCase(tokenizer.getTokenizerFactory(Version.CURRENT).name()));
+            }
+            PREBUILT_TOKENIZERS.put(tokenizer, luceneFactoryClazz);
+        }
+    }
     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("apostrophe", ApostropheFilterFactory.class)
@@ -233,6 +278,41 @@
         .immutableMap();
+    static final Map<PreBuiltTokenFilters, Class<?>> PREBUILT_TOKENFILTERS;
+    static {
+        PREBUILT_TOKENFILTERS = new HashMap<>();
+        for (PreBuiltTokenFilters tokenizer : PreBuiltTokenFilters.values()) {
+            Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            case REVERSE:
+                luceneFactoryClazz = ReverseStringFilterFactory.class;
+                break;
+            case UNIQUE:
+                luceneFactoryClazz = Void.class;
+                break;
+            case SNOWBALL:
+            case DUTCH_STEM:
+            case FRENCH_STEM:
+            case RUSSIAN_STEM:
+                luceneFactoryClazz = SnowballPorterFilterFactory.class;
+                break;
+            case STEMMER:
+                luceneFactoryClazz = PorterStemFilterFactory.class;
+                break;
+            case DELIMITED_PAYLOAD_FILTER:
+                luceneFactoryClazz = org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class;
+                break;
+            case LIMIT:
+                luceneFactoryClazz = org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory.class;
+                break;
+            default:
+                luceneFactoryClazz = org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass(
+                        toCamelCase(tokenizer.getTokenFilterFactory(Version.CURRENT).name()));
+            }
+            PREBUILT_TOKENFILTERS.put(tokenizer, luceneFactoryClazz);
+        }
+    }
     static final Map<String,Class<?>> KNOWN_CHARFILTERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("htmlstrip", HtmlStripCharFilterFactory.class)
@@ -244,6 +324,20 @@
         .put("persian", Void.class)
         .immutableMap();
+    static final Map<PreBuiltCharFilters, Class<?>> PREBUILT_CHARFILTERS;
+    static {
+        PREBUILT_CHARFILTERS = new HashMap<>();
+        for (PreBuiltCharFilters tokenizer : PreBuiltCharFilters.values()) {
+            Class<?> luceneFactoryClazz;
+            switch (tokenizer) {
+            default:
+                luceneFactoryClazz = org.apache.lucene.analysis.util.CharFilterFactory.lookupClass(
+                        toCamelCase(tokenizer.getCharFilterFactory(Version.CURRENT).name()));
+            }
+            PREBUILT_CHARFILTERS.put(tokenizer, luceneFactoryClazz);
+        }
+    }
     protected Map<String, Class<?>> getTokenizers() {
         return KNOWN_TOKENIZERS;
     }
@@ -325,4 +419,62 @@
                 classesThatShouldNotHaveMultiTermSupport.isEmpty());
     }
+    public void testPreBuiltMultiTermAware() {
+        Collection<Object> expected = new HashSet<>();
+        Collection<Object> actual = new HashSet<>();
+        for (Map.Entry<PreBuiltTokenizers, Class<?>> entry : PREBUILT_TOKENIZERS.entrySet()) {
+            PreBuiltTokenizers tokenizer = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            assertTrue(TokenizerFactory.class.isAssignableFrom(luceneFactory));
+            if (tokenizer.getTokenizerFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
+                actual.add(tokenizer);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(tokenizer);
+            }
+        }
+        for (Map.Entry<PreBuiltTokenFilters, Class<?>> entry : PREBUILT_TOKENFILTERS.entrySet()) {
+            PreBuiltTokenFilters tokenFilter = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            assertTrue(TokenFilterFactory.class.isAssignableFrom(luceneFactory));
+            if (tokenFilter.getTokenFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
+                actual.add(tokenFilter);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(tokenFilter);
+            }
+        }
+        for (Map.Entry<PreBuiltCharFilters, Class<?>> entry : PREBUILT_CHARFILTERS.entrySet()) {
+            PreBuiltCharFilters charFilter = entry.getKey();
+            Class<?> luceneFactory = entry.getValue();
+            if (luceneFactory == Void.class) {
+                continue;
+            }
+            assertTrue(CharFilterFactory.class.isAssignableFrom(luceneFactory));
+            if (charFilter.getCharFilterFactory(Version.CURRENT) instanceof MultiTermAwareComponent) {
+                actual.add(charFilter);
+            }
+            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(luceneFactory)) {
+                expected.add(charFilter);
+            }
+        }
+        Set<Object> classesMissingMultiTermSupport = new HashSet<>(expected);
+        classesMissingMultiTermSupport.removeAll(actual);
+        assertTrue("Pre-built components are missing multi-term support: " + classesMissingMultiTermSupport,
+                classesMissingMultiTermSupport.isEmpty());
+        Set<Object> classesThatShouldNotHaveMultiTermSupport = new HashSet<>(actual);
+        classesThatShouldNotHaveMultiTermSupport.removeAll(expected);
+        assertTrue("Pre-built components should not have multi-term support: " + classesThatShouldNotHaveMultiTermSupport,
+                classesThatShouldNotHaveMultiTermSupport.isEmpty());
+    }
 }