Add a MultiTermAwareComponent marker interface to analysis factories. #19028
This is the same as what Lucene does for its analysis factories, and we hawe tests that make sure that the elasticsearch factories are in sync with Lucene's. This is a first step to move forward on #9978 and #18064.
This commit is contained in:
parent
6c8744ecb5
commit
7ba5bceebe
|
@ -29,7 +29,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
* Factory for ASCIIFoldingFilter.
|
||||
*/
|
||||
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
|
||||
public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
|
||||
|
||||
|
@ -44,4 +44,9 @@ public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ASCIIFoldingFilter(tokenStream, preserveOriginal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -37,4 +37,9 @@ public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ArabicNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ import org.elasticsearch.common.settings.Settings;
|
|||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
|
||||
public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory {
|
||||
public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -36,4 +36,9 @@ public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory {
|
|||
return new CJKWidthFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
* Factory for {@link DecimalDigitFilter}
|
||||
*/
|
||||
public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory {
|
||||
public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public DecimalDigitFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,9 @@ public final class DecimalDigitFilterFactory extends AbstractTokenFilterFactory
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new DecimalDigitFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final CharArraySet articles;
|
||||
|
||||
|
@ -42,4 +42,9 @@ public class ElisionTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ElisionFilter(tokenStream, articles);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
* Factory for {@link GermanNormalizationFilter}
|
||||
*/
|
||||
public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,8 @@ public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory
|
|||
return new GermanNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
* Factory for {@link HindiNormalizationFilter}
|
||||
*/
|
||||
public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,8 @@ public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory
|
|||
return new HindiNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
* Factory for {@link IndicNormalizationFilter}
|
||||
*/
|
||||
public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,8 @@ public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory
|
|||
return new IndicNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,7 +37,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
* <li>turkish: {@link TurkishLowerCaseFilter}
|
||||
* </ul>
|
||||
*/
|
||||
public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final String lang;
|
||||
|
||||
|
@ -60,6 +60,11 @@ public class LowerCaseTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
throw new IllegalArgumentException("language [" + lang + "] not support for lower case");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory {
|
||||
public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
|
||||
|
||||
public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,9 @@ public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory {
|
|||
public Tokenizer create() {
|
||||
return new LowerCaseTokenizer();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,7 +30,7 @@ import java.util.List;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class MappingCharFilterFactory extends AbstractCharFilterFactory {
|
||||
public class MappingCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final NormalizeCharMap normMap;
|
||||
|
||||
|
@ -114,4 +114,9 @@ public class MappingCharFilterFactory extends AbstractCharFilterFactory {
|
|||
}
|
||||
return new String(out, 0, writePos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
/** Elasticsearch counterpart of {@link org.apache.lucene.analysis.util.MultiTermAwareComponent}. */
|
||||
public interface MultiTermAwareComponent {
|
||||
|
||||
/** Returns an analysis component to handle analysis if multi-term queries.
|
||||
* The returned component must be a TokenizerFactory, TokenFilterFactory or CharFilterFactory.
|
||||
*/
|
||||
public Object getMultiTermComponent();
|
||||
|
||||
}
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,8 @@ public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactor
|
|||
return new PersianNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -37,4 +37,9 @@ public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactor
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new SerbianNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
* Factory for {@link SoraniNormalizationFilter}
|
||||
*/
|
||||
public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public SoraniNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,4 +38,9 @@ public class SoraniNormalizationFilterFactory extends AbstractTokenFilterFactory
|
|||
return new SoraniNormalizationFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
/**
|
||||
*
|
||||
*/
|
||||
public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
public UpperCaseTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
@ -38,6 +38,11 @@ public class UpperCaseTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new UpperCaseFilter(tokenStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -19,192 +19,8 @@
|
|||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Alerts us if new analyzers are added to lucene, so we don't miss them.
|
||||
* <p>
|
||||
* If we don't want to expose one for a specific reason, just map it to Void
|
||||
*/
|
||||
public class AnalysisFactoryTests extends ESTestCase {
|
||||
|
||||
static final Map<String,Class<?>> KNOWN_TOKENIZERS = new HashMap<String,Class<?>>() {{
|
||||
// deprecated ones, we dont care about these
|
||||
put("arabicletter", Deprecated.class);
|
||||
put("chinese", Deprecated.class);
|
||||
put("cjk", Deprecated.class);
|
||||
put("russianletter", Deprecated.class);
|
||||
|
||||
// exposed in ES
|
||||
put("classic", ClassicTokenizerFactory.class);
|
||||
put("edgengram", EdgeNGramTokenizerFactory.class);
|
||||
put("keyword", KeywordTokenizerFactory.class);
|
||||
put("letter", LetterTokenizerFactory.class);
|
||||
put("lowercase", LowerCaseTokenizerFactory.class);
|
||||
put("ngram", NGramTokenizerFactory.class);
|
||||
put("pathhierarchy", PathHierarchyTokenizerFactory.class);
|
||||
put("pattern", PatternTokenizerFactory.class);
|
||||
put("standard", StandardTokenizerFactory.class);
|
||||
put("thai", ThaiTokenizerFactory.class);
|
||||
put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
|
||||
put("whitespace", WhitespaceTokenizerFactory.class);
|
||||
|
||||
// this one "seems to mess up offsets". probably shouldn't be a tokenizer...
|
||||
put("wikipedia", Void.class);
|
||||
}};
|
||||
|
||||
public void testTokenizers() {
|
||||
Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers());
|
||||
missing.removeAll(KNOWN_TOKENIZERS.keySet());
|
||||
assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty());
|
||||
}
|
||||
|
||||
static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new HashMap<String,Class<?>>() {{
|
||||
// deprecated ones, we dont care about these
|
||||
put("chinese", Deprecated.class);
|
||||
put("collationkey", Deprecated.class);
|
||||
put("position", Deprecated.class);
|
||||
put("thaiword", Deprecated.class);
|
||||
|
||||
|
||||
// exposed in ES
|
||||
put("apostrophe", ApostropheFilterFactory.class);
|
||||
put("arabicnormalization", ArabicNormalizationFilterFactory.class);
|
||||
put("arabicstem", ArabicStemTokenFilterFactory.class);
|
||||
put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
|
||||
put("brazilianstem", BrazilianStemTokenFilterFactory.class);
|
||||
put("bulgarianstem", StemmerTokenFilterFactory.class);
|
||||
put("cjkbigram", CJKBigramFilterFactory.class);
|
||||
put("cjkwidth", CJKWidthFilterFactory.class);
|
||||
put("classic", ClassicFilterFactory.class);
|
||||
put("commongrams", CommonGramsTokenFilterFactory.class);
|
||||
put("commongramsquery", CommonGramsTokenFilterFactory.class);
|
||||
put("czechstem", CzechStemTokenFilterFactory.class);
|
||||
put("decimaldigit", DecimalDigitFilterFactory.class);
|
||||
put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class);
|
||||
put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class);
|
||||
put("edgengram", EdgeNGramTokenFilterFactory.class);
|
||||
put("elision", ElisionTokenFilterFactory.class);
|
||||
put("englishminimalstem", StemmerTokenFilterFactory.class);
|
||||
put("englishpossessive", StemmerTokenFilterFactory.class);
|
||||
put("finnishlightstem", StemmerTokenFilterFactory.class);
|
||||
put("frenchlightstem", StemmerTokenFilterFactory.class);
|
||||
put("frenchminimalstem", StemmerTokenFilterFactory.class);
|
||||
put("galicianminimalstem", StemmerTokenFilterFactory.class);
|
||||
put("galicianstem", StemmerTokenFilterFactory.class);
|
||||
put("germanstem", GermanStemTokenFilterFactory.class);
|
||||
put("germanlightstem", StemmerTokenFilterFactory.class);
|
||||
put("germanminimalstem", StemmerTokenFilterFactory.class);
|
||||
put("germannormalization", GermanNormalizationFilterFactory.class);
|
||||
put("greeklowercase", LowerCaseTokenFilterFactory.class);
|
||||
put("greekstem", StemmerTokenFilterFactory.class);
|
||||
put("hindinormalization", HindiNormalizationFilterFactory.class);
|
||||
put("hindistem", StemmerTokenFilterFactory.class);
|
||||
put("hungarianlightstem", StemmerTokenFilterFactory.class);
|
||||
put("hunspellstem", HunspellTokenFilterFactory.class);
|
||||
put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class);
|
||||
put("indicnormalization", IndicNormalizationFilterFactory.class);
|
||||
put("irishlowercase", LowerCaseTokenFilterFactory.class);
|
||||
put("indonesianstem", StemmerTokenFilterFactory.class);
|
||||
put("italianlightstem", StemmerTokenFilterFactory.class);
|
||||
put("keepword", KeepWordFilterFactory.class);
|
||||
put("keywordmarker", KeywordMarkerTokenFilterFactory.class);
|
||||
put("kstem", KStemTokenFilterFactory.class);
|
||||
put("latvianstem", StemmerTokenFilterFactory.class);
|
||||
put("length", LengthTokenFilterFactory.class);
|
||||
put("limittokencount", LimitTokenCountFilterFactory.class);
|
||||
put("lowercase", LowerCaseTokenFilterFactory.class);
|
||||
put("ngram", NGramTokenFilterFactory.class);
|
||||
put("norwegianlightstem", StemmerTokenFilterFactory.class);
|
||||
put("norwegianminimalstem", StemmerTokenFilterFactory.class);
|
||||
put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
|
||||
put("patternreplace", PatternReplaceTokenFilterFactory.class);
|
||||
put("persiannormalization", PersianNormalizationFilterFactory.class);
|
||||
put("porterstem", PorterStemTokenFilterFactory.class);
|
||||
put("portuguesestem", StemmerTokenFilterFactory.class);
|
||||
put("portugueselightstem", StemmerTokenFilterFactory.class);
|
||||
put("portugueseminimalstem", StemmerTokenFilterFactory.class);
|
||||
put("reversestring", ReverseTokenFilterFactory.class);
|
||||
put("russianlightstem", StemmerTokenFilterFactory.class);
|
||||
put("scandinavianfolding", ScandinavianFoldingFilterFactory.class);
|
||||
put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
|
||||
put("serbiannormalization", SerbianNormalizationFilterFactory.class);
|
||||
put("shingle", ShingleTokenFilterFactory.class);
|
||||
put("snowballporter", SnowballTokenFilterFactory.class);
|
||||
put("soraninormalization", SoraniNormalizationFilterFactory.class);
|
||||
put("soranistem", StemmerTokenFilterFactory.class);
|
||||
put("spanishlightstem", StemmerTokenFilterFactory.class);
|
||||
put("standard", StandardTokenFilterFactory.class);
|
||||
put("stemmeroverride", StemmerOverrideTokenFilterFactory.class);
|
||||
put("stop", StopTokenFilterFactory.class);
|
||||
put("swedishlightstem", StemmerTokenFilterFactory.class);
|
||||
put("synonym", SynonymTokenFilterFactory.class);
|
||||
put("trim", TrimTokenFilterFactory.class);
|
||||
put("truncate", TruncateTokenFilterFactory.class);
|
||||
put("turkishlowercase", LowerCaseTokenFilterFactory.class);
|
||||
put("type", KeepTypesFilterFactory.class);
|
||||
put("uppercase", UpperCaseTokenFilterFactory.class);
|
||||
put("worddelimiter", WordDelimiterTokenFilterFactory.class);
|
||||
|
||||
// TODO: these tokenfilters are not yet exposed: useful?
|
||||
|
||||
// suggest stop
|
||||
put("suggeststop", Void.class);
|
||||
// capitalizes tokens
|
||||
put("capitalization", Void.class);
|
||||
// like length filter (but codepoints)
|
||||
put("codepointcount", Void.class);
|
||||
// puts hyphenated words back together
|
||||
put("hyphenatedwords", Void.class);
|
||||
// repeats anything marked as keyword
|
||||
put("keywordrepeat", Void.class);
|
||||
// like limittokencount, but by offset
|
||||
put("limittokenoffset", Void.class);
|
||||
// like limittokencount, but by position
|
||||
put("limittokenposition", Void.class);
|
||||
// ???
|
||||
put("numericpayload", Void.class);
|
||||
// removes duplicates at the same position (this should be used by the existing factory)
|
||||
put("removeduplicates", Void.class);
|
||||
// ???
|
||||
put("tokenoffsetpayload", Void.class);
|
||||
// puts the type into the payload
|
||||
put("typeaspayload", Void.class);
|
||||
// fingerprint
|
||||
put("fingerprint", Void.class);
|
||||
// for tee-sinks
|
||||
put("daterecognizer", Void.class);
|
||||
}};
|
||||
|
||||
public void testTokenFilters() {
|
||||
Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters());
|
||||
missing.removeAll(KNOWN_TOKENFILTERS.keySet());
|
||||
assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(), missing.isEmpty());
|
||||
}
|
||||
|
||||
static final Map<String,Class<?>> KNOWN_CHARFILTERS = new HashMap<String,Class<?>>() {{
|
||||
// exposed in ES
|
||||
put("htmlstrip", HtmlStripCharFilterFactory.class);
|
||||
put("mapping", MappingCharFilterFactory.class);
|
||||
put("patternreplace", PatternReplaceCharFilterFactory.class);
|
||||
|
||||
// TODO: these charfilters are not yet exposed: useful?
|
||||
// handling of zwnj for persian
|
||||
put("persian", Void.class);
|
||||
}};
|
||||
|
||||
public void testCharFilters() {
|
||||
Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.CharFilterFactory.availableCharFilters());
|
||||
missing.removeAll(KNOWN_CHARFILTERS.keySet());
|
||||
assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty());
|
||||
}
|
||||
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
|
||||
public class AnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||
// tests are inherited
|
||||
}
|
||||
|
|
|
@ -40,7 +40,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
*
|
||||
* @author kimchy (shay.banon)
|
||||
*/
|
||||
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
private final String unicodeSetFilter;
|
||||
|
||||
public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
|
@ -67,4 +67,9 @@ public class IcuFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
return new ICUFoldingFilter(tokenStream);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ import java.io.Reader;
|
|||
* <p>The <tt>name</tt> can be used to provide the type of normalization to perform.</p>
|
||||
* <p>The <tt>mode</tt> can be used to provide 'compose' or 'decompose'. Default is compose.</p>
|
||||
*/
|
||||
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
|
||||
public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final String name;
|
||||
|
||||
|
@ -55,4 +55,9 @@ public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory {
|
|||
public Reader create(Reader reader) {
|
||||
return new ICUNormalizer2CharFilter(reader, normalizer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,7 +32,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
*
|
||||
*
|
||||
*/
|
||||
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final String name;
|
||||
|
||||
|
@ -45,4 +45,9 @@ public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, Normalizer2.getInstance(null, name, Normalizer2.Mode.COMPOSE));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
|
||||
/**
|
||||
*/
|
||||
public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final String id;
|
||||
private final int dir;
|
||||
|
@ -47,4 +47,9 @@ public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory {
|
|||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ICUTransformFilter(tokenStream, transliterator);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnalysisICUFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenizers() {
|
||||
Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
|
||||
tokenizers.put("icu", IcuTokenizerFactory.class);
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
|
||||
filters.put("icufolding", IcuFoldingTokenFilterFactory.class);
|
||||
filters.put("icunormalizer2", IcuNormalizerTokenFilterFactory.class);
|
||||
filters.put("icutransform", IcuTransformTokenFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getCharFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
|
||||
filters.put("icunormalizer2", IcuNormalizerCharFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
}
|
|
@ -26,7 +26,7 @@ import org.elasticsearch.index.IndexSettings;
|
|||
|
||||
import java.io.Reader;
|
||||
|
||||
public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory {
|
||||
public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent {
|
||||
|
||||
private final boolean normalizeKanji;
|
||||
private final boolean normalizeKana;
|
||||
|
@ -41,4 +41,9 @@ public class KuromojiIterationMarkCharFilterFactory extends AbstractCharFilterFa
|
|||
public Reader create(Reader reader) {
|
||||
return new JapaneseIterationMarkCharFilter(reader, normalizeKanji, normalizeKana);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object getMultiTermComponent() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.ja.JapaneseTokenizerFactory;
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnalysisKuromojiFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenizers() {
|
||||
Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
|
||||
tokenizers.put("japanese", JapaneseTokenizerFactory.class);
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
|
||||
filters.put("japanesebaseform", KuromojiBaseFormFilterFactory.class);
|
||||
filters.put("japanesepartofspeechstop", KuromojiPartOfSpeechFilterFactory.class);
|
||||
filters.put("japanesereadingform", KuromojiReadingFormFilterFactory.class);
|
||||
filters.put("japanesekatakanastem", KuromojiKatakanaStemmerFactory.class);
|
||||
filters.put("japanesenumber", KuromojiNumberFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getCharFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
|
||||
filters.put("japaneseiterationmark", KuromojiIterationMarkCharFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnalysisPhoneticFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
|
||||
filters.put("beidermorse", PhoneticTokenFilterFactory.class);
|
||||
filters.put("doublemetaphone", PhoneticTokenFilterFactory.class);
|
||||
filters.put("phonetic", PhoneticTokenFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnalysisSmartChineseFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenizers() {
|
||||
Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
|
||||
tokenizers.put("hmmchinese", SmartChineseTokenizerTokenizerFactory.class);
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
|
||||
filters.put("stempelpolishstem", PolishStemTokenFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,325 @@
|
|||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch;
|
||||
|
||||
import org.elasticsearch.common.collect.MapBuilder;
|
||||
import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CJKBigramFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CJKWidthFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ClassicFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.CommonGramsTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
|
||||
import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.ElisionTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KeepWordFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KeywordMarkerTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.LengthTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
|
||||
import org.elasticsearch.index.analysis.LowerCaseTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.MappingCharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||
import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.PatternCaptureGroupTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.PatternReplaceCharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.PatternReplaceTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.PorterStemTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ReverseTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SnowballTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.StandardTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.StemmerOverrideTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.StemmerTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.TrimTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Alerts us if new analyzers are added to lucene, so we don't miss them.
|
||||
* <p>
|
||||
* If we don't want to expose one for a specific reason, just map it to Void.
|
||||
* The deprecated ones can be mapped to Deprecated.class.
|
||||
*/
|
||||
public class AnalysisFactoryTestCase extends ESTestCase {

    // Every tokenizer name registered with Lucene's TokenizerFactory SPI, mapped to
    // the Elasticsearch factory class that exposes it. Names we deliberately do not
    // expose map to Void.class; deprecated ones may map to Deprecated.class.
    static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
        // exposed in ES
        .put("classic", ClassicTokenizerFactory.class)
        .put("edgengram", EdgeNGramTokenizerFactory.class)
        .put("keyword", KeywordTokenizerFactory.class)
        .put("letter", LetterTokenizerFactory.class)
        .put("lowercase", LowerCaseTokenizerFactory.class)
        .put("ngram", NGramTokenizerFactory.class)
        .put("pathhierarchy", PathHierarchyTokenizerFactory.class)
        .put("pattern", PatternTokenizerFactory.class)
        .put("standard", StandardTokenizerFactory.class)
        .put("thai", ThaiTokenizerFactory.class)
        .put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
        .put("whitespace", WhitespaceTokenizerFactory.class)

        // this one "seems to mess up offsets". probably shouldn't be a tokenizer...
        .put("wikipedia", Void.class)
        .immutableMap();

    // Every token filter name registered with Lucene's TokenFilterFactory SPI,
    // mapped to the Elasticsearch factory class that exposes it (Void.class when
    // intentionally not exposed). Note that several Lucene names share a single ES
    // factory (e.g. the language-specific stemmers all map to StemmerTokenFilterFactory).
    static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
        // exposed in ES
        .put("apostrophe", ApostropheFilterFactory.class)
        .put("arabicnormalization", ArabicNormalizationFilterFactory.class)
        .put("arabicstem", ArabicStemTokenFilterFactory.class)
        .put("asciifolding", ASCIIFoldingTokenFilterFactory.class)
        .put("brazilianstem", BrazilianStemTokenFilterFactory.class)
        .put("bulgarianstem", StemmerTokenFilterFactory.class)
        .put("cjkbigram", CJKBigramFilterFactory.class)
        .put("cjkwidth", CJKWidthFilterFactory.class)
        .put("classic", ClassicFilterFactory.class)
        .put("commongrams", CommonGramsTokenFilterFactory.class)
        .put("commongramsquery", CommonGramsTokenFilterFactory.class)
        .put("czechstem", CzechStemTokenFilterFactory.class)
        .put("decimaldigit", DecimalDigitFilterFactory.class)
        .put("delimitedpayload", DelimitedPayloadTokenFilterFactory.class)
        .put("dictionarycompoundword", DictionaryCompoundWordTokenFilterFactory.class)
        .put("edgengram", EdgeNGramTokenFilterFactory.class)
        .put("elision", ElisionTokenFilterFactory.class)
        .put("englishminimalstem", StemmerTokenFilterFactory.class)
        .put("englishpossessive", StemmerTokenFilterFactory.class)
        .put("finnishlightstem", StemmerTokenFilterFactory.class)
        .put("frenchlightstem", StemmerTokenFilterFactory.class)
        .put("frenchminimalstem", StemmerTokenFilterFactory.class)
        .put("galicianminimalstem", StemmerTokenFilterFactory.class)
        .put("galicianstem", StemmerTokenFilterFactory.class)
        .put("germanstem", GermanStemTokenFilterFactory.class)
        .put("germanlightstem", StemmerTokenFilterFactory.class)
        .put("germanminimalstem", StemmerTokenFilterFactory.class)
        .put("germannormalization", GermanNormalizationFilterFactory.class)
        .put("greeklowercase", LowerCaseTokenFilterFactory.class)
        .put("greekstem", StemmerTokenFilterFactory.class)
        .put("hindinormalization", HindiNormalizationFilterFactory.class)
        .put("hindistem", StemmerTokenFilterFactory.class)
        .put("hungarianlightstem", StemmerTokenFilterFactory.class)
        .put("hunspellstem", HunspellTokenFilterFactory.class)
        .put("hyphenationcompoundword", HyphenationCompoundWordTokenFilterFactory.class)
        .put("indicnormalization", IndicNormalizationFilterFactory.class)
        .put("irishlowercase", LowerCaseTokenFilterFactory.class)
        .put("indonesianstem", StemmerTokenFilterFactory.class)
        .put("italianlightstem", StemmerTokenFilterFactory.class)
        .put("keepword", KeepWordFilterFactory.class)
        .put("keywordmarker", KeywordMarkerTokenFilterFactory.class)
        .put("kstem", KStemTokenFilterFactory.class)
        .put("latvianstem", StemmerTokenFilterFactory.class)
        .put("length", LengthTokenFilterFactory.class)
        .put("limittokencount", LimitTokenCountFilterFactory.class)
        .put("lowercase", LowerCaseTokenFilterFactory.class)
        .put("ngram", NGramTokenFilterFactory.class)
        .put("norwegianlightstem", StemmerTokenFilterFactory.class)
        .put("norwegianminimalstem", StemmerTokenFilterFactory.class)
        .put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class)
        .put("patternreplace", PatternReplaceTokenFilterFactory.class)
        .put("persiannormalization", PersianNormalizationFilterFactory.class)
        .put("porterstem", PorterStemTokenFilterFactory.class)
        .put("portuguesestem", StemmerTokenFilterFactory.class)
        .put("portugueselightstem", StemmerTokenFilterFactory.class)
        .put("portugueseminimalstem", StemmerTokenFilterFactory.class)
        .put("reversestring", ReverseTokenFilterFactory.class)
        .put("russianlightstem", StemmerTokenFilterFactory.class)
        .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class)
        .put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class)
        .put("serbiannormalization", SerbianNormalizationFilterFactory.class)
        .put("shingle", ShingleTokenFilterFactory.class)
        .put("snowballporter", SnowballTokenFilterFactory.class)
        .put("soraninormalization", SoraniNormalizationFilterFactory.class)
        .put("soranistem", StemmerTokenFilterFactory.class)
        .put("spanishlightstem", StemmerTokenFilterFactory.class)
        .put("standard", StandardTokenFilterFactory.class)
        .put("stemmeroverride", StemmerOverrideTokenFilterFactory.class)
        .put("stop", StopTokenFilterFactory.class)
        .put("swedishlightstem", StemmerTokenFilterFactory.class)
        .put("synonym", SynonymTokenFilterFactory.class)
        .put("trim", TrimTokenFilterFactory.class)
        .put("truncate", TruncateTokenFilterFactory.class)
        .put("turkishlowercase", LowerCaseTokenFilterFactory.class)
        .put("type", KeepTypesFilterFactory.class)
        .put("uppercase", UpperCaseTokenFilterFactory.class)
        .put("worddelimiter", WordDelimiterTokenFilterFactory.class)

        // TODO: these tokenfilters are not yet exposed: useful?

        // suggest stop
        .put("suggeststop", Void.class)
        // capitalizes tokens
        .put("capitalization", Void.class)
        // like length filter (but codepoints)
        .put("codepointcount", Void.class)
        // puts hyphenated words back together
        .put("hyphenatedwords", Void.class)
        // repeats anything marked as keyword
        .put("keywordrepeat", Void.class)
        // like limittokencount, but by offset
        .put("limittokenoffset", Void.class)
        // like limittokencount, but by position
        .put("limittokenposition", Void.class)
        // ???
        .put("numericpayload", Void.class)
        // removes duplicates at the same position (this should be used by the existing factory)
        .put("removeduplicates", Void.class)
        // ???
        .put("tokenoffsetpayload", Void.class)
        // puts the type into the payload
        .put("typeaspayload", Void.class)
        // fingerprint
        .put("fingerprint", Void.class)
        // for tee-sinks
        .put("daterecognizer", Void.class)
        .immutableMap();

    // Every char filter name registered with Lucene's CharFilterFactory SPI,
    // mapped to the Elasticsearch factory class that exposes it.
    static final Map<String,Class<?>> KNOWN_CHARFILTERS = new MapBuilder<String,Class<?>>()
        // exposed in ES
        .put("htmlstrip", HtmlStripCharFilterFactory.class)
        .put("mapping", MappingCharFilterFactory.class)
        .put("patternreplace", PatternReplaceCharFilterFactory.class)

        // TODO: these charfilters are not yet exposed: useful?
        // handling of zwnj for persian
        .put("persian", Void.class)
        .immutableMap();

    /** Tokenizers under test; plugin test cases override to add their own factories. */
    protected Map<String, Class<?>> getTokenizers() {
        return KNOWN_TOKENIZERS;
    }

    /** Token filters under test; plugin test cases override to add their own factories. */
    protected Map<String, Class<?>> getTokenFilters() {
        return KNOWN_TOKENFILTERS;
    }

    /** Char filters under test; plugin test cases override to add their own factories. */
    protected Map<String, Class<?>> getCharFilters() {
        return KNOWN_CHARFILTERS;
    }

    /** Fails if Lucene registers a tokenizer that {@link #getTokenizers()} does not account for. */
    public void testTokenizers() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenizerFactory.availableTokenizers());
        missing.removeAll(getTokenizers().keySet());
        assertTrue("new tokenizers found, please update KNOWN_TOKENIZERS: " + missing.toString(), missing.isEmpty());
    }

    /** Fails if Lucene registers a char filter that {@link #getCharFilters()} does not account for. */
    public void testCharFilters() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.CharFilterFactory.availableCharFilters());
        missing.removeAll(getCharFilters().keySet());
        assertTrue("new charfilters found, please update KNOWN_CHARFILTERS: " + missing.toString(), missing.isEmpty());
    }

    /** Fails if Lucene registers a token filter that {@link #getTokenFilters()} does not account for. */
    public void testTokenFilters() {
        Set<String> missing = new TreeSet<String>(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters());
        missing.removeAll(getTokenFilters().keySet());
        assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(), missing.isEmpty());
    }

    /**
     * Checks that the set of ES factory classes implementing the ES
     * {@link MultiTermAwareComponent} marker matches exactly the set whose Lucene
     * counterpart implements Lucene's MultiTermAwareComponent.
     */
    public void testMultiTermAware() {
        // expected: ES factory classes whose Lucene factory (looked up by SPI name)
        // is multi-term aware
        Collection<Class<?>> expected = new HashSet<>();
        for (Map.Entry<String, Class<?>> entry : getTokenizers().entrySet()) {
            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(
                    org.apache.lucene.analysis.util.TokenizerFactory.lookupClass(entry.getKey()))) {
                expected.add(entry.getValue());
            }
        }
        for (Map.Entry<String, Class<?>> entry : getTokenFilters().entrySet()) {
            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(
                    org.apache.lucene.analysis.util.TokenFilterFactory.lookupClass(entry.getKey()))) {
                expected.add(entry.getValue());
            }
        }
        for (Map.Entry<String, Class<?>> entry : getCharFilters().entrySet()) {
            if (org.apache.lucene.analysis.util.MultiTermAwareComponent.class.isAssignableFrom(
                    org.apache.lucene.analysis.util.CharFilterFactory.lookupClass(entry.getKey()))) {
                expected.add(entry.getValue());
            }
        }
        // placeholder classes for not-exposed/deprecated factories carry no marker
        expected.remove(Void.class);
        expected.remove(Deprecated.class);

        // actual: ES factory classes that implement the ES marker interface
        Collection<Class<?>> actual = new HashSet<>();
        for (Class<?> clazz : getTokenizers().values()) {
            if (MultiTermAwareComponent.class.isAssignableFrom(clazz)) {
                actual.add(clazz);
            }
        }
        for (Class<?> clazz : getTokenFilters().values()) {
            if (MultiTermAwareComponent.class.isAssignableFrom(clazz)) {
                actual.add(clazz);
            }
        }
        for (Class<?> clazz : getCharFilters().values()) {
            if (MultiTermAwareComponent.class.isAssignableFrom(clazz)) {
                actual.add(clazz);
            }
        }

        // report both directions of any mismatch
        Set<Class<?>> classesMissingMultiTermSupport = new HashSet<>(expected);
        classesMissingMultiTermSupport.removeAll(actual);
        assertTrue("Classes are missing multi-term support: " + classesMissingMultiTermSupport,
                classesMissingMultiTermSupport.isEmpty());

        Set<Class<?>> classesThatShouldNotHaveMultiTermSupport = new HashSet<>(actual);
        classesThatShouldNotHaveMultiTermSupport.removeAll(expected);
        assertTrue("Classes should not have multi-term support: " + classesThatShouldNotHaveMultiTermSupport,
                classesThatShouldNotHaveMultiTermSupport.isEmpty());
    }

}
|
Loading…
Reference in New Issue