Add missing pre built analysis components

closes #6770
Shay Banon 2014-07-07 16:50:34 +02:00
parent bb92d46163
commit 5643563730
4 changed files with 112 additions and 9 deletions

src/main/java/org/elasticsearch/index/analysis/DelimitedPayloadTokenFilterFactory.java

@@ -33,8 +33,8 @@ import org.elasticsearch.index.settings.IndexSettings;
  */
 public class DelimitedPayloadTokenFilterFactory extends AbstractTokenFilterFactory {
 
-    static final char DEFAULT_DELIMITER = '|';
-    static final PayloadEncoder DEFAULT_ENCODER = new FloatEncoder();
+    public static final char DEFAULT_DELIMITER = '|';
+    public static final PayloadEncoder DEFAULT_ENCODER = new FloatEncoder();
 
     static final String ENCODING = "encoding";
     static final String DELIMITER = "delimiter";

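Note: widening these two constants to public is what lets the pre-built DELIMITED_PAYLOAD_FILTER added later in this commit reuse the factory's defaults. A minimal sketch of an out-of-package caller, using only the constructor shown in this commit (the wrapper class and method names are illustrative):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
    import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;

    class DelimitedPayloadSketch {
        // Wrap any token stream using the factory's defaults: '|' as the
        // delimiter and a FloatEncoder for the payload bytes.
        static TokenStream withDefaultPayloads(TokenStream in) {
            return new DelimitedPayloadTokenFilter(in,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
        }
    }
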
src/main/java/org/elasticsearch/index/analysis/LimitTokenCountFilterFactory.java

@@ -33,14 +33,17 @@ import org.elasticsearch.index.settings.IndexSettings;
  */
 public class LimitTokenCountFilterFactory extends AbstractTokenFilterFactory {
 
-    final int maxTokenCount;
-    final boolean consumeAllTokens;
+    public static final int DEFAULT_MAX_TOKEN_COUNT = 1;
+    public static final boolean DEFAULT_CONSUME_ALL_TOKENS = false;
+
+    final int maxTokenCount;
+    final boolean consumeAllTokens;
 
     @Inject
     public LimitTokenCountFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
-        this.maxTokenCount = settings.getAsInt("max_token_count", 1);
-        this.consumeAllTokens = settings.getAsBoolean("consume_all_tokens", false);
+        this.maxTokenCount = settings.getAsInt("max_token_count", DEFAULT_MAX_TOKEN_COUNT);
+        this.consumeAllTokens = settings.getAsBoolean("consume_all_tokens", DEFAULT_CONSUME_ALL_TOKENS);
     }
 
     @Override

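Note: the new constants pin down the out-of-the-box behaviour, so the pre-built LIMIT filter added later in this commit matches an unconfigured limit token filter exactly. A minimal sketch under the same assumptions (illustrative class and method names):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
    import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;

    class LimitDefaultsSketch {
        // With the defaults (max_token_count = 1, consume_all_tokens = false)
        // the wrapped stream is cut off after its first token.
        static TokenStream limitWithDefaults(TokenStream in) {
            return new LimitTokenCountFilter(in,
                    LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
                    LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
        }
    }
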
src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java

@@ -22,32 +22,41 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.apache.lucene.analysis.ar.ArabicStemFilter;
 import org.apache.lucene.analysis.br.BrazilianStemFilter;
+import org.apache.lucene.analysis.cjk.CJKBigramFilter;
+import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
 import org.apache.lucene.analysis.core.UpperCaseFilter;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
+import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.apache.lucene.analysis.de.GermanStemFilter;
 import org.apache.lucene.analysis.en.KStemFilter;
 import org.apache.lucene.analysis.en.PorterStemFilter;
 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.apache.lucene.analysis.fr.FrenchAnalyzer;
 import org.apache.lucene.analysis.fr.FrenchStemFilter;
+import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
+import org.apache.lucene.analysis.in.IndicNormalizationFilter;
 import org.apache.lucene.analysis.miscellaneous.*;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.nl.DutchStemFilter;
+import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
 import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.ClassicFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.tr.ApostropheFilter;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.analysis.util.ElisionFilter;
 import org.elasticsearch.Version;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.*;
+import org.elasticsearch.index.analysis.LimitTokenCountFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
 
 import java.util.Locale;
@@ -76,6 +85,8 @@ public enum PreBuiltTokenFilters {
                        WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null);
             }
         }
     },
+
+
     STOP(CachingStrategy.LUCENE) {
@@ -294,7 +305,86 @@ public enum PreBuiltTokenFilters {
         public TokenStream create(TokenStream tokenStream, Version version) {
             return new ShingleFilter(tokenStream);
         }
-    };
+    },
+
+    GERMAN_NORMALIZATION(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new GermanNormalizationFilter(tokenStream);
+        }
+    },
+
+    HINDI_NORMALIZATION(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new HindiNormalizationFilter(tokenStream);
+        }
+    },
+
+    INDIC_NORMALIZATION(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new IndicNormalizationFilter(tokenStream);
+        }
+    },
+
+    SORANI_NORMALIZATION(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new SoraniNormalizationFilter(tokenStream);
+        }
+    },
+
+    SCANDINAVIAN_NORMALIZATION(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new ScandinavianNormalizationFilter(tokenStream);
+        }
+    },
+
+    SCANDINAVIAN_FOLDING(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new ScandinavianFoldingFilter(tokenStream);
+        }
+    },
+
+    APOSTROPHE(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new ApostropheFilter(tokenStream);
+        }
+    },
+
+    CJK_WIDTH(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new CJKWidthFilter(tokenStream);
+        }
+    },
+
+    CJK_BIGRAM(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new CJKBigramFilter(tokenStream);
+        }
+    },
+
+    DELIMITED_PAYLOAD_FILTER(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new DelimitedPayloadTokenFilter(tokenStream, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER);
+        }
+    },
+
+    LIMIT(CachingStrategy.ONE) {
+        @Override
+        public TokenStream create(TokenStream tokenStream, Version version) {
+            return new LimitTokenCountFilter(tokenStream, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS);
+        }
+    }
+
+    ;
 
     abstract public TokenStream create(TokenStream tokenStream, Version version);

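Note: each constant added above becomes a pre-built token filter that can be referenced by its lower-cased name with no index-level analysis configuration. A minimal sketch of the name-based lookup, assuming the standard enum valueOf convention (the helper class and method are illustrative; create(TokenStream, Version) is the public method declared at the end of the enum):

    import java.util.Locale;

    import org.apache.lucene.analysis.TokenStream;
    import org.elasticsearch.Version;
    import org.elasticsearch.indices.analysis.PreBuiltTokenFilters;

    class PreBuiltFilterLookupSketch {
        // Names such as "german_normalization", "cjk_width" and "limit"
        // map onto the enum constants added in this commit.
        static TokenStream wrap(String name, TokenStream in, Version version) {
            PreBuiltTokenFilters filter =
                    PreBuiltTokenFilters.valueOf(name.toUpperCase(Locale.ROOT));
            return filter.create(in, version);
        }
    }
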
src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java

@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.apache.lucene.analysis.standard.ClassicTokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
+import org.apache.lucene.analysis.th.ThaiTokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.index.analysis.TokenizerFactory;
@@ -118,7 +119,16 @@ public enum PreBuiltTokenizers {
         protected Tokenizer create(Reader reader, Version version) {
             return new PatternTokenizer(reader, Regex.compile("\\W+", null), -1);
         }
-    };
+    },
+
+    THAI(CachingStrategy.ONE) {
+        @Override
+        protected Tokenizer create(Reader reader, Version version) {
+            return new ThaiTokenizer(reader);
+        }
+    }
+
+    ;
 
     abstract protected Tokenizer create(Reader reader, Version version);
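Note: create(Reader, Version) on this enum is protected, so code outside the package would construct the underlying Lucene tokenizer directly; the THAI entry above is a thin wrapper around it. A minimal sketch (illustrative class and method names):

    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.th.ThaiTokenizer;

    class ThaiTokenizerSketch {
        // Same construction as the THAI entry: ThaiTokenizer only needs a
        // Reader over the input text.
        static Tokenizer thai(String text) {
            return new ThaiTokenizer(new StringReader(text));
        }
    }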