From 8257370415a0a2da1c7574baca7faaf4cf9e621c Mon Sep 17 00:00:00 2001
From: Alexander Reelsen
Date: Mon, 18 Nov 2013 16:00:01 +0100
Subject: [PATCH] Refactoring IndicesAnalysisService

Using enums where possible in order to clean up the code in
IndicesAnalysisService. Also introduced a simpler generic caching
mechanism, and tests.
---
 .../PreBuiltCharFilterFactoryFactory.java     |  10 +
 .../PreBuiltTokenFilterFactoryFactory.java    |   9 +
 .../PreBuiltTokenizerFactoryFactory.java      |  11 +
 .../analysis/IndicesAnalysisService.java      | 592 +-----------------
 .../indices/analysis/PreBuiltAnalyzers.java   |  75 +--
 .../analysis/PreBuiltCacheFactory.java        | 114 ++++
 .../indices/analysis/PreBuiltCharFilters.java |  70 +++
 .../analysis/PreBuiltTokenFilters.java        | 312 +++++++++
 .../indices/analysis/PreBuiltTokenizers.java  | 154 +++++
 ...PreBuiltCharFilterFactoryFactoryTests.java |  49 ++
 ...reBuiltTokenFilterFactoryFactoryTests.java |  63 ++
 .../PreBuiltTokenizerFactoryFactoryTests.java |  51 ++
 .../PreBuiltAnalyzerIntegrationTests.java     |  39 +-
 13 files changed, 906 insertions(+), 643 deletions(-)
 create mode 100644 src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java
 create mode 100644 src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java
 create mode 100644 src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java
 create mode 100644 src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java
 create mode 100644 src/test/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactoryTests.java
 create mode 100644 src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactoryTests.java
 create mode 100644 src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactoryTests.java
 rename src/test/java/org/elasticsearch/{index => indices}/analysis/PreBuiltAnalyzerIntegrationTests.java (78%)

diff --git a/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java b/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java
index 091dc1c76df..78c4ea42e92 100644
--- a/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactory.java
@@ -19,7 +19,12 @@
 package org.elasticsearch.index.analysis;
 
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
+
+import java.util.Locale;
 
 public class PreBuiltCharFilterFactoryFactory implements CharFilterFactoryFactory {
 
@@ -31,6 +36,11 @@ public class PreBuiltCharFilterFactoryFactor
 
     @Override
     public CharFilterFactory create(String name, Settings settings) {
+        Version indexVersion = settings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT);
+        if (!Version.CURRENT.equals(indexVersion)) {
+            return PreBuiltCharFilters.valueOf(name.toUpperCase(Locale.ROOT)).getCharFilterFactory(indexVersion);
+        }
+
         return charFilterFactory;
     }
 }
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java b/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java
index 8677a449c63..b55e4ff6557 100644
--- a/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java
+++ 
b/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactory.java @@ -19,7 +19,12 @@ package org.elasticsearch.index.analysis; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; + +import java.util.Locale; public class PreBuiltTokenFilterFactoryFactory implements TokenFilterFactoryFactory { @@ -31,6 +36,10 @@ public class PreBuiltTokenFilterFactoryFactory implements TokenFilterFactoryFact @Override public TokenFilterFactory create(String name, Settings settings) { + Version indexVersion = settings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT); + if (!Version.CURRENT.equals(indexVersion)) { + return PreBuiltTokenFilters.valueOf(name.toUpperCase(Locale.ROOT)).getTokenFilterFactory(indexVersion); + } return tokenFilterFactory; } } \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java b/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java index 1d70411d147..6761dfe5bff 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactory.java @@ -19,7 +19,12 @@ package org.elasticsearch.index.analysis; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.indices.analysis.PreBuiltTokenizers; + +import java.util.Locale; public class PreBuiltTokenizerFactoryFactory implements TokenizerFactoryFactory { @@ -31,6 +36,12 @@ public class PreBuiltTokenizerFactoryFactory implements TokenizerFactoryFactory @Override public TokenizerFactory create(String name, Settings settings) { + Version indexVersion = settings.getAsVersion(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT); + if (!Version.CURRENT.equals(indexVersion)) { + TokenizerFactory versionedTokenizerFactory = PreBuiltTokenizers.valueOf(name.toUpperCase(Locale.ROOT)).getTokenizerFactory(indexVersion); + return versionedTokenizerFactory; + } + return tokenizerFactory; } } \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java b/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java index e62fc9d0dc6..da48d2ff7a0 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java @@ -82,574 +82,40 @@ public class IndicesAnalysisService extends AbstractComponent { public IndicesAnalysisService(Settings settings) { super(settings); + // Analyzers for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) { String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT); analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT))); } - // Base Tokenizers - tokenizerFactories.put("standard", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "standard"; - } - - @Override - public Tokenizer create(Reader reader) { - return new StandardTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("classic", new PreBuiltTokenizerFactoryFactory(new 
TokenizerFactory() { - @Override - public String name() { - return "classic"; - } - - @Override - public Tokenizer create(Reader reader) { - return new ClassicTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("uax_url_email", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "uax_url_email"; - } - - @Override - public Tokenizer create(Reader reader) { - return new UAX29URLEmailTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("path_hierarchy", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "path_hierarchy"; - } - - @Override - public Tokenizer create(Reader reader) { - return new PathHierarchyTokenizer(reader); - } - })); - - tokenizerFactories.put("keyword", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "keyword"; - } - - @Override - public Tokenizer create(Reader reader) { - return new KeywordTokenizer(reader); - } - })); - - tokenizerFactories.put("letter", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "letter"; - } - - @Override - public Tokenizer create(Reader reader) { - return new LetterTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("lowercase", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "lowercase"; - } - - @Override - public Tokenizer create(Reader reader) { - return new LowerCaseTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("whitespace", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "whitespace"; - } - - @Override - public Tokenizer create(Reader reader) { - return new WhitespaceTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "nGram"; - } - - @Override - public Tokenizer create(Reader reader) { - return new NGramTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("ngram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "ngram"; - } - - @Override - public Tokenizer create(Reader reader) { - return new NGramTokenizer(Lucene.ANALYZER_VERSION, reader); - } - })); - - tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "edgeNGram"; - } - - @Override - public Tokenizer create(Reader reader) { - return new EdgeNGramTokenizer(Lucene.ANALYZER_VERSION, reader, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); - } - })); - - tokenizerFactories.put("edge_ngram", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "edge_ngram"; - } - - @Override - public Tokenizer create(Reader reader) { - return new EdgeNGramTokenizer(Lucene.ANALYZER_VERSION, reader, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); - } - })); - - tokenizerFactories.put("pattern", new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() { - @Override - public String name() { - return "pattern"; - } - - @Override - public Tokenizer create(Reader reader) { - return new 
PatternTokenizer(reader, Regex.compile("\\W+", null), -1); - } - })); - - // Token Filters - tokenFilterFactories.put("word_delimiter", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "word_delimiter"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new WordDelimiterFilter(tokenStream, - WordDelimiterFilter.GENERATE_WORD_PARTS | - WordDelimiterFilter.GENERATE_NUMBER_PARTS | - WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | - WordDelimiterFilter.SPLIT_ON_NUMERICS | - WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null); - } - })); - tokenFilterFactories.put("stop", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "stop"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new StopFilter(Lucene.ANALYZER_VERSION, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); - } - })); - - tokenFilterFactories.put("trim", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "trim"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new TrimFilter(Lucene.ANALYZER_VERSION, tokenStream); - } - })); - - tokenFilterFactories.put("reverse", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "reverse"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ReverseStringFilter(Lucene.ANALYZER_VERSION, tokenStream); - } - })); - - tokenFilterFactories.put("asciifolding", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "asciifolding"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ASCIIFoldingFilter(tokenStream); - } - })); - - tokenFilterFactories.put("length", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "length"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new LengthFilter(Lucene.ANALYZER_VERSION, tokenStream, 0, Integer.MAX_VALUE); - } - })); - - tokenFilterFactories.put("common_grams", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "common_grams"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new CommonGramsFilter(Lucene.ANALYZER_VERSION, tokenStream, CharArraySet.EMPTY_SET); - } - })); - - tokenFilterFactories.put("lowercase", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "lowercase"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new LowerCaseFilter(Lucene.ANALYZER_VERSION, tokenStream); - } - })); - - tokenFilterFactories.put("kstem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "kstem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new KStemFilter(tokenStream); - } - })); - - tokenFilterFactories.put("porter_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "porter_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new PorterStemFilter(tokenStream); - } - })); - - tokenFilterFactories.put("standard", new PreBuiltTokenFilterFactoryFactory(new 
TokenFilterFactory() { - @Override - public String name() { - return "standard"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new StandardFilter(Lucene.ANALYZER_VERSION, tokenStream); - } - })); - - tokenFilterFactories.put("classic", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "classic"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ClassicFilter(tokenStream); - } - })); - - tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "nGram"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new NGramTokenFilter(Lucene.ANALYZER_VERSION, tokenStream); - } - })); - - tokenFilterFactories.put("ngram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "ngram"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new NGramTokenFilter(Lucene.ANALYZER_VERSION, tokenStream); - } - })); - - tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "edgeNGram"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new EdgeNGramTokenFilter(Lucene.ANALYZER_VERSION, tokenStream, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); - } - })); - - tokenFilterFactories.put("edge_ngram", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "edge_ngram"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new EdgeNGramTokenFilter(Lucene.ANALYZER_VERSION, tokenStream, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); - } - })); - - tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new ShingleTokenFilterFactory.Factory("shingle"))); - - tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "unique"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new UniqueTokenFilter(tokenStream); - } - })); - - tokenFilterFactories.put("truncate", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "truncate"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new TruncateTokenFilter(tokenStream, 10); - } - })); - - // Extended Token Filters - tokenFilterFactories.put("snowball", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "snowball"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new SnowballFilter(tokenStream, "English"); - } - })); - tokenFilterFactories.put("stemmer", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "stemmer"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new PorterStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("elision", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "elision"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - // LUCENE 4 
UPGRADE: French default for now, make set of articles configurable - return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES); - } - })); - tokenFilterFactories.put("arabic_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "arabic_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ArabicStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("brazilian_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "brazilian_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new BrazilianStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("czech_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "czech_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new CzechStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("dutch_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "dutch_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new DutchStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("french_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "french_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new FrenchStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("german_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "german_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new GermanStemFilter(tokenStream); - } - })); - tokenFilterFactories.put("russian_stem", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "russian_stem"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new SnowballFilter(tokenStream, "Russian"); - } - })); - tokenFilterFactories.put("keyword_repeat", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "keyword_repeat"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new KeywordRepeatFilter(tokenStream); - } - })); - tokenFilterFactories.put("arabic_normalization", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "arabic_normalization"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new ArabicNormalizationFilter(tokenStream); - } - })); - tokenFilterFactories.put("persian_normalization", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - @Override - public String name() { - return "persian_normalization"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new PersianNormalizationFilter(tokenStream); - } - })); - - tokenFilterFactories.put("type_as_payload", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { - - @Override - public String name() { - return "type_as_payload"; - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new TypeAsPayloadTokenFilter(tokenStream); - } - })); - - // Char Filter - 
charFilterFactories.put("html_strip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { - @Override - public String name() { - return "html_strip"; - } - - @Override - public Reader create(Reader tokenStream) { - return new HTMLStripCharFilter(tokenStream); - } - })); - - charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { - @Override - public String name() { - return "htmlStrip"; - } - - @Override - public Reader create(Reader tokenStream) { - return new HTMLStripCharFilter(tokenStream); - } - })); + // Tokenizers + for (PreBuiltTokenizers preBuiltTokenizer : PreBuiltTokenizers.values()) { + String name = preBuiltTokenizer.name().toLowerCase(Locale.ROOT); + tokenizerFactories.put(name, new PreBuiltTokenizerFactoryFactory(preBuiltTokenizer.getTokenizerFactory(Version.CURRENT))); + } + + // Tokenizer aliases + tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.NGRAM.getTokenizerFactory(Version.CURRENT))); + tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT))); + + + // Token filters + for (PreBuiltTokenFilters preBuiltTokenFilter : PreBuiltTokenFilters.values()) { + String name = preBuiltTokenFilter.name().toLowerCase(Locale.ROOT); + tokenFilterFactories.put(name, new PreBuiltTokenFilterFactoryFactory(preBuiltTokenFilter.getTokenFilterFactory(Version.CURRENT))); + } + // Token filter aliases + tokenFilterFactories.put("nGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.NGRAM.getTokenFilterFactory(Version.CURRENT))); + tokenFilterFactories.put("edgeNGram", new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.EDGE_NGRAM.getTokenFilterFactory(Version.CURRENT))); + + + // Char Filters + for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) { + String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT); + charFilterFactories.put(name, new PreBuiltCharFilterFactoryFactory(preBuiltCharFilter.getCharFilterFactory(Version.CURRENT))); + } + // Char filter aliases + charFilterFactories.put("htmlStrip", new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT))); } public boolean hasCharFilter(String name) { diff --git a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java index b299d34d58c..61456e01e5e 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java +++ b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzers.java @@ -18,8 +18,6 @@ */ package org.elasticsearch.indices.analysis; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.ar.ArabicAnalyzer; import org.apache.lucene.analysis.bg.BulgarianAnalyzer; @@ -62,12 +60,10 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer; import org.apache.lucene.analysis.th.ThaiAnalyzer; import org.apache.lucene.analysis.tr.TurkishAnalyzer; import org.apache.lucene.analysis.util.CharArraySet; -import org.elasticsearch.ElasticSearchException; import org.elasticsearch.Version; import org.elasticsearch.common.regex.Regex; import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzer; - -import java.util.Map; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; /** * @@ -373,71 +369,30 @@ public enum 
PreBuiltAnalyzers { } }; - /** - * The strategy of caching the analyzer - * - * ONE Exactly one version is stored. Useful for analyzers which do not store version information - * LUCENE Exactly one version for each lucene version is stored. Useful to prevent different analyzers with the same version - * ELASTICSEARCH Exactly one version per elasticsearch version is stored. Useful if you change an analyzer between elasticsearch releases, when the lucene version does not change - */ - private static enum CachingStrategy { ONE, LUCENE, ELASTICSEARCH }; - - private CachingStrategy cachingStrategy; - protected final Map cachedAnalyzers = Maps.newHashMapWithExpectedSize(2); - - PreBuiltAnalyzers() { - this(CachingStrategy.LUCENE); - } - - PreBuiltAnalyzers(CachingStrategy cachingStrategy) { - this.cachingStrategy = cachingStrategy; - } - abstract protected Analyzer create(Version version); - public Map getCachedAnalyzers() { - return ImmutableMap.copyOf(cachedAnalyzers); + protected final PreBuiltCacheFactory.PreBuiltCache cache; + + PreBuiltAnalyzers() { + this(PreBuiltCacheFactory.CachingStrategy.LUCENE); + } + + PreBuiltAnalyzers(PreBuiltCacheFactory.CachingStrategy cachingStrategy) { + cache = PreBuiltCacheFactory.getCache(cachingStrategy); + } + + PreBuiltCacheFactory.PreBuiltCache getCache() { + return cache; } public synchronized Analyzer getAnalyzer(Version version) { - Analyzer analyzer = getCachedAnalyzer(version); + Analyzer analyzer = cache.get(version); if (analyzer == null) { analyzer = this.create(version); - } - - if (!cachedAnalyzers.containsKey(version)) { - cachedAnalyzers.put(version, analyzer); + cache.put(version, analyzer); } return analyzer; } - private Analyzer getCachedAnalyzer(Version version) { - switch (this.cachingStrategy) { - case ONE: - // method to return the first found analyzer in the cache - if (cachedAnalyzers.size() > 0) { - return (Analyzer) cachedAnalyzers.values().toArray()[0]; - } - break; - case LUCENE: - // find already cached analyzers with the same lucene version - for (Version elasticsearchVersion : cachedAnalyzers.keySet()) { - if (elasticsearchVersion.luceneVersion.equals(version.luceneVersion)) { - return cachedAnalyzers.get(elasticsearchVersion); - } - } - break; - case ELASTICSEARCH: - // check only for the same es version - if (cachedAnalyzers.containsKey(version)) { - return cachedAnalyzers.get(version); - } - break; - default: - throw new ElasticSearchException("No action configured for caching strategy[" + this.cachingStrategy + "]"); - } - - return null; - } } diff --git a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java new file mode 100644 index 00000000000..82c1b6dbf9e --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCacheFactory.java @@ -0,0 +1,114 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.indices.analysis;
+
+import com.google.common.collect.Maps;
+import org.elasticsearch.ElasticSearchException;
+import org.elasticsearch.Version;
+
+import java.util.Map;
+
+/**
+ *
+ */
+public class PreBuiltCacheFactory {
+
+    /**
+     * The strategy of caching the analyzer
+     *
+     * ONE           Exactly one version is stored. Useful for analyzers which do not store version information
+     * LUCENE        Exactly one version for each lucene version is stored. Useful to reuse analyzers across
+     *               elasticsearch versions that are built against the same lucene version
+     * ELASTICSEARCH Exactly one version per elasticsearch version is stored. Useful if you change an analyzer
+     *               between elasticsearch releases, when the lucene version does not change
+     */
+    static enum CachingStrategy { ONE, LUCENE, ELASTICSEARCH };
+
+    public interface PreBuiltCache<T> {
+        T get(Version version);
+        void put(Version version, T t);
+    }
+
+    private PreBuiltCacheFactory() {}
+
+    static <T> PreBuiltCache<T> getCache(CachingStrategy cachingStrategy) {
+        switch (cachingStrategy) {
+            case ONE:
+                return new PreBuiltCacheStrategyOne<T>();
+            case LUCENE:
+                return new PreBuiltCacheStrategyLucene<T>();
+            case ELASTICSEARCH:
+                return new PreBuiltCacheStrategyElasticsearch<T>();
+            default:
+                throw new ElasticSearchException("No action configured for caching strategy[" + cachingStrategy + "]");
+        }
+    }
+
+    /**
+     * This is a pretty simple cache, it only contains one version
+     */
+    private static class PreBuiltCacheStrategyOne<T> implements PreBuiltCache<T> {
+
+        private T model = null;
+
+        @Override
+        public T get(Version version) {
+            return model;
+        }
+
+        @Override
+        public void put(Version version, T model) {
+            this.model = model;
+        }
+    }
+
+    /**
+     * This cache contains one version for each elasticsearch version object
+     */
+    private static class PreBuiltCacheStrategyElasticsearch<T> implements PreBuiltCache<T> {
+
+        Map<Version, T> mapModel = Maps.newHashMapWithExpectedSize(2);
+
+        @Override
+        public T get(Version version) {
+            return mapModel.get(version);
+        }
+
+        @Override
+        public void put(Version version, T model) {
+            mapModel.put(version, model);
+        }
+    }
+
+    /**
+     * This cache uses the lucene version for caching
+     */
+    private static class PreBuiltCacheStrategyLucene<T> implements PreBuiltCache<T> {
+
+        private Map<org.apache.lucene.util.Version, T> mapModel = Maps.newHashMapWithExpectedSize(2);
+
+        @Override
+        public T get(Version version) {
+            return mapModel.get(version.luceneVersion);
+        }
+
+        @Override
+        public void put(org.elasticsearch.Version version, T model) {
+            mapModel.put(version.luceneVersion, model);
+        }
+    }
+}
diff --git a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java
new file mode 100644
index 00000000000..9e4b71038d3
--- /dev/null
+++ b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltCharFilters.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.indices.analysis; + +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; +import org.elasticsearch.Version; +import org.elasticsearch.index.analysis.CharFilterFactory; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; + +import java.io.Reader; +import java.util.Locale; + +/** + * + */ +public enum PreBuiltCharFilters { + + HTML_STRIP(CachingStrategy.ONE) { + @Override + public Reader create(Reader tokenStream, Version version) { + return new HTMLStripCharFilter(tokenStream); + } + }; + + abstract public Reader create(Reader tokenStream, Version version); + + protected final PreBuiltCacheFactory.PreBuiltCache cache; + + PreBuiltCharFilters(CachingStrategy cachingStrategy) { + cache = PreBuiltCacheFactory.getCache(cachingStrategy); + } + + public synchronized CharFilterFactory getCharFilterFactory(final Version version) { + CharFilterFactory charFilterFactory = cache.get(version); + if (charFilterFactory == null) { + final String finalName = name(); + + charFilterFactory = new CharFilterFactory() { + @Override + public String name() { + return finalName.toLowerCase(Locale.ROOT); + } + + @Override + public Reader create(Reader tokenStream) { + return valueOf(finalName).create(tokenStream, version); + } + }; + cache.put(version, charFilterFactory); + } + + return charFilterFactory; + } +} diff --git a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java new file mode 100644 index 00000000000..7e32a519e16 --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenFilters.java @@ -0,0 +1,312 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.indices.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; +import org.apache.lucene.analysis.ar.ArabicStemFilter; +import org.apache.lucene.analysis.br.BrazilianStemFilter; +import org.apache.lucene.analysis.commongrams.CommonGramsFilter; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopAnalyzer; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.cz.CzechStemFilter; +import org.apache.lucene.analysis.de.GermanStemFilter; +import org.apache.lucene.analysis.en.KStemFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.fa.PersianNormalizationFilter; +import org.apache.lucene.analysis.fr.FrenchAnalyzer; +import org.apache.lucene.analysis.fr.FrenchStemFilter; +import org.apache.lucene.analysis.miscellaneous.*; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; +import org.apache.lucene.analysis.ngram.NGramTokenFilter; +import org.apache.lucene.analysis.nl.DutchStemFilter; +import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.shingle.ShingleFilter; +import org.apache.lucene.analysis.snowball.SnowballFilter; +import org.apache.lucene.analysis.standard.ClassicFilter; +import org.apache.lucene.analysis.standard.StandardFilter; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.ElisionFilter; +import org.elasticsearch.Version; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; + +import java.util.Locale; + +/** + * + */ +public enum PreBuiltTokenFilters { + + WORD_DELIMITER(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new WordDelimiterFilter(tokenStream, + WordDelimiterFilter.GENERATE_WORD_PARTS | + WordDelimiterFilter.GENERATE_NUMBER_PARTS | + WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | + WordDelimiterFilter.SPLIT_ON_NUMERICS | + WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null); + } + }, + + STOP(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new StopFilter(version.luceneVersion, tokenStream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + } + }, + + TRIM(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new TrimFilter(version.luceneVersion, tokenStream); + } + }, + + REVERSE(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ReverseStringFilter(version.luceneVersion, tokenStream); + } + }, + + ASCIIFOLDING(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ASCIIFoldingFilter(tokenStream); + } + }, + + LENGTH(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new LengthFilter(version.luceneVersion, tokenStream, 0, Integer.MAX_VALUE); + } + }, + + COMMON_GRAMS(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new CommonGramsFilter(version.luceneVersion, tokenStream, CharArraySet.EMPTY_SET); + } + }, + + 
LOWERCASE(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new LowerCaseFilter(version.luceneVersion, tokenStream); + } + }, + + KSTEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new KStemFilter(tokenStream); + } + }, + + PORTER_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new PorterStemFilter(tokenStream); + } + }, + + STANDARD(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new StandardFilter(version.luceneVersion, tokenStream); + } + }, + + CLASSIC(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ClassicFilter(tokenStream); + } + }, + + NGRAM(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new NGramTokenFilter(version.luceneVersion, tokenStream); + } + }, + + EDGE_NGRAM(CachingStrategy.LUCENE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new EdgeNGramTokenFilter(version.luceneVersion, tokenStream, EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE); + } + }, + + UNIQUE(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new UniqueTokenFilter(tokenStream); + } + }, + + TRUNCATE(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new TruncateTokenFilter(tokenStream, 10); + } + }, + + // Extended Token Filters + SNOWBALL(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new SnowballFilter(tokenStream, "English"); + } + }, + + STEMMER(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new PorterStemFilter(tokenStream); + } + }, + + ELISION(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ElisionFilter(tokenStream, FrenchAnalyzer.DEFAULT_ARTICLES); + } + }, + + ARABIC_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ArabicStemFilter(tokenStream); + } + }, + + BRAZILIAN_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new BrazilianStemFilter(tokenStream); + } + }, + + CZECH_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new CzechStemFilter(tokenStream); + } + }, + + DUTCH_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new DutchStemFilter(tokenStream); + } + }, + + FRENCH_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new FrenchStemFilter(tokenStream); + } + }, + + GERMAN_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new GermanStemFilter(tokenStream); + } + }, + + RUSSIAN_STEM(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new SnowballFilter(tokenStream, 
"Russian"); + } + }, + + KEYWORD_REPEAT(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new KeywordRepeatFilter(tokenStream); + } + }, + + ARABIC_NORMALIZATION(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ArabicNormalizationFilter(tokenStream); + } + }, + + PERSIAN_NORMALIZATION(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new PersianNormalizationFilter(tokenStream); + } + }, + + TYPE_AS_PAYLOAD(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new TypeAsPayloadTokenFilter(tokenStream); + } + }, + + SHINGLE(CachingStrategy.ONE) { + @Override + public TokenStream create(TokenStream tokenStream, Version version) { + return new ShingleFilter(tokenStream); + } + }; + + abstract public TokenStream create(TokenStream tokenStream, Version version); + + protected final PreBuiltCacheFactory.PreBuiltCache cache; + + + PreBuiltTokenFilters(CachingStrategy cachingStrategy) { + cache = PreBuiltCacheFactory.getCache(cachingStrategy); + } + + public synchronized TokenFilterFactory getTokenFilterFactory(final Version version) { + TokenFilterFactory factory = cache.get(version); + if (factory == null) { + final String finalName = name(); + factory = new TokenFilterFactory() { + @Override + public String name() { + return finalName.toLowerCase(Locale.ROOT); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return valueOf(finalName).create(tokenStream, version); + } + }; + cache.put(version, factory); + } + + return factory; + } + +} diff --git a/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java new file mode 100644 index 00000000000..aeeb80f5a9d --- /dev/null +++ b/src/main/java/org/elasticsearch/indices/analysis/PreBuiltTokenizers.java @@ -0,0 +1,154 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.indices.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.core.LowerCaseTokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer; +import org.apache.lucene.analysis.ngram.NGramTokenizer; +import org.apache.lucene.analysis.path.PathHierarchyTokenizer; +import org.apache.lucene.analysis.pattern.PatternTokenizer; +import org.apache.lucene.analysis.standard.ClassicTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer; +import org.elasticsearch.Version; +import org.elasticsearch.common.regex.Regex; +import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy; + +import java.io.Reader; +import java.util.Locale; + +/** + * + */ +public enum PreBuiltTokenizers { + + STANDARD(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new StandardTokenizer(version.luceneVersion, reader); + } + }, + + CLASSIC(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new ClassicTokenizer(version.luceneVersion, reader); + } + }, + + UAX_URL_EMAIL(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new UAX29URLEmailTokenizer(version.luceneVersion, reader); + } + }, + + PATH_HIERARCHY(CachingStrategy.ONE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new PathHierarchyTokenizer(reader); + } + }, + + KEYWORD(CachingStrategy.ONE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new KeywordTokenizer(reader); + } + }, + + LETTER(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new LetterTokenizer(version.luceneVersion, reader); + } + }, + + LOWERCASE(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new LowerCaseTokenizer(version.luceneVersion, reader); + } + }, + + WHITESPACE(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new WhitespaceTokenizer(version.luceneVersion, reader); + } + }, + + NGRAM(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new NGramTokenizer(version.luceneVersion, reader); + } + }, + + EDGE_NGRAM(CachingStrategy.LUCENE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new EdgeNGramTokenizer(version.luceneVersion, reader, EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE); + } + }, + + PATTERN(CachingStrategy.ONE) { + @Override + protected Tokenizer create(Reader reader, Version version) { + return new PatternTokenizer(reader, Regex.compile("\\W+", null), -1); + } + }; + + abstract protected Tokenizer create(Reader reader, Version version); + + protected final PreBuiltCacheFactory.PreBuiltCache cache; + + + PreBuiltTokenizers(CachingStrategy cachingStrategy) { + cache = PreBuiltCacheFactory.getCache(cachingStrategy); + } + + public synchronized TokenizerFactory getTokenizerFactory(final Version version) { + TokenizerFactory 
tokenizerFactory = cache.get(version); + if (tokenizerFactory == null) { + final String finalName = name(); + + tokenizerFactory = new TokenizerFactory() { + @Override + public String name() { + return finalName.toLowerCase(Locale.ROOT); + } + + @Override + public Tokenizer create(Reader reader) { + return valueOf(finalName).create(reader, version); + } + }; + cache.put(version, tokenizerFactory); + } + + return tokenizerFactory; + } + +} diff --git a/src/test/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactoryTests.java new file mode 100644 index 00000000000..e52c4dd10bc --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/PreBuiltCharFilterFactoryFactoryTests.java @@ -0,0 +1,49 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.indices.analysis.PreBuiltCharFilters; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.*; +import static org.hamcrest.MatcherAssert.assertThat; + +/** + * + */ +public class PreBuiltCharFilterFactoryFactoryTests { + + @Test + public void testThatDifferentVersionsCanBeLoaded() { + PreBuiltCharFilterFactoryFactory factory = new PreBuiltCharFilterFactoryFactory(PreBuiltCharFilters.HTML_STRIP.getCharFilterFactory(Version.CURRENT)); + + CharFilterFactory emptySettingsTokenizerFactory = factory.create("html_strip", ImmutableSettings.EMPTY); + CharFilterFactory former090TokenizerFactory = factory.create("html_strip", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_0).build()); + CharFilterFactory former090TokenizerFactoryCopy = factory.create("html_strip", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_0).build()); + CharFilterFactory currentTokenizerFactory = factory.create("html_strip", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build()); + + assertThat(emptySettingsTokenizerFactory, is(currentTokenizerFactory)); + assertThat(emptySettingsTokenizerFactory, is(former090TokenizerFactory)); + assertThat(emptySettingsTokenizerFactory, is(former090TokenizerFactoryCopy)); + } + +} diff --git a/src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactoryTests.java new file mode 100644 index 00000000000..ec8e29daefd --- /dev/null +++ 
b/src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenFilterFactoryFactoryTests.java @@ -0,0 +1,63 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.indices.analysis.PreBuiltTokenFilters; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.*; + +/** + * + */ +public class PreBuiltTokenFilterFactoryFactoryTests extends ElasticsearchTestCase { + + @Test + public void testThatCachingWorksForCachingStrategyOne() { + PreBuiltTokenFilterFactoryFactory factory = new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.WORD_DELIMITER.getTokenFilterFactory(Version.CURRENT)); + + TokenFilterFactory emptySettingsTokenizerFactory = factory.create("word_delimiter", ImmutableSettings.EMPTY); + TokenFilterFactory former090TokenizerFactory = factory.create("word_delimiter", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_1).build()); + TokenFilterFactory former090TokenizerFactoryCopy = factory.create("word_delimiter", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_2).build()); + TokenFilterFactory currentTokenizerFactory = factory.create("word_delimiter", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build()); + + assertThat(emptySettingsTokenizerFactory, is(currentTokenizerFactory)); + assertThat(emptySettingsTokenizerFactory, is(former090TokenizerFactory)); + assertThat(emptySettingsTokenizerFactory, is(former090TokenizerFactoryCopy)); + } + + @Test + public void testThatDifferentVersionsCanBeLoaded() { + PreBuiltTokenFilterFactoryFactory factory = new PreBuiltTokenFilterFactoryFactory(PreBuiltTokenFilters.STOP.getTokenFilterFactory(Version.CURRENT)); + + TokenFilterFactory emptySettingsTokenizerFactory = factory.create("stop", ImmutableSettings.EMPTY); + TokenFilterFactory former090TokenizerFactory = factory.create("stop", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_1).build()); + TokenFilterFactory former090TokenizerFactoryCopy = factory.create("stop", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_2).build()); + TokenFilterFactory currentTokenizerFactory = factory.create("stop", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build()); + + assertThat(emptySettingsTokenizerFactory, is(currentTokenizerFactory)); + assertThat(emptySettingsTokenizerFactory, 
is(not(former090TokenizerFactory))); + assertThat(former090TokenizerFactory, is(former090TokenizerFactoryCopy)); + } + +} diff --git a/src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactoryTests.java b/src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactoryTests.java new file mode 100644 index 00000000000..a94f63d09e2 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/analysis/PreBuiltTokenizerFactoryFactoryTests.java @@ -0,0 +1,51 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.analysis; + +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.indices.analysis.PreBuiltTokenizers; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import static org.hamcrest.CoreMatchers.*; + +/** + * + */ +public class PreBuiltTokenizerFactoryFactoryTests extends ElasticsearchTestCase { + + @Test + public void testThatDifferentVersionsCanBeLoaded() { + PreBuiltTokenizerFactoryFactory factory = new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.STANDARD.getTokenizerFactory(Version.CURRENT)); + + TokenizerFactory emptySettingsTokenizerFactory = factory.create("standard", ImmutableSettings.EMPTY); + // different es versions, same lucene version, thus cached + TokenizerFactory former090TokenizerFactory = factory.create("standard", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_1).build()); + TokenizerFactory former090TokenizerFactoryCopy = factory.create("standard", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_0_90_2).build()); + TokenizerFactory currentTokenizerFactory = factory.create("standard", ImmutableSettings.settingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build()); + + assertThat(emptySettingsTokenizerFactory, is(currentTokenizerFactory)); + assertThat(emptySettingsTokenizerFactory, is(not(former090TokenizerFactory))); + assertThat(emptySettingsTokenizerFactory, is(not(former090TokenizerFactoryCopy))); + assertThat(former090TokenizerFactory, is(former090TokenizerFactoryCopy)); + } + +} diff --git a/src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerIntegrationTests.java b/src/test/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzerIntegrationTests.java similarity index 78% rename from src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerIntegrationTests.java rename to src/test/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzerIntegrationTests.java index 6b620e6d452..8c0695e4acb 100644 --- 
a/src/test/java/org/elasticsearch/index/analysis/PreBuiltAnalyzerIntegrationTests.java
+++ b/src/test/java/org/elasticsearch/indices/analysis/PreBuiltAnalyzerIntegrationTests.java
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.indices.analysis;
 
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
@@ -26,7 +26,6 @@
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
-import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.junit.Test;
 
@@ -36,7 +35,8 @@
 import java.util.Locale;
 import java.util.Map;
 
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
-import static org.hamcrest.Matchers.*;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.notNullValue;
 
 /**
  *
@@ -105,36 +105,35 @@ public class PreBuiltAnalyzerIntegrationTests extends ElasticsearchIntegrationTe
         assertThatAnalyzersHaveBeenLoaded(loadedAnalyzers);
 
         // check that all of the prebuiltanalyzers are still open
-        for (PreBuiltAnalyzers preBuiltAnalyzer : PreBuiltAnalyzers.values()) {
-            assertLuceneAnalyzerIsNotClosed(preBuiltAnalyzer);
-        }
+        assertLuceneAnalyzersAreNotClosed(loadedAnalyzers);
     }
 
     private void assertThatAnalyzersHaveBeenLoaded(Map<PreBuiltAnalyzers, List<Version>> expectedLoadedAnalyzers) {
         for (Map.Entry<PreBuiltAnalyzers, List<Version>> entry : expectedLoadedAnalyzers.entrySet()) {
-            Map<Version, Analyzer> cachedAnalyzers = entry.getKey().getCachedAnalyzers();
-            assertThat(cachedAnalyzers.keySet(), hasItems(entry.getValue().toArray(new Version[]{})));
-            /*for (Version expectedVersion : entry.getValue()) {
-                assertThat(cachedAnalyzers, contains(ex))
+            for (Version version : entry.getValue()) {
+                // if it is not null in the cache, it has been loaded
+                assertThat(entry.getKey().getCache().get(version), is(notNullValue()));
             }
-            */
         }
     }
 
     // the close() method of a lucene analyzer sets the storedValue field to null
     // we simply check this via reflection - ugly but works
-    private void assertLuceneAnalyzerIsNotClosed(PreBuiltAnalyzers preBuiltAnalyzer) throws IllegalAccessException, NoSuchFieldException {
+    private void assertLuceneAnalyzersAreNotClosed(Map<PreBuiltAnalyzers, List<Version>> loadedAnalyzers) throws IllegalAccessException, NoSuchFieldException {
+        for (Map.Entry<PreBuiltAnalyzers, List<Version>> preBuiltAnalyzerEntry : loadedAnalyzers.entrySet()) {
+            PreBuiltAnalyzers preBuiltAnalyzer = preBuiltAnalyzerEntry.getKey();
+            for (Version version : preBuiltAnalyzerEntry.getValue()) {
+                Analyzer analyzer = preBuiltAnalyzerEntry.getKey().getCache().get(version);
 
-        for (Map.Entry<Version, Analyzer> luceneAnalyzerEntry : preBuiltAnalyzer.getCachedAnalyzers().entrySet()) {
-            Field field = getFieldFromClass("storedValue", luceneAnalyzerEntry.getValue());
-            boolean currentAccessible = field.isAccessible();
-            field.setAccessible(true);
-            Object storedValue = field.get(preBuiltAnalyzer.getAnalyzer(luceneAnalyzerEntry.getKey()));
-            field.setAccessible(currentAccessible);
+                Field field = getFieldFromClass("storedValue", analyzer);
+                boolean currentAccessible = field.isAccessible();
+                field.setAccessible(true);
+                Object storedValue = field.get(analyzer);
+                field.setAccessible(currentAccessible);
 
-            assertThat(String.format(Locale.ROOT, "Analyzer %s in version %s seems to be closed", preBuiltAnalyzer.name(), luceneAnalyzerEntry.getKey()), storedValue, is(notNullValue()));
+                assertThat(String.format(Locale.ROOT, "Analyzer %s in version %s seems to be closed", preBuiltAnalyzer.name(), version), storedValue, is(notNullValue()));
+            }
         }
-    }
 
     /**
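
Note (not part of the patch above): a minimal usage sketch of the new enum-based API, using only types and methods introduced in this diff — PreBuiltTokenizers, getTokenizerFactory(Version), and TokenizerFactory.create(Reader). The wrapper class, input string, and printed output are illustrative assumptions; the caching claim for STANDARD mirrors what PreBuiltTokenizerFactoryFactoryTests asserts for Version.V_0_90_1 and Version.V_0_90_2.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.elasticsearch.Version;
    import org.elasticsearch.index.analysis.TokenizerFactory;
    import org.elasticsearch.indices.analysis.PreBuiltTokenizers;

    public class PreBuiltTokenizersUsageSketch {

        public static void main(String[] args) throws Exception {
            // An index created on an older release resolves the tokenizer matching
            // that release's lucene dependency ...
            TokenizerFactory former = PreBuiltTokenizers.STANDARD.getTokenizerFactory(Version.V_0_90_1);

            // ... while a freshly created index gets the current implementation.
            TokenizerFactory current = PreBuiltTokenizers.STANDARD.getTokenizerFactory(Version.CURRENT);

            // STANDARD is declared with CachingStrategy.LUCENE, so elasticsearch
            // versions that share a lucene version share one cached factory instance.
            TokenizerFactory formerCopy = PreBuiltTokenizers.STANDARD.getTokenizerFactory(Version.V_0_90_2);
            System.out.println(former == formerCopy); // expected: true

            Tokenizer tokenizer = current.create(new StringReader("some text to tokenize"));
            System.out.println(tokenizer.getClass().getSimpleName());
        }
    }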
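
Similarly hedged, and also not part of the patch: a sketch of how the three caching strategies in PreBuiltCacheFactory behave. getCache and CachingStrategy are package-private in the diff, so this code would have to live in org.elasticsearch.indices.analysis; the KeywordAnalyzer is an arbitrary stand-in for any cached component, and the version constants are the ones the patch's own tests use.

    package org.elasticsearch.indices.analysis;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.elasticsearch.Version;
    import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
    import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.PreBuiltCache;

    public class PreBuiltCacheSketch {

        public static void main(String[] args) {
            Analyzer analyzer = new KeywordAnalyzer(); // arbitrary component to cache

            // ONE ignores the version: a single instance serves all versions.
            PreBuiltCache<Analyzer> one = PreBuiltCacheFactory.getCache(CachingStrategy.ONE);
            one.put(Version.V_0_90_0, analyzer);
            System.out.println(one.get(Version.CURRENT) == analyzer); // true

            // LUCENE keys by version.luceneVersion: elasticsearch releases that
            // ship the same lucene version share one instance.
            PreBuiltCache<Analyzer> lucene = PreBuiltCacheFactory.getCache(CachingStrategy.LUCENE);
            lucene.put(Version.V_0_90_1, analyzer);
            System.out.println(lucene.get(Version.V_0_90_2) == analyzer); // true while the lucene version matches

            // ELASTICSEARCH keys by the full elasticsearch version.
            PreBuiltCache<Analyzer> es = PreBuiltCacheFactory.getCache(CachingStrategy.ELASTICSEARCH);
            es.put(Version.V_0_90_1, analyzer);
            System.out.println(es.get(Version.V_0_90_2) == analyzer); // false: distinct es versions
        }
    }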