Moved tokenizers to analysis common module (#30538)

The following tokenizers were moved: classic, edge_ngram,
letter, lowercase, ngram, path_hierarchy, pattern, thai, uax_url_email and
whitespace.

The keyword tokenizer factory was left in the server module because
normalizers directly depend on it. This should be addressed in a
follow-up change.

Relates to #23658
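
For orientation, here is a minimal sketch, assembled from the hunks in this diff, of what one moved factory looks like after the change: it lives in org.elasticsearch.analysis.common, imports AbstractTokenizerFactory explicitly from the server's org.elasticsearch.index.analysis package, and its constructor becomes package-private because only CommonAnalysisPlugin instantiates it. The create() body is assumed to mirror the setMaxTokenLength pattern visible in the UAX29URLEmailTokenizerFactory hunk below; treat this as an illustrative sketch, not the verbatim file contents.

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

/**
 * Factory for {@link ClassicTokenizer}, relocated to the analysis-common module.
 */
public class ClassicTokenizerFactory extends AbstractTokenizerFactory {

    private final int maxTokenLength;

    // Package-private: the plugin is the only caller, via
    // tokenizers.put("classic", ClassicTokenizerFactory::new) in CommonAnalysisPlugin.
    ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
    }

    @Override
    public Tokenizer create() {
        ClassicTokenizer tokenizer = new ClassicTokenizer();
        tokenizer.setMaxTokenLength(maxTokenLength);
        return tokenizer;
    }
}

On the plugin side, registration stays a one-liner in CommonAnalysisPlugin#getTokenizers() (tokenizers.put("classic", ClassicTokenizerFactory::new)), with the pre-configured variant registered via PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null); both appear in the hunks below.
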
Martijn van Groningen 2018-05-14 07:55:01 +02:00 committed by GitHub
parent 901436148b
commit 7b95470897
41 changed files with 679 additions and 336 deletions

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import java.util.HashSet;
import java.util.Set;

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
/**
* Factory for {@link ClassicTokenizer}
@ -33,7 +34,7 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {
private final int maxTokenLength;
public ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}

View File

@ -34,9 +34,11 @@ import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.cz.CzechStemFilter;
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
import org.apache.lucene.analysis.de.GermanStemFilter;
@ -58,17 +60,25 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.ClassicFilter;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.apache.lucene.analysis.tr.ApostropheFilter;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@ -169,6 +179,19 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
tokenizers.put("thai", ThaiTokenizerFactory::new);
tokenizers.put("nGram", NGramTokenizerFactory::new);
tokenizers.put("ngram", NGramTokenizerFactory::new);
tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.put("classic", ClassicTokenizerFactory::new);
tokenizers.put("letter", LetterTokenizerFactory::new);
tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.put("pattern", PatternTokenizerFactory::new);
tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
return tokenizers;
}
@ -283,6 +306,16 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
() -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
@Override
public String name() {
@ -294,6 +327,13 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
return new LowerCaseFilter(tokenStream);
}
}));
// Temporary shim for aliases. TODO deprecate after they are moved
tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new, null));
tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
() -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));
return tokenizers;
}
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
@ -25,19 +25,17 @@ import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
import static org.elasticsearch.analysis.common.NGramTokenizerFactory.parseTokenChars;
public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {
private final int minGram;
private final int maxGram;
private final CharMatcher matcher;
public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

View File

@ -17,17 +17,18 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
public class LetterTokenizerFactory extends AbstractTokenizerFactory {
public LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@ -17,17 +17,19 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
@ -25,6 +25,7 @@ import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
@ -83,7 +84,7 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
return builder.build();
}
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
@ -35,7 +36,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
private final int skip;
private final boolean reverse;
public PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
bufferSize = settings.getAsInt("buffer_size", 1024);
String delimiter = settings.get("delimiter");

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
@ -25,6 +25,7 @@ import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
import java.util.regex.Pattern;
@ -33,7 +34,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
private final Pattern pattern;
private final int group;
public PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);

View File

@ -17,20 +17,21 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
/**
* Factory for {@link ThaiTokenizer}
*/
public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
public ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -25,12 +25,13 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
private final int maxTokenLength;
public UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}
@ -41,4 +42,4 @@ public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
tokenizer.setMaxTokenLength(maxTokenLength);
return tokenizer;
}
}
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -26,13 +26,14 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
public class WhitespaceTokenizerFactory extends AbstractTokenizerFactory {
static final String MAX_TOKEN_LENGTH = "max_token_length";
private Integer maxTokenLength;
public WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.elasticsearch.test.ESTestCase;

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
@ -45,6 +46,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
tokenizers.put("thai", ThaiTokenizerFactory.class);
tokenizers.put("ngram", NGramTokenizerFactory.class);
tokenizers.put("edgengram", EdgeNGramTokenizerFactory.class);
tokenizers.put("classic", ClassicTokenizerFactory.class);
tokenizers.put("letter", LetterTokenizerFactory.class);
tokenizers.put("lowercase", LowerCaseTokenizerFactory.class);
tokenizers.put("pathhierarchy", PathHierarchyTokenizerFactory.class);
tokenizers.put("pattern", PatternTokenizerFactory.class);
tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
tokenizers.put("whitespace", WhitespaceTokenizerFactory.class);
return tokenizers;
}
@ -211,10 +222,25 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
@Override
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenizers());
filters.put("keyword", null);
filters.put("lowercase", null);
return filters;
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers());
tokenizers.put("keyword", null);
tokenizers.put("lowercase", null);
tokenizers.put("classic", null);
tokenizers.put("uax_url_email", org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class);
tokenizers.put("path_hierarchy", null);
tokenizers.put("letter", null);
tokenizers.put("whitespace", null);
tokenizers.put("ngram", null);
tokenizers.put("edge_ngram", null);
tokenizers.put("pattern", null);
tokenizers.put("thai", null);
// TODO drop aliases once they are moved to module
tokenizers.put("nGram", tokenizers.get("ngram"));
tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
return tokenizers;
}
/**

View File

@ -45,7 +45,7 @@ public class CommonGramsTokenFilterFactoryTests extends ESTokenStreamTestCase {
.build();
try {
AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
Assert.fail("[common_words] or [common_words_path] is set");
} catch (IllegalArgumentException e) {
} catch (IOException e) {

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.query;
package org.elasticsearch.analysis.common;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
@ -29,12 +29,22 @@ import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.QueryStringQueryBuilder;
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
import org.elasticsearch.index.query.SimpleQueryStringFlag;
import org.elasticsearch.index.search.MatchQuery;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.junit.After;
import org.junit.Before;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import static org.hamcrest.Matchers.equalTo;
@ -49,6 +59,11 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
private static Query expectedQueryWithUnigram;
private static Query expectedPhraseQueryWithUnigram;
@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
return Collections.singleton(CommonAnalysisPlugin.class);
}
@Before
public void setup() {
Settings settings = Settings.builder()
@ -150,42 +165,42 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
public void testMatchPhraseQuery() throws IOException {
MatchPhraseQueryBuilder builder =
new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz");
Query query = builder.doToQuery(shardContext);
Query query = builder.toQuery(shardContext);
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
builder =
new MatchPhraseQueryBuilder("text_shingle", "foo bar baz biz");
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQuery, equalTo(query));
}
public void testMatchQuery() throws IOException {
MatchQueryBuilder builder =
new MatchQueryBuilder("text_shingle_unigram", "foo bar baz");
Query query = builder.doToQuery(shardContext);
Query query = builder.toQuery(shardContext);
assertThat(expectedQueryWithUnigram, equalTo(query));
builder = new MatchQueryBuilder("text_shingle", "foo bar baz biz");
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedQuery, equalTo(query));
}
public void testMultiMatchQuery() throws IOException {
MultiMatchQueryBuilder builder = new MultiMatchQueryBuilder("foo bar baz",
"text_shingle_unigram");
Query query = builder.doToQuery(shardContext);
Query query = builder.toQuery(shardContext);
assertThat(expectedQueryWithUnigram, equalTo(query));
builder.type(MatchQuery.Type.PHRASE);
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
builder = new MultiMatchQueryBuilder("foo bar baz biz", "text_shingle");
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedQuery, equalTo(query));
builder.type(MatchQuery.Type.PHRASE);
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQuery, equalTo(query));
}
@ -193,47 +208,47 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
SimpleQueryStringBuilder builder = new SimpleQueryStringBuilder("foo bar baz");
builder.field("text_shingle_unigram");
builder.flags(SimpleQueryStringFlag.NONE);
Query query = builder.doToQuery(shardContext);
Query query = builder.toQuery(shardContext);
assertThat(expectedQueryWithUnigram, equalTo(query));
builder = new SimpleQueryStringBuilder("\"foo bar baz\"");
builder.field("text_shingle_unigram");
builder.flags(SimpleQueryStringFlag.PHRASE);
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
builder = new SimpleQueryStringBuilder("foo bar baz biz");
builder.field("text_shingle");
builder.flags(SimpleQueryStringFlag.NONE);
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedQuery, equalTo(query));
builder = new SimpleQueryStringBuilder("\"foo bar baz biz\"");
builder.field("text_shingle");
builder.flags(SimpleQueryStringFlag.PHRASE);
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQuery, equalTo(query));
}
public void testQueryString() throws IOException {
QueryStringQueryBuilder builder = new QueryStringQueryBuilder("foo bar baz");
builder.field("text_shingle_unigram");
Query query = builder.doToQuery(shardContext);
Query query = builder.toQuery(shardContext);
assertThat(expectedQueryWithUnigram, equalTo(query));
builder = new QueryStringQueryBuilder("\"foo bar baz\"");
builder.field("text_shingle_unigram");
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
builder = new QueryStringQueryBuilder("foo bar baz biz");
builder.field("text_shingle");
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedQuery, equalTo(query));
builder = new QueryStringQueryBuilder("\"foo bar baz biz\"");
builder.field("text_shingle");
query = builder.doToQuery(shardContext);
query = builder.toQuery(shardContext);
assertThat(expectedPhraseQuery, equalTo(query));
}
}

View File

@ -30,8 +30,6 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.Settings.Builder;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;

View File

@ -17,15 +17,13 @@
* under the License.
*/
package org.elasticsearch.index.analysis.synonyms;
package org.elasticsearch.analysis.common;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@ -44,7 +42,6 @@ import static org.hamcrest.Matchers.instanceOf;
import static org.hamcrest.Matchers.startsWith;
public class SynonymsAnalysisTests extends ESTestCase {
protected final Logger logger = Loggers.getLogger(getClass());
private IndexAnalyzers indexAnalyzers;
public void testSynonymsAnalysis() throws IOException {
@ -56,14 +53,14 @@ public class SynonymsAnalysisTests extends ESTestCase {
Files.copy(synonyms, config.resolve("synonyms.txt"));
Files.copy(synonymsWordnet, config.resolve("synonyms_wordnet.txt"));
String json = "/org/elasticsearch/index/analysis/synonyms/synonyms.json";
String json = "/org/elasticsearch/analysis/common/synonyms.json";
Settings settings = Settings.builder().
loadFromStream(json, getClass().getResourceAsStream(json), false)
.put(Environment.PATH_HOME_SETTING.getKey(), home)
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
match("synonymAnalyzer", "kimchy is the dude abides", "shay is the elasticsearch man!");
match("synonymAnalyzer_file", "kimchy is the dude abides", "shay is the elasticsearch man!");
@ -91,7 +88,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try {
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));
@ -112,7 +109,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
try {
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
fail("fail! due to synonym word deleted by analyzer");
} catch (Exception e) {
assertThat(e, instanceOf(IllegalArgumentException.class));

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;

View File

@ -70,3 +70,374 @@
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenizer.tokens.1.token: bar }
---
"thai_tokenizer":
- do:
indices.analyze:
body:
text: "ภาษาไทย"
explain: true
tokenizer:
type: thai
- length: { detail.tokenizer.tokens: 2 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: ภาษา }
- match: { detail.tokenizer.tokens.1.token: ไทย }
- do:
indices.analyze:
body:
text: "ภาษาไทย"
explain: true
tokenizer: thai
- length: { detail.tokenizer.tokens: 2 }
- match: { detail.tokenizer.name: thai }
- match: { detail.tokenizer.tokens.0.token: ภาษา }
- match: { detail.tokenizer.tokens.1.token: ไทย }
---
"ngram":
- do:
indices.analyze:
body:
text: "foobar"
explain: true
tokenizer:
type: ngram
min_gram: 3
max_gram: 3
- length: { detail.tokenizer.tokens: 4 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenizer.tokens.1.token: oob }
- match: { detail.tokenizer.tokens.2.token: oba }
- match: { detail.tokenizer.tokens.3.token: bar }
- do:
indices.analyze:
body:
text: "foobar"
explain: true
tokenizer:
type: nGram
min_gram: 3
max_gram: 3
- length: { detail.tokenizer.tokens: 4 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenizer.tokens.1.token: oob }
- match: { detail.tokenizer.tokens.2.token: oba }
- match: { detail.tokenizer.tokens.3.token: bar }
- do:
indices.analyze:
body:
text: "foo"
explain: true
tokenizer: ngram
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: ngram }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
- match: { detail.tokenizer.tokens.2.token: o }
- match: { detail.tokenizer.tokens.3.token: oo }
- match: { detail.tokenizer.tokens.4.token: o }
- do:
indices.analyze:
body:
text: "foo"
explain: true
tokenizer: nGram
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: nGram }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
- match: { detail.tokenizer.tokens.2.token: o }
- match: { detail.tokenizer.tokens.3.token: oo }
- match: { detail.tokenizer.tokens.4.token: o }
---
"edge_ngram":
- do:
indices.analyze:
body:
text: "foo"
explain: true
tokenizer:
type: edge_ngram
min_gram: 1
max_gram: 3
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
- match: { detail.tokenizer.tokens.2.token: foo }
- do:
indices.analyze:
body:
text: "foo"
explain: true
tokenizer:
type: edgeNGram
min_gram: 1
max_gram: 3
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
- match: { detail.tokenizer.tokens.2.token: foo }
- do:
indices.analyze:
body:
text: "foo"
explain: true
tokenizer: edge_ngram
- length: { detail.tokenizer.tokens: 2 }
- match: { detail.tokenizer.name: edge_ngram }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
- do:
indices.analyze:
body:
text: "foo"
explain: true
tokenizer: edgeNGram
- length: { detail.tokenizer.tokens: 2 }
- match: { detail.tokenizer.name: edgeNGram }
- match: { detail.tokenizer.tokens.0.token: f }
- match: { detail.tokenizer.tokens.1.token: fo }
---
"classic":
- do:
indices.analyze:
body:
text: "Brown-Foxes don't jump."
explain: true
tokenizer:
type: classic
- length: { detail.tokenizer.tokens: 4 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: Brown }
- match: { detail.tokenizer.tokens.1.token: Foxes }
- match: { detail.tokenizer.tokens.2.token: don't }
- match: { detail.tokenizer.tokens.3.token: jump }
- do:
indices.analyze:
body:
text: "Brown-Foxes don't jump."
explain: true
tokenizer: classic
- length: { detail.tokenizer.tokens: 4 }
- match: { detail.tokenizer.name: classic }
- match: { detail.tokenizer.tokens.0.token: Brown }
- match: { detail.tokenizer.tokens.1.token: Foxes }
- match: { detail.tokenizer.tokens.2.token: don't }
- match: { detail.tokenizer.tokens.3.token: jump }
---
"letter":
- do:
indices.analyze:
body:
text: "Brown-Foxes don't jump."
explain: true
tokenizer:
type: letter
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: Brown }
- match: { detail.tokenizer.tokens.1.token: Foxes }
- match: { detail.tokenizer.tokens.2.token: don }
- match: { detail.tokenizer.tokens.3.token: t }
- match: { detail.tokenizer.tokens.4.token: jump }
- do:
indices.analyze:
body:
text: "Brown-Foxes don't jump."
explain: true
tokenizer: letter
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: letter }
- match: { detail.tokenizer.tokens.0.token: Brown }
- match: { detail.tokenizer.tokens.1.token: Foxes }
- match: { detail.tokenizer.tokens.2.token: don }
- match: { detail.tokenizer.tokens.3.token: t }
- match: { detail.tokenizer.tokens.4.token: jump }
---
"lowercase":
- do:
indices.analyze:
body:
text: "Brown-Foxes don't jump."
explain: true
tokenizer:
type: lowercase
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: brown }
- match: { detail.tokenizer.tokens.1.token: foxes }
- match: { detail.tokenizer.tokens.2.token: don }
- match: { detail.tokenizer.tokens.3.token: t }
- match: { detail.tokenizer.tokens.4.token: jump }
- do:
indices.analyze:
body:
text: "Brown-Foxes don't jump."
explain: true
tokenizer: lowercase
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: lowercase }
- match: { detail.tokenizer.tokens.0.token: brown }
- match: { detail.tokenizer.tokens.1.token: foxes }
- match: { detail.tokenizer.tokens.2.token: don }
- match: { detail.tokenizer.tokens.3.token: t }
- match: { detail.tokenizer.tokens.4.token: jump }
---
"path_hierarchy":
- do:
indices.analyze:
body:
text: "a/b/c"
explain: true
tokenizer:
type: path_hierarchy
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: a }
- match: { detail.tokenizer.tokens.1.token: a/b }
- match: { detail.tokenizer.tokens.2.token: a/b/c }
- do:
indices.analyze:
body:
text: "a/b/c"
explain: true
tokenizer:
type: PathHierarchy
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: a }
- match: { detail.tokenizer.tokens.1.token: a/b }
- match: { detail.tokenizer.tokens.2.token: a/b/c }
- do:
indices.analyze:
body:
text: "a/b/c"
explain: true
tokenizer: path_hierarchy
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: path_hierarchy }
- match: { detail.tokenizer.tokens.0.token: a }
- match: { detail.tokenizer.tokens.1.token: a/b }
- match: { detail.tokenizer.tokens.2.token: a/b/c }
- do:
indices.analyze:
body:
text: "a/b/c"
explain: true
tokenizer: PathHierarchy
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: PathHierarchy }
- match: { detail.tokenizer.tokens.0.token: a }
- match: { detail.tokenizer.tokens.1.token: a/b }
- match: { detail.tokenizer.tokens.2.token: a/b/c }
---
"pattern":
- do:
indices.analyze:
body:
text: "split by whitespace by default"
explain: true
tokenizer:
type: pattern
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: split }
- match: { detail.tokenizer.tokens.1.token: by }
- match: { detail.tokenizer.tokens.2.token: whitespace }
- match: { detail.tokenizer.tokens.3.token: by }
- match: { detail.tokenizer.tokens.4.token: default }
- do:
indices.analyze:
body:
text: "split by whitespace by default"
explain: true
tokenizer: pattern
- length: { detail.tokenizer.tokens: 5 }
- match: { detail.tokenizer.name: pattern }
- match: { detail.tokenizer.tokens.0.token: split }
- match: { detail.tokenizer.tokens.1.token: by }
- match: { detail.tokenizer.tokens.2.token: whitespace }
- match: { detail.tokenizer.tokens.3.token: by }
- match: { detail.tokenizer.tokens.4.token: default }
---
"uax_url_email":
- do:
indices.analyze:
body:
text: "Email me at john.smith@global-international.com"
explain: true
tokenizer:
type: uax_url_email
- length: { detail.tokenizer.tokens: 4 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: Email }
- match: { detail.tokenizer.tokens.1.token: me }
- match: { detail.tokenizer.tokens.2.token: at }
- match: { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
- do:
indices.analyze:
body:
text: "Email me at john.smith@global-international.com"
explain: true
tokenizer: uax_url_email
- length: { detail.tokenizer.tokens: 4 }
- match: { detail.tokenizer.name: uax_url_email }
- match: { detail.tokenizer.tokens.0.token: Email }
- match: { detail.tokenizer.tokens.1.token: me }
- match: { detail.tokenizer.tokens.2.token: at }
- match: { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
---
"whitespace":
- do:
indices.analyze:
body:
text: "split by whitespace"
explain: true
tokenizer:
type: whitespace
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: split }
- match: { detail.tokenizer.tokens.1.token: by }
- match: { detail.tokenizer.tokens.2.token: whitespace }
- do:
indices.analyze:
body:
text: "split by whitespace"
explain: true
tokenizer: whitespace
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: whitespace }
- match: { detail.tokenizer.tokens.0.token: split }
- match: { detail.tokenizer.tokens.1.token: by }
- match: { detail.tokenizer.tokens.2.token: whitespace }

View File

@ -67,3 +67,33 @@
text: "<html>foo</html>"
- length: { tokens: 1 }
- match: { tokens.0.token: "\nfoo\n" }
---
"Synonym filter with tokenizer":
- do:
indices.create:
index: test_synonym
body:
settings:
index:
analysis:
tokenizer:
trigram:
type: nGram
min_gram: 3
max_gram: 3
filter:
synonym:
type: synonym
synonyms: ["kimchy => shay"]
- do:
indices.analyze:
index: test_synonym
body:
tokenizer: trigram
filter: [synonym]
text: kimchy
- length: { tokens: 2 }
- match: { tokens.0.token: sha }
- match: { tokens.1.token: hay }

View File

@ -39,3 +39,97 @@
text:
query: foa
- match: {hits.total: 1}
---
"testNGramCopyField":
- do:
indices.create:
index: test
body:
settings:
number_of_shards: 1
number_of_replicas: 0
max_ngram_diff: 9
analysis:
analyzer:
my_ngram_analyzer:
tokenizer: my_ngram_tokenizer
tokenizer:
my_ngram_tokenizer:
type: ngram
min_gram: 1
max_gram: 10
token_chars: []
mappings:
doc:
properties:
origin:
type: text
copy_to: meta
meta:
type: text
analyzer: my_ngram_analyzer
- do:
index:
index: test
type: doc
id: 1
body: { "origin": "C.A1234.5678" }
refresh: true
- do:
search:
body:
query:
match:
meta:
query: 1234
- match: {hits.total: 1}
- do:
search:
body:
query:
match:
meta:
query: 1234.56
- match: {hits.total: 1}
- do:
search:
body:
query:
match:
meta:
query: A1234
- match: {hits.total: 1}
- do:
search:
body:
query:
term:
meta:
value: a1234
- match: {hits.total: 0}
- do:
search:
body:
query:
match:
meta:
query: A1234
analyzer: my_ngram_analyzer
- match: {hits.total: 1}
- do:
search:
body:
query:
match:
meta:
query: a1234
analyzer: my_ngram_analyzer
- match: {hits.total: 1}

View File

@ -76,36 +76,6 @@
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
- match: { detail.tokenfilters.0.tokens.0.token: bar }
---
"Synonym filter with tokenizer":
- do:
indices.create:
index: test_synonym
body:
settings:
index:
analysis:
tokenizer:
trigram:
type: nGram
min_gram: 3
max_gram: 3
filter:
synonym:
type: synonym
synonyms: ["kimchy => shay"]
- do:
indices.analyze:
index: test_synonym
body:
tokenizer: trigram
filter: [synonym]
text: kimchy
- length: { tokens: 2 }
- match: { tokens.0.token: sha }
- match: { tokens.1.token: hay }
---
"Custom normalizer in request":
- do:

View File

@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
public class KeywordTokenizerFactory extends AbstractTokenizerFactory {

View File

@ -39,11 +39,9 @@ import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
@ -60,14 +58,9 @@ import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
import org.elasticsearch.index.analysis.PatternAnalyzerProvider;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
@ -88,13 +81,10 @@ import org.elasticsearch.index.analysis.StopAnalyzerProvider;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import java.io.IOException;
@ -223,36 +213,19 @@ public final class AnalysisModule {
}
preConfiguredTokenizers.register(name, preConfigured);
}
// Temporary shim for aliases. TODO deprecate after they are moved
preConfiguredTokenizers.register("nGram", preConfiguredTokenizers.getRegistry().get("ngram"));
preConfiguredTokenizers.register("edgeNGram", preConfiguredTokenizers.getRegistry().get("edge_ngram"));
preConfiguredTokenizers.register("PathHierarchy", preConfiguredTokenizers.getRegistry().get("path_hierarchy"));
for (AnalysisPlugin plugin: plugins) {
for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
}
}
return unmodifiableMap(preConfiguredTokenizers.getRegistry());
}
private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
tokenizers.register("standard", StandardTokenizerFactory::new);
tokenizers.register("uax_url_email", UAX29URLEmailTokenizerFactory::new);
tokenizers.register("path_hierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.register("PathHierarchy", PathHierarchyTokenizerFactory::new);
tokenizers.register("keyword", KeywordTokenizerFactory::new);
tokenizers.register("letter", LetterTokenizerFactory::new);
tokenizers.register("lowercase", LowerCaseTokenizerFactory::new);
tokenizers.register("whitespace", WhitespaceTokenizerFactory::new);
tokenizers.register("nGram", NGramTokenizerFactory::new);
tokenizers.register("ngram", NGramTokenizerFactory::new);
tokenizers.register("edgeNGram", EdgeNGramTokenizerFactory::new);
tokenizers.register("edge_ngram", EdgeNGramTokenizerFactory::new);
tokenizers.register("pattern", PatternTokenizerFactory::new);
tokenizers.register("classic", ClassicTokenizerFactory::new);
tokenizers.register("thai", ThaiTokenizerFactory::new);
tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
return tokenizers;
}

View File

@ -19,18 +19,8 @@
package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.th.ThaiTokenizer;
import org.elasticsearch.Version;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
@ -41,69 +31,6 @@ public enum PreBuiltTokenizers {
protected Tokenizer create(Version version) {
return new StandardTokenizer();
}
},
CLASSIC(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new ClassicTokenizer();
}
},
UAX_URL_EMAIL(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new UAX29URLEmailTokenizer();
}
},
PATH_HIERARCHY(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new PathHierarchyTokenizer();
}
},
LETTER(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new LetterTokenizer();
}
},
WHITESPACE(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new WhitespaceTokenizer();
}
},
NGRAM(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new NGramTokenizer();
}
},
EDGE_NGRAM(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
}
},
PATTERN(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new PatternTokenizer(Regex.compile("\\W+", null), -1);
}
},
THAI(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new ThaiTokenizer();
}
}
;

View File

@ -287,7 +287,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
e = expectThrows(IllegalArgumentException.class,
() -> TransportAnalyzeAction.analyze(
new AnalyzeRequest()
.tokenizer("whitespace")
.tokenizer("standard")
.addTokenFilter("foobar")
.text("the qu1ck brown fox"),
"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
@ -300,7 +300,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
e = expectThrows(IllegalArgumentException.class,
() -> TransportAnalyzeAction.analyze(
new AnalyzeRequest()
.tokenizer("whitespace")
.tokenizer("standard")
.addTokenFilter("lowercase")
.addCharFilter("foobar")
.text("the qu1ck brown fox"),
@ -322,7 +322,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
public void testNonPreBuildTokenFilter() throws IOException {
AnalyzeRequest request = new AnalyzeRequest();
request.tokenizer("whitespace");
request.tokenizer("standard");
request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
request.text("the quick brown fox");
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);

View File

@ -188,7 +188,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.addAlias(new Alias("alias"))
.setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
@ -260,7 +260,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping)
.setSettings(Settings.builder()
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
@ -394,7 +394,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.addMapping("type1", mapping)
.setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
ensureGreen();

View File

@ -18,6 +18,7 @@
*/
package org.elasticsearch.action.termvectors;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.payloads.FloatEncoder;
@ -35,6 +36,7 @@ import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
@ -93,6 +95,12 @@ public class GetTermVectorsTests extends ESSingleNodeTestCase {
});
}
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
return Collections.singletonList(PreConfiguredTokenizer.singleton("mock-whitespace",
() -> new MockTokenizer(MockTokenizer.WHITESPACE, false), null));
}
// Based on DelimitedPayloadTokenFilter:
final class MockPayloadTokenFilter extends TokenFilter {
private final char delimiter;
@ -151,7 +159,7 @@ public class GetTermVectorsTests extends ESSingleNodeTestCase {
.startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
.field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
Settings setting = Settings.builder()
.put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.payload_test.tokenizer", "mock-whitespace")
.putList("index.analysis.analyzer.payload_test.filter", "my_delimited_payload")
.put("index.analysis.filter.my_delimited_payload.delimiter", delimiter)
.put("index.analysis.filter.my_delimited_payload.encoding", encodingString)

View File

@ -35,10 +35,8 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.startsWith;
public class AnalyzeActionIT extends ESIntegTestCase {
public void testSimpleAnalyzerTests() throws Exception {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
@ -333,14 +331,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
AnalyzeResponse analyzeResponse = client().admin().indices()
.prepareAnalyze()
.setText("Foo buzz test")
.setTokenizer("whitespace")
.setTokenizer("standard")
.addTokenFilter("lowercase")
.addTokenFilter(stopFilterSettings)
.setExplain(true)
.get();
//tokenizer
assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("whitespace"));
assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("standard"));
assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("Foo"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
@ -393,41 +391,6 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getPositionLength(), equalTo(1));
}
public void testCustomTokenizerInRequest() throws Exception {
Map<String, Object> tokenizerSettings = new HashMap<>();
tokenizerSettings.put("type", "nGram");
tokenizerSettings.put("min_gram", 2);
tokenizerSettings.put("max_gram", 2);
AnalyzeResponse analyzeResponse = client().admin().indices()
.prepareAnalyze()
.setText("good")
.setTokenizer(tokenizerSettings)
.setExplain(true)
.get();
//tokenizer
assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("_anonymous_tokenizer"));
assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("go"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getEndOffset(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPosition(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getTerm(), equalTo("oo"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getStartOffset(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getEndOffset(), equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPosition(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPositionLength(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getTerm(), equalTo("od"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getStartOffset(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getEndOffset(), equalTo(4));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPosition(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPositionLength(), equalTo(1));
}
public void testAnalyzeKeywordField() throws IOException {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("test", "keyword", "type=keyword"));
ensureGreen("test");


@ -677,7 +677,7 @@ public class SimpleIndexTemplateIT extends ESIntegTestCase {
" \"analysis\" : {\n" +
" \"analyzer\" : {\n" +
" \"custom_1\" : {\n" +
" \"tokenizer\" : \"whitespace\"\n" +
" \"tokenizer\" : \"standard\"\n" +
" }\n" +
" }\n" +
" }\n" +


@ -1359,7 +1359,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
public void testPhrasePrefix() throws IOException {
Builder builder = Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.synonym.tokenizer", "whitespace")
.put("index.analysis.analyzer.synonym.tokenizer", "standard")
.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
.put("index.analysis.filter.synonym.type", "synonym")
.putList("index.analysis.filter.synonym.synonyms", "quick => fast");
@ -2804,7 +2804,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
public void testSynonyms() throws IOException {
Builder builder = Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.synonym.tokenizer", "whitespace")
.put("index.analysis.analyzer.synonym.tokenizer", "standard")
.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
.put("index.analysis.filter.synonym.type", "synonym")
.putList("index.analysis.filter.synonym.synonyms", "fast,quick");


@ -156,7 +156,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
public void testMoreDocs() throws Exception {
Builder builder = Settings.builder();
builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
builder.put("index.analysis.filter.synonym.type", "synonym");
builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@ -234,7 +234,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
// Tests a rescore window smaller than number of hits:
public void testSmallRescoreWindow() throws Exception {
Builder builder = Settings.builder();
builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
builder.put("index.analysis.filter.synonym.type", "synonym");
builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@ -306,7 +306,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
// Tests a rescorer that penalizes the scores:
public void testRescorerMadeScoresWorse() throws Exception {
Builder builder = Settings.builder();
builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
builder.put("index.analysis.filter.synonym.type", "synonym");
builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");


@ -82,7 +82,7 @@ public class MultiMatchQueryIT extends ESIntegTestCase {
.put("index.analysis.analyzer.perfect_match.tokenizer", "keyword")
.put("index.analysis.analyzer.perfect_match.filter", "lowercase")
.put("index.analysis.analyzer.category.type", "custom")
.put("index.analysis.analyzer.category.tokenizer", "whitespace")
.put("index.analysis.analyzer.category.tokenizer", "standard")
.put("index.analysis.analyzer.category.filter", "lowercase")
);
assertAcked(builder.addMapping("test", createMapping()));


@ -20,7 +20,6 @@
package org.elasticsearch.search.query;
import org.apache.lucene.util.English;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchPhaseExecutionException;
@ -30,7 +29,6 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@ -351,7 +349,7 @@ public class SearchQueryIT extends ESIntegTestCase {
.put(SETTING_NUMBER_OF_SHARDS,1)
.put("index.analysis.filter.syns.type","synonym")
.putList("index.analysis.filter.syns.synonyms","quick,fast")
.put("index.analysis.analyzer.syns.tokenizer","whitespace")
.put("index.analysis.analyzer.syns.tokenizer","standard")
.put("index.analysis.analyzer.syns.filter","syns")
)
.addMapping("type1", "field1", "type=text,analyzer=syns", "field2", "type=text,analyzer=syns"));
@ -1764,56 +1762,6 @@ public class SearchQueryIT extends ESIntegTestCase {
assertHitCount(client().prepareSearch().setQuery(matchAllQuery()).get(), 1L);
}
// see #5120
public void testNGramCopyField() {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")
.put("index.analysis.tokenizer.my_ngram_tokenizer.min_gram", "1")
.put("index.analysis.tokenizer.my_ngram_tokenizer.max_gram", "10")
.putList("index.analysis.tokenizer.my_ngram_tokenizer.token_chars", new String[0]));
assertAcked(builder.addMapping("test", "origin", "type=text,copy_to=meta", "meta", "type=text,analyzer=my_ngram_analyzer"));
// we only have ngrams as the index analyzer so searches will get standard analyzer
client().prepareIndex("test", "test", "1").setSource("origin", "C.A1234.5678")
.setRefreshPolicy(IMMEDIATE)
.get();
SearchResponse searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "1234"))
.get();
assertHitCount(searchResponse, 1L);
searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "1234.56"))
.get();
assertHitCount(searchResponse, 1L);
searchResponse = client().prepareSearch("test")
.setQuery(termQuery("meta", "A1234"))
.get();
assertHitCount(searchResponse, 1L);
searchResponse = client().prepareSearch("test")
.setQuery(termQuery("meta", "a1234"))
.get();
assertHitCount(searchResponse, 0L); // it's upper case
searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "A1234").analyzer("my_ngram_analyzer"))
.get(); // force ngram analyzer
assertHitCount(searchResponse, 1L);
searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "a1234").analyzer("my_ngram_analyzer"))
.get(); // this one returns a hit since it's default operator is OR
assertHitCount(searchResponse, 1L);
}
public void testMatchPhrasePrefixQuery() throws ExecutionException, InterruptedException {
createIndex("test1");
indexRandom(true, client().prepareIndex("test1", "type1", "1").setSource("field", "Johnnie Walker Black Label"),


@ -427,7 +427,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
public void testStopwordsOnlyPhraseSuggest() throws IOException {
assertAcked(prepareCreate("test").addMapping("typ1", "body", "type=text,analyzer=stopwd").setSettings(
Settings.builder()
.put("index.analysis.analyzer.stopwd.tokenizer", "whitespace")
.put("index.analysis.analyzer.stopwd.tokenizer", "standard")
.putList("index.analysis.analyzer.stopwd.filter", "stop")
));
ensureGreen();


@ -22,18 +22,10 @@ package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.Version;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
@ -43,9 +35,6 @@ import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
@ -88,20 +77,20 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
// exposed in ES
.put("classic", ClassicTokenizerFactory.class)
.put("edgengram", EdgeNGramTokenizerFactory.class)
.put("classic", MovedToAnalysisCommon.class)
.put("edgengram", MovedToAnalysisCommon.class)
.put("keyword", KeywordTokenizerFactory.class)
.put("letter", LetterTokenizerFactory.class)
.put("lowercase", LowerCaseTokenizerFactory.class)
.put("ngram", NGramTokenizerFactory.class)
.put("pathhierarchy", PathHierarchyTokenizerFactory.class)
.put("pattern", PatternTokenizerFactory.class)
.put("letter", MovedToAnalysisCommon.class)
.put("lowercase", MovedToAnalysisCommon.class)
.put("ngram", MovedToAnalysisCommon.class)
.put("pathhierarchy", MovedToAnalysisCommon.class)
.put("pattern", MovedToAnalysisCommon.class)
.put("simplepattern", MovedToAnalysisCommon.class)
.put("simplepatternsplit", MovedToAnalysisCommon.class)
.put("standard", StandardTokenizerFactory.class)
.put("thai", ThaiTokenizerFactory.class)
.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
.put("whitespace", WhitespaceTokenizerFactory.class)
.put("thai", MovedToAnalysisCommon.class)
.put("uax29urlemail", MovedToAnalysisCommon.class)
.put("whitespace", MovedToAnalysisCommon.class)
// this one "seems to mess up offsets". probably shouldn't be a tokenizer...
.put("wikipedia", Void.class)
@ -292,23 +281,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
Map<String, Class<?>> tokenizers = new HashMap<>();
// TODO drop this temporary shim when all the old style tokenizers have been migrated to new style
for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
final Class<?> luceneFactoryClazz;
switch (tokenizer) {
case UAX_URL_EMAIL:
luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
break;
case PATH_HIERARCHY:
luceneFactoryClazz = Void.class;
break;
default:
luceneFactoryClazz = null;
}
tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz);
tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), null);
}
// TODO drop aliases once they are moved to module
tokenizers.put("nGram", tokenizers.get("ngram"));
tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
return tokenizers;
}
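With the pre-built shim collapsed to a plain null entry and the camelCase aliases (nGram, edgeNGram, PathHierarchy) dropped, the expectation is that the analysis-common module picks these checks up on its own side. A hypothetical sketch of how a module-level subclass could re-declare the moved tokenizers against their new factories; the class name, constructor, and getTokenizers() override are assumptions about how plugin factory tests usually hook into this base class, not something this diff shows:

package org.elasticsearch.analysis.common;   // same package as the moved factories, so they resolve without imports

import java.util.Map;
import java.util.TreeMap;
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;

public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {   // hypothetical sketch

    public CommonAnalysisFactoryTests() {
        super(new CommonAnalysisPlugin());   // assumes the base class accepts the plugin under test
    }

    @Override
    protected Map<String, Class<?>> getTokenizers() {
        Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
        // Replace the MovedToAnalysisCommon markers with the factories that now live here.
        tokenizers.put("whitespace", WhitespaceTokenizerFactory.class);
        tokenizers.put("ngram", NGramTokenizerFactory.class);
        tokenizers.put("edgengram", EdgeNGramTokenizerFactory.class);
        tokenizers.put("pathhierarchy", PathHierarchyTokenizerFactory.class);
        // classic, letter, lowercase, pattern, thai and uax29urlemail follow the same pattern
        return tokenizers;
    }
}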