Moved tokenizers to analysis common module (#30538)
The following tokenizers were moved: classic, edge_ngram, letter, lowercase, ngram, path_hierarchy, pattern, thai, uax_url_email and whitespace. The keyword tokenizer factory stays in the server module because normalizers depend on it directly; this should be addressed in a follow-up change. Relates to #23658
This commit is contained in:
parent 901436148b
commit 7b95470897
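Concretely, registration of these tokenizers moves out of the server's AnalysisModule and into the analysis-common plugin. The pattern is condensed below from the CommonAnalysisPlugin hunks that follow; it is a sketch, not a complete listing, and the Elasticsearch imports are the ones shown in those hunks.

// Condensed sketch of the registration pattern added to CommonAnalysisPlugin in this commit.
// Imports follow the hunks below; only two entries are shown here.
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
        tokenizers.put("thai", ThaiTokenizerFactory::new);
        tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
        // ... classic, letter, lowercase, ngram, path_hierarchy, pattern, uax_url_email, whitespace,
        // plus the legacy aliases nGram, edgeNGram and PathHierarchy
        return tokenizers;
    }
}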
@@ -17,7 +17,7 @@
 * under the License.
 */

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import java.util.HashSet;
import java.util.Set;

@@ -17,7 +17,7 @@
 * under the License.
 */

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.ClassicTokenizer;
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

/**
 * Factory for {@link ClassicTokenizer}
@@ -33,7 +34,7 @@ public class ClassicTokenizerFactory extends AbstractTokenizerFactory {

    private final int maxTokenLength;

    public ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    ClassicTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
    }
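The hunk above only shows the package move and the constructor being narrowed to package-private; the factory's create() override sits outside the hunk. For orientation, it presumably mirrors the UAX29URLEmailTokenizerFactory shown later in this diff, applying the configured max_token_length to the Lucene tokenizer:

// Assumed shape of ClassicTokenizerFactory.create(); not part of the hunk above.
@Override
public Tokenizer create() {
    ClassicTokenizer tokenizer = new ClassicTokenizer();
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}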
@ -34,9 +34,11 @@ import org.apache.lucene.analysis.ckb.SoraniNormalizationFilter;
|
||||
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
|
||||
import org.apache.lucene.analysis.core.DecimalDigitFilter;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||
import org.apache.lucene.analysis.core.StopAnalyzer;
|
||||
import org.apache.lucene.analysis.core.UpperCaseFilter;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.cz.CzechStemFilter;
|
||||
import org.apache.lucene.analysis.de.GermanNormalizationFilter;
|
||||
import org.apache.lucene.analysis.de.GermanStemFilter;
|
||||
@ -58,17 +60,25 @@ import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||
import org.apache.lucene.analysis.pattern.PatternTokenizer;
|
||||
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
|
||||
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
|
||||
import org.apache.lucene.analysis.shingle.ShingleFilter;
|
||||
import org.apache.lucene.analysis.snowball.SnowballFilter;
|
||||
import org.apache.lucene.analysis.standard.ClassicFilter;
|
||||
import org.apache.lucene.analysis.standard.ClassicTokenizer;
|
||||
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||
import org.apache.lucene.analysis.th.ThaiTokenizer;
|
||||
import org.apache.lucene.analysis.tr.ApostropheFilter;
|
||||
import org.apache.lucene.analysis.util.ElisionFilter;
|
||||
import org.elasticsearch.common.logging.DeprecationLogger;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.regex.Regex;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
|
||||
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
|
||||
@@ -169,6 +179,19 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
        tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
        tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
        tokenizers.put("thai", ThaiTokenizerFactory::new);
        tokenizers.put("nGram", NGramTokenizerFactory::new);
        tokenizers.put("ngram", NGramTokenizerFactory::new);
        tokenizers.put("edgeNGram", EdgeNGramTokenizerFactory::new);
        tokenizers.put("edge_ngram", EdgeNGramTokenizerFactory::new);
        tokenizers.put("classic", ClassicTokenizerFactory::new);
        tokenizers.put("letter", LetterTokenizerFactory::new);
        tokenizers.put("lowercase", LowerCaseTokenizerFactory::new);
        tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
        tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
        tokenizers.put("pattern", PatternTokenizerFactory::new);
        tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
        tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
        return tokenizers;
    }

@@ -283,6 +306,16 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
    public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
        List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
        tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
        tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
        tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", LowerCaseTokenizer::new, () -> new TokenFilterFactory() {
            @Override
            public String name() {
@@ -294,6 +327,13 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
                return new LowerCaseFilter(tokenStream);
            }
        }));

        // Temporary shim for aliases. TODO deprecate after they are moved
        tokenizers.add(PreConfiguredTokenizer.singleton("nGram", NGramTokenizer::new, null));
        tokenizers.add(PreConfiguredTokenizer.singleton("edgeNGram",
            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
        tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

        return tokenizers;
    }
}
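Nothing changes for index configurations: the tokenizer names registered above are the same ones that were previously registered in AnalysisModule, so settings that reference them keep working. A minimal sketch follows; the index, tokenizer and analyzer names are made up for illustration.

// Hypothetical index settings referencing one of the moved tokenizers by its unchanged name.
Settings indexSettings = Settings.builder()
    .put("index.analysis.tokenizer.my_edge.type", "edge_ngram")
    .put("index.analysis.tokenizer.my_edge.min_gram", 1)
    .put("index.analysis.tokenizer.my_edge.max_gram", 3)
    .put("index.analysis.analyzer.my_analyzer.type", "custom")
    .put("index.analysis.analyzer.my_analyzer.tokenizer", "my_edge")
    .build();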
@@ -17,7 +17,7 @@
 * under the License.
 */

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
@@ -25,19 +25,17 @@ import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

import static org.elasticsearch.index.analysis.NGramTokenizerFactory.parseTokenChars;
import static org.elasticsearch.analysis.common.NGramTokenizerFactory.parseTokenChars;

public class EdgeNGramTokenizerFactory extends AbstractTokenizerFactory {

    private final int minGram;

    private final int maxGram;

    private final CharMatcher matcher;

    public EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
        this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
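As with the other factories, the create() method is outside this hunk. Assuming the moved class keeps its previous behaviour, it builds a Lucene EdgeNGramTokenizer from the min_gram/max_gram values read above and, when token_chars is configured, restricts token characters through the CharMatcher field; the exact shape below is an assumption, not part of this commit's hunks.

// Assumed shape of EdgeNGramTokenizerFactory.create(); not shown in the hunk above.
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new EdgeNGramTokenizer(minGram, maxGram);
    }
    return new EdgeNGramTokenizer(minGram, maxGram) {
        @Override
        protected boolean isTokenChar(int chr) {
            return matcher.isTokenChar(chr);
        }
    };
}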
@@ -17,17 +17,18 @@
 * under the License.
 */

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;

public class LetterTokenizerFactory extends AbstractTokenizerFactory {

    public LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    LetterTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
    }
@ -17,17 +17,19 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||
|
||||
public class LowerCaseTokenizerFactory extends AbstractTokenizerFactory implements MultiTermAwareComponent {
|
||||
|
||||
public LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
LowerCaseTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
}
|
||||
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
@ -25,6 +25,7 @@ import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.Modifier;
|
||||
@ -83,7 +84,7 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
public NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
|
||||
this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
@ -35,7 +36,7 @@ public class PathHierarchyTokenizerFactory extends AbstractTokenizerFactory {
|
||||
private final int skip;
|
||||
private final boolean reverse;
|
||||
|
||||
public PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
PathHierarchyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
bufferSize = settings.getAsInt("buffer_size", 1024);
|
||||
String delimiter = settings.get("delimiter");
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.pattern.PatternTokenizer;
|
||||
@ -25,6 +25,7 @@ import org.elasticsearch.common.regex.Regex;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -33,7 +34,7 @@ public class PatternTokenizerFactory extends AbstractTokenizerFactory {
|
||||
private final Pattern pattern;
|
||||
private final int group;
|
||||
|
||||
public PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
PatternTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
||||
String sPattern = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
|
@ -17,20 +17,21 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.th.ThaiTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link ThaiTokenizer}
|
||||
*/
|
||||
public class ThaiTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
public ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
ThaiTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
}
|
||||
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
@ -25,12 +25,13 @@ import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
private final int maxTokenLength;
|
||||
|
||||
public UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
UAX29URLEmailTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
|
||||
}
|
||||
@ -41,4 +42,4 @@ public class UAX29URLEmailTokenizerFactory extends AbstractTokenizerFactory {
|
||||
tokenizer.setMaxTokenLength(maxTokenLength);
|
||||
return tokenizer;
|
||||
}
|
||||
}
|
||||
}
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
@ -26,13 +26,14 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
public class WhitespaceTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
static final String MAX_TOKEN_LENGTH = "max_token_length";
|
||||
private Integer maxTokenLength;
|
||||
|
||||
public WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
WhitespaceTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
maxTokenLength = settings.getAsInt(MAX_TOKEN_LENGTH, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
|
||||
}
|
@@ -17,7 +17,7 @@
 * under the License.
 */

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.elasticsearch.test.ESTestCase;
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.en.PorterStemFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
|
||||
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
|
||||
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
|
||||
import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
|
||||
@ -45,6 +46,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getTokenizers());
|
||||
tokenizers.put("simplepattern", SimplePatternTokenizerFactory.class);
|
||||
tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory.class);
|
||||
tokenizers.put("thai", ThaiTokenizerFactory.class);
|
||||
tokenizers.put("ngram", NGramTokenizerFactory.class);
|
||||
tokenizers.put("edgengram", EdgeNGramTokenizerFactory.class);
|
||||
tokenizers.put("classic", ClassicTokenizerFactory.class);
|
||||
tokenizers.put("letter", LetterTokenizerFactory.class);
|
||||
tokenizers.put("lowercase", LowerCaseTokenizerFactory.class);
|
||||
tokenizers.put("pathhierarchy", PathHierarchyTokenizerFactory.class);
|
||||
tokenizers.put("pattern", PatternTokenizerFactory.class);
|
||||
tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
|
||||
tokenizers.put("whitespace", WhitespaceTokenizerFactory.class);
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
@ -211,10 +222,25 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getPreConfiguredTokenizers() {
|
||||
Map<String, Class<?>> filters = new TreeMap<>(super.getPreConfiguredTokenizers());
|
||||
filters.put("keyword", null);
|
||||
filters.put("lowercase", null);
|
||||
return filters;
|
||||
Map<String, Class<?>> tokenizers = new TreeMap<>(super.getPreConfiguredTokenizers());
|
||||
tokenizers.put("keyword", null);
|
||||
tokenizers.put("lowercase", null);
|
||||
tokenizers.put("classic", null);
|
||||
tokenizers.put("uax_url_email", org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class);
|
||||
tokenizers.put("path_hierarchy", null);
|
||||
tokenizers.put("letter", null);
|
||||
tokenizers.put("whitespace", null);
|
||||
tokenizers.put("ngram", null);
|
||||
tokenizers.put("edge_ngram", null);
|
||||
tokenizers.put("pattern", null);
|
||||
tokenizers.put("thai", null);
|
||||
|
||||
// TODO drop aliases once they are moved to module
|
||||
tokenizers.put("nGram", tokenizers.get("ngram"));
|
||||
tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
|
||||
tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
|
||||
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -45,7 +45,7 @@ public class CommonGramsTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
||||
.build();
|
||||
|
||||
try {
|
||||
AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
|
||||
AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisPlugin());
|
||||
Assert.fail("[common_words] or [common_words_path] is set");
|
||||
} catch (IllegalArgumentException e) {
|
||||
} catch (IOException e) {
|
||||
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.query;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.Query;
|
||||
@ -29,12 +29,22 @@ import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.IndexService;
|
||||
import org.elasticsearch.index.query.MatchPhraseQueryBuilder;
|
||||
import org.elasticsearch.index.query.MatchQueryBuilder;
|
||||
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
|
||||
import org.elasticsearch.index.query.QueryShardContext;
|
||||
import org.elasticsearch.index.query.QueryStringQueryBuilder;
|
||||
import org.elasticsearch.index.query.SimpleQueryStringBuilder;
|
||||
import org.elasticsearch.index.query.SimpleQueryStringFlag;
|
||||
import org.elasticsearch.index.search.MatchQuery;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.test.ESSingleNodeTestCase;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
@ -49,6 +59,11 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
|
||||
private static Query expectedQueryWithUnigram;
|
||||
private static Query expectedPhraseQueryWithUnigram;
|
||||
|
||||
@Override
|
||||
protected Collection<Class<? extends Plugin>> getPlugins() {
|
||||
return Collections.singleton(CommonAnalysisPlugin.class);
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
Settings settings = Settings.builder()
|
||||
@ -150,42 +165,42 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
|
||||
public void testMatchPhraseQuery() throws IOException {
|
||||
MatchPhraseQueryBuilder builder =
|
||||
new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||
Query query = builder.doToQuery(shardContext);
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder =
|
||||
new MatchPhraseQueryBuilder("text_shingle", "foo bar baz biz");
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQuery, equalTo(query));
|
||||
}
|
||||
|
||||
public void testMatchQuery() throws IOException {
|
||||
MatchQueryBuilder builder =
|
||||
new MatchQueryBuilder("text_shingle_unigram", "foo bar baz");
|
||||
Query query = builder.doToQuery(shardContext);
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder = new MatchQueryBuilder("text_shingle", "foo bar baz biz");
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQuery, equalTo(query));
|
||||
}
|
||||
|
||||
public void testMultiMatchQuery() throws IOException {
|
||||
MultiMatchQueryBuilder builder = new MultiMatchQueryBuilder("foo bar baz",
|
||||
"text_shingle_unigram");
|
||||
Query query = builder.doToQuery(shardContext);
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder.type(MatchQuery.Type.PHRASE);
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder = new MultiMatchQueryBuilder("foo bar baz biz", "text_shingle");
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQuery, equalTo(query));
|
||||
|
||||
builder.type(MatchQuery.Type.PHRASE);
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQuery, equalTo(query));
|
||||
}
|
||||
|
||||
@ -193,47 +208,47 @@ public class DisableGraphQueryTests extends ESSingleNodeTestCase {
|
||||
SimpleQueryStringBuilder builder = new SimpleQueryStringBuilder("foo bar baz");
|
||||
builder.field("text_shingle_unigram");
|
||||
builder.flags(SimpleQueryStringFlag.NONE);
|
||||
Query query = builder.doToQuery(shardContext);
|
||||
Query query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder = new SimpleQueryStringBuilder("\"foo bar baz\"");
|
||||
builder.field("text_shingle_unigram");
|
||||
builder.flags(SimpleQueryStringFlag.PHRASE);
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQueryWithUnigram, equalTo(query));
|
||||
|
||||
builder = new SimpleQueryStringBuilder("foo bar baz biz");
|
||||
builder.field("text_shingle");
|
||||
builder.flags(SimpleQueryStringFlag.NONE);
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedQuery, equalTo(query));
|
||||
|
||||
builder = new SimpleQueryStringBuilder("\"foo bar baz biz\"");
|
||||
builder.field("text_shingle");
|
||||
builder.flags(SimpleQueryStringFlag.PHRASE);
|
||||
query = builder.doToQuery(shardContext);
|
||||
query = builder.toQuery(shardContext);
|
||||
assertThat(expectedPhraseQuery, equalTo(query));
|
||||
}
|
||||
|
||||
    public void testQueryString() throws IOException {
        QueryStringQueryBuilder builder = new QueryStringQueryBuilder("foo bar baz");
        builder.field("text_shingle_unigram");
        Query query = builder.doToQuery(shardContext);
        Query query = builder.toQuery(shardContext);
        assertThat(expectedQueryWithUnigram, equalTo(query));

        builder = new QueryStringQueryBuilder("\"foo bar baz\"");
        builder.field("text_shingle_unigram");
        query = builder.doToQuery(shardContext);
        query = builder.toQuery(shardContext);
        assertThat(expectedPhraseQueryWithUnigram, equalTo(query));

        builder = new QueryStringQueryBuilder("foo bar baz biz");
        builder.field("text_shingle");
        query = builder.doToQuery(shardContext);
        query = builder.toQuery(shardContext);
        assertThat(expectedQuery, equalTo(query));

        builder = new QueryStringQueryBuilder("\"foo bar baz biz\"");
        builder.field("text_shingle");
        query = builder.doToQuery(shardContext);
        query = builder.toQuery(shardContext);
        assertThat(expectedPhraseQuery, equalTo(query));
    }
}
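The recurring doToQuery to toQuery substitution in this test is a direct consequence of the move: DisableGraphQueryTests now lives in org.elasticsearch.analysis.common rather than org.elasticsearch.index.query, so it can no longer reach the builders' protected doToQuery and goes through the public toQuery entry point instead. A minimal illustration, assuming the same shardContext fixture the test already uses:

// Before (test in the same package as the builder): Query query = builder.doToQuery(shardContext);
// After the move to org.elasticsearch.analysis.common:
Query query = new MatchPhraseQueryBuilder("text_shingle_unigram", "foo bar baz").toQuery(shardContext);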
@ -30,8 +30,6 @@ import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.settings.Settings.Builder;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
|
||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
|
||||
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
@ -17,15 +17,13 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis.synonyms;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
@ -44,7 +42,6 @@ import static org.hamcrest.Matchers.instanceOf;
|
||||
import static org.hamcrest.Matchers.startsWith;
|
||||
|
||||
public class SynonymsAnalysisTests extends ESTestCase {
|
||||
protected final Logger logger = Loggers.getLogger(getClass());
|
||||
private IndexAnalyzers indexAnalyzers;
|
||||
|
||||
public void testSynonymsAnalysis() throws IOException {
|
||||
@ -56,14 +53,14 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
||||
Files.copy(synonyms, config.resolve("synonyms.txt"));
|
||||
Files.copy(synonymsWordnet, config.resolve("synonyms_wordnet.txt"));
|
||||
|
||||
String json = "/org/elasticsearch/index/analysis/synonyms/synonyms.json";
|
||||
String json = "/org/elasticsearch/analysis/common/synonyms.json";
|
||||
Settings settings = Settings.builder().
|
||||
loadFromStream(json, getClass().getResourceAsStream(json), false)
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), home)
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
|
||||
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
|
||||
match("synonymAnalyzer", "kimchy is the dude abides", "shay is the elasticsearch man!");
|
||||
match("synonymAnalyzer_file", "kimchy is the dude abides", "shay is the elasticsearch man!");
|
||||
@ -91,7 +88,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
try {
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
fail("fail! due to synonym word deleted by analyzer");
|
||||
} catch (Exception e) {
|
||||
assertThat(e, instanceOf(IllegalArgumentException.class));
|
||||
@ -112,7 +109,7 @@ public class SynonymsAnalysisTests extends ESTestCase {
|
||||
.build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
|
||||
try {
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings).indexAnalyzers;
|
||||
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;
|
||||
fail("fail! due to synonym word deleted by analyzer");
|
||||
} catch (Exception e) {
|
||||
assertThat(e, instanceOf(IllegalArgumentException.class));
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||
|
@ -70,3 +70,374 @@
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: foo }
|
||||
- match: { detail.tokenizer.tokens.1.token: bar }
|
||||
|
||||
---
|
||||
"thai_tokenizer":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "ภาษาไทย"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: thai
|
||||
- length: { detail.tokenizer.tokens: 2 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: ภาษา }
|
||||
- match: { detail.tokenizer.tokens.1.token: ไทย }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "ภาษาไทย"
|
||||
explain: true
|
||||
tokenizer: thai
|
||||
- length: { detail.tokenizer.tokens: 2 }
|
||||
- match: { detail.tokenizer.name: thai }
|
||||
- match: { detail.tokenizer.tokens.0.token: ภาษา }
|
||||
- match: { detail.tokenizer.tokens.1.token: ไทย }
|
||||
|
||||
---
|
||||
"ngram":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foobar"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: ngram
|
||||
min_gram: 3
|
||||
max_gram: 3
|
||||
- length: { detail.tokenizer.tokens: 4 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: foo }
|
||||
- match: { detail.tokenizer.tokens.1.token: oob }
|
||||
- match: { detail.tokenizer.tokens.2.token: oba }
|
||||
- match: { detail.tokenizer.tokens.3.token: bar }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foobar"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: nGram
|
||||
min_gram: 3
|
||||
max_gram: 3
|
||||
- length: { detail.tokenizer.tokens: 4 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: foo }
|
||||
- match: { detail.tokenizer.tokens.1.token: oob }
|
||||
- match: { detail.tokenizer.tokens.2.token: oba }
|
||||
- match: { detail.tokenizer.tokens.3.token: bar }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foo"
|
||||
explain: true
|
||||
tokenizer: ngram
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: ngram }
|
||||
- match: { detail.tokenizer.tokens.0.token: f }
|
||||
- match: { detail.tokenizer.tokens.1.token: fo }
|
||||
- match: { detail.tokenizer.tokens.2.token: o }
|
||||
- match: { detail.tokenizer.tokens.3.token: oo }
|
||||
- match: { detail.tokenizer.tokens.4.token: o }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foo"
|
||||
explain: true
|
||||
tokenizer: nGram
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: nGram }
|
||||
- match: { detail.tokenizer.tokens.0.token: f }
|
||||
- match: { detail.tokenizer.tokens.1.token: fo }
|
||||
- match: { detail.tokenizer.tokens.2.token: o }
|
||||
- match: { detail.tokenizer.tokens.3.token: oo }
|
||||
- match: { detail.tokenizer.tokens.4.token: o }
|
||||
|
||||
---
|
||||
"edge_ngram":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foo"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: edge_ngram
|
||||
min_gram: 1
|
||||
max_gram: 3
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: f }
|
||||
- match: { detail.tokenizer.tokens.1.token: fo }
|
||||
- match: { detail.tokenizer.tokens.2.token: foo }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foo"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: edgeNGram
|
||||
min_gram: 1
|
||||
max_gram: 3
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: f }
|
||||
- match: { detail.tokenizer.tokens.1.token: fo }
|
||||
- match: { detail.tokenizer.tokens.2.token: foo }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foo"
|
||||
explain: true
|
||||
tokenizer: edge_ngram
|
||||
- length: { detail.tokenizer.tokens: 2 }
|
||||
- match: { detail.tokenizer.name: edge_ngram }
|
||||
- match: { detail.tokenizer.tokens.0.token: f }
|
||||
- match: { detail.tokenizer.tokens.1.token: fo }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "foo"
|
||||
explain: true
|
||||
tokenizer: edgeNGram
|
||||
- length: { detail.tokenizer.tokens: 2 }
|
||||
- match: { detail.tokenizer.name: edgeNGram }
|
||||
- match: { detail.tokenizer.tokens.0.token: f }
|
||||
- match: { detail.tokenizer.tokens.1.token: fo }
|
||||
|
||||
---
|
||||
"classic":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Brown-Foxes don't jump."
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: classic
|
||||
- length: { detail.tokenizer.tokens: 4 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: Brown }
|
||||
- match: { detail.tokenizer.tokens.1.token: Foxes }
|
||||
- match: { detail.tokenizer.tokens.2.token: don't }
|
||||
- match: { detail.tokenizer.tokens.3.token: jump }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Brown-Foxes don't jump."
|
||||
explain: true
|
||||
tokenizer: classic
|
||||
- length: { detail.tokenizer.tokens: 4 }
|
||||
- match: { detail.tokenizer.name: classic }
|
||||
- match: { detail.tokenizer.tokens.0.token: Brown }
|
||||
- match: { detail.tokenizer.tokens.1.token: Foxes }
|
||||
- match: { detail.tokenizer.tokens.2.token: don't }
|
||||
- match: { detail.tokenizer.tokens.3.token: jump }
|
||||
|
||||
---
|
||||
"letter":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Brown-Foxes don't jump."
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: letter
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: Brown }
|
||||
- match: { detail.tokenizer.tokens.1.token: Foxes }
|
||||
- match: { detail.tokenizer.tokens.2.token: don }
|
||||
- match: { detail.tokenizer.tokens.3.token: t }
|
||||
- match: { detail.tokenizer.tokens.4.token: jump }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Brown-Foxes don't jump."
|
||||
explain: true
|
||||
tokenizer: letter
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: letter }
|
||||
- match: { detail.tokenizer.tokens.0.token: Brown }
|
||||
- match: { detail.tokenizer.tokens.1.token: Foxes }
|
||||
- match: { detail.tokenizer.tokens.2.token: don }
|
||||
- match: { detail.tokenizer.tokens.3.token: t }
|
||||
- match: { detail.tokenizer.tokens.4.token: jump }
|
||||
|
||||
---
|
||||
"lowercase":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Brown-Foxes don't jump."
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: lowercase
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: brown }
|
||||
- match: { detail.tokenizer.tokens.1.token: foxes }
|
||||
- match: { detail.tokenizer.tokens.2.token: don }
|
||||
- match: { detail.tokenizer.tokens.3.token: t }
|
||||
- match: { detail.tokenizer.tokens.4.token: jump }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Brown-Foxes don't jump."
|
||||
explain: true
|
||||
tokenizer: lowercase
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: lowercase }
|
||||
- match: { detail.tokenizer.tokens.0.token: brown }
|
||||
- match: { detail.tokenizer.tokens.1.token: foxes }
|
||||
- match: { detail.tokenizer.tokens.2.token: don }
|
||||
- match: { detail.tokenizer.tokens.3.token: t }
|
||||
- match: { detail.tokenizer.tokens.4.token: jump }
|
||||
|
||||
---
|
||||
"path_hierarchy":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "a/b/c"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: path_hierarchy
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: a }
|
||||
- match: { detail.tokenizer.tokens.1.token: a/b }
|
||||
- match: { detail.tokenizer.tokens.2.token: a/b/c }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "a/b/c"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: PathHierarchy
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: a }
|
||||
- match: { detail.tokenizer.tokens.1.token: a/b }
|
||||
- match: { detail.tokenizer.tokens.2.token: a/b/c }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "a/b/c"
|
||||
explain: true
|
||||
tokenizer: path_hierarchy
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: path_hierarchy }
|
||||
- match: { detail.tokenizer.tokens.0.token: a }
|
||||
- match: { detail.tokenizer.tokens.1.token: a/b }
|
||||
- match: { detail.tokenizer.tokens.2.token: a/b/c }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "a/b/c"
|
||||
explain: true
|
||||
tokenizer: PathHierarchy
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: PathHierarchy }
|
||||
- match: { detail.tokenizer.tokens.0.token: a }
|
||||
- match: { detail.tokenizer.tokens.1.token: a/b }
|
||||
- match: { detail.tokenizer.tokens.2.token: a/b/c }
|
||||
|
||||
---
|
||||
"pattern":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "split by whitespace by default"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: pattern
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: split }
|
||||
- match: { detail.tokenizer.tokens.1.token: by }
|
||||
- match: { detail.tokenizer.tokens.2.token: whitespace }
|
||||
- match: { detail.tokenizer.tokens.3.token: by }
|
||||
- match: { detail.tokenizer.tokens.4.token: default }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "split by whitespace by default"
|
||||
explain: true
|
||||
tokenizer: pattern
|
||||
- length: { detail.tokenizer.tokens: 5 }
|
||||
- match: { detail.tokenizer.name: pattern }
|
||||
- match: { detail.tokenizer.tokens.0.token: split }
|
||||
- match: { detail.tokenizer.tokens.1.token: by }
|
||||
- match: { detail.tokenizer.tokens.2.token: whitespace }
|
||||
- match: { detail.tokenizer.tokens.3.token: by }
|
||||
- match: { detail.tokenizer.tokens.4.token: default }
|
||||
|
||||
---
|
||||
"uax_url_email":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Email me at john.smith@global-international.com"
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: uax_url_email
|
||||
- length: { detail.tokenizer.tokens: 4 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: Email }
|
||||
- match: { detail.tokenizer.tokens.1.token: me }
|
||||
- match: { detail.tokenizer.tokens.2.token: at }
|
||||
- match: { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: "Email me at john.smith@global-international.com"
|
||||
explain: true
|
||||
tokenizer: uax_url_email
|
||||
- length: { detail.tokenizer.tokens: 4 }
|
||||
- match: { detail.tokenizer.name: uax_url_email }
|
||||
- match: { detail.tokenizer.tokens.0.token: Email }
|
||||
- match: { detail.tokenizer.tokens.1.token: me }
|
||||
- match: { detail.tokenizer.tokens.2.token: at }
|
||||
- match: { detail.tokenizer.tokens.3.token: john.smith@global-international.com }
|
||||
|
||||
---
"whitespace":
    - do:
        indices.analyze:
          body:
            text: "split by whitespace"
            explain: true
            tokenizer:
              type: whitespace
    - length: { detail.tokenizer.tokens: 3 }
    - match: { detail.tokenizer.name: _anonymous_tokenizer }
    - match: { detail.tokenizer.tokens.0.token: split }
    - match: { detail.tokenizer.tokens.1.token: by }
    - match: { detail.tokenizer.tokens.2.token: whitespace }

    - do:
        indices.analyze:
          body:
            text: "split by whitespace"
            explain: true
            tokenizer: whitespace
    - length: { detail.tokenizer.tokens: 3 }
    - match: { detail.tokenizer.name: whitespace }
    - match: { detail.tokenizer.tokens.0.token: split }
    - match: { detail.tokenizer.tokens.1.token: by }
    - match: { detail.tokenizer.tokens.2.token: whitespace }
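These REST tests exercise each moved tokenizer by its registered name through the _analyze API. The same check can be driven at the transport layer, as the TransportAnalyzeActionTests hunks near the end of this diff do; the sketch below reuses only calls that appear elsewhere in the diff, and note that resolving "whitespace" now requires the analysis-common module in the registry, which is exactly why the server-only tests further down switch to the standard tokenizer.

// Sketch of the transport-level equivalent of the "whitespace" REST test above.
AnalyzeRequest request = new AnalyzeRequest();
request.tokenizer("whitespace");
request.text("split by whitespace");
// Analyzed via TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount)
// against an analysis registry that includes the analysis-common module.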
@ -67,3 +67,33 @@
|
||||
text: "<html>foo</html>"
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: "\nfoo\n" }
|
||||
|
||||
---
|
||||
"Synonym filter with tokenizer":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test_synonym
|
||||
body:
|
||||
settings:
|
||||
index:
|
||||
analysis:
|
||||
tokenizer:
|
||||
trigram:
|
||||
type: nGram
|
||||
min_gram: 3
|
||||
max_gram: 3
|
||||
filter:
|
||||
synonym:
|
||||
type: synonym
|
||||
synonyms: ["kimchy => shay"]
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
index: test_synonym
|
||||
body:
|
||||
tokenizer: trigram
|
||||
filter: [synonym]
|
||||
text: kimchy
|
||||
- length: { tokens: 2 }
|
||||
- match: { tokens.0.token: sha }
|
||||
- match: { tokens.1.token: hay }
|
||||
|
@ -39,3 +39,97 @@
|
||||
text:
|
||||
query: foa
|
||||
- match: {hits.total: 1}
|
||||
|
||||
---
|
||||
"testNGramCopyField":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test
|
||||
body:
|
||||
settings:
|
||||
number_of_shards: 1
|
||||
number_of_replicas: 0
|
||||
max_ngram_diff: 9
|
||||
analysis:
|
||||
analyzer:
|
||||
my_ngram_analyzer:
|
||||
tokenizer: my_ngram_tokenizer
|
||||
tokenizer:
|
||||
my_ngram_tokenizer:
|
||||
type: ngram
|
||||
min: 1,
|
||||
max: 10
|
||||
token_chars: []
|
||||
mappings:
|
||||
doc:
|
||||
properties:
|
||||
origin:
|
||||
type: text
|
||||
copy_to: meta
|
||||
meta:
|
||||
type: text
|
||||
analyzer: my_ngram_analyzer
|
||||
|
||||
- do:
|
||||
index:
|
||||
index: test
|
||||
type: doc
|
||||
id: 1
|
||||
body: { "origin": "C.A1234.5678" }
|
||||
refresh: true
|
||||
|
||||
- do:
|
||||
search:
|
||||
body:
|
||||
query:
|
||||
match:
|
||||
meta:
|
||||
query: 1234
|
||||
- match: {hits.total: 1}
|
||||
|
||||
- do:
|
||||
search:
|
||||
body:
|
||||
query:
|
||||
match:
|
||||
meta:
|
||||
query: 1234.56
|
||||
- match: {hits.total: 1}
|
||||
|
||||
- do:
|
||||
search:
|
||||
body:
|
||||
query:
|
||||
match:
|
||||
meta:
|
||||
query: A1234
|
||||
- match: {hits.total: 1}
|
||||
|
||||
- do:
|
||||
search:
|
||||
body:
|
||||
query:
|
||||
term:
|
||||
meta:
|
||||
value: a1234
|
||||
- match: {hits.total: 0}
|
||||
|
||||
- do:
|
||||
search:
|
||||
body:
|
||||
query:
|
||||
match:
|
||||
meta:
|
||||
query: A1234
|
||||
analyzer: my_ngram_analyzer
|
||||
- match: {hits.total: 1}
|
||||
|
||||
- do:
|
||||
search:
|
||||
body:
|
||||
query:
|
||||
match:
|
||||
meta:
|
||||
query: a1234
|
||||
analyzer: my_ngram_analyzer
|
||||
- match: {hits.total: 1}
|
||||
|
@ -76,36 +76,6 @@
|
||||
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
|
||||
- match: { detail.tokenfilters.0.tokens.0.token: bar }
|
||||
|
||||
---
|
||||
"Synonym filter with tokenizer":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test_synonym
|
||||
body:
|
||||
settings:
|
||||
index:
|
||||
analysis:
|
||||
tokenizer:
|
||||
trigram:
|
||||
type: nGram
|
||||
min_gram: 3
|
||||
max_gram: 3
|
||||
filter:
|
||||
synonym:
|
||||
type: synonym
|
||||
synonyms: ["kimchy => shay"]
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
index: test_synonym
|
||||
body:
|
||||
tokenizer: trigram
|
||||
filter: [synonym]
|
||||
text: kimchy
|
||||
- length: { tokens: 2 }
|
||||
- match: { tokens.0.token: sha }
|
||||
- match: { tokens.1.token: hay }
|
||||
|
||||
---
|
||||
"Custom normalizer in request":
|
||||
- do:
|
||||
|
@ -24,6 +24,7 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenizerFactory;
|
||||
|
||||
public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
|
||||
|
||||
|
@ -39,11 +39,9 @@ import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.CharFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.CjkAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.CzechAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.DanishAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.DutchAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.EnglishAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.FingerprintAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.FinnishAnalyzerProvider;
|
||||
@ -60,14 +58,9 @@ import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.PatternAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
|
||||
@ -88,13 +81,10 @@ import org.elasticsearch.index.analysis.StopAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.SwedishAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.ThaiAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.TokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.TurkishAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
|
||||
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
|
||||
import java.io.IOException;
|
||||
@ -223,36 +213,19 @@ public final class AnalysisModule {
|
||||
}
|
||||
preConfiguredTokenizers.register(name, preConfigured);
|
||||
}
|
||||
// Temporary shim for aliases. TODO deprecate after they are moved
|
||||
preConfiguredTokenizers.register("nGram", preConfiguredTokenizers.getRegistry().get("ngram"));
|
||||
preConfiguredTokenizers.register("edgeNGram", preConfiguredTokenizers.getRegistry().get("edge_ngram"));
|
||||
preConfiguredTokenizers.register("PathHierarchy", preConfiguredTokenizers.getRegistry().get("path_hierarchy"));
|
||||
|
||||
for (AnalysisPlugin plugin: plugins) {
|
||||
for (PreConfiguredTokenizer tokenizer : plugin.getPreConfiguredTokenizers()) {
|
||||
preConfiguredTokenizers.register(tokenizer.getName(), tokenizer);
|
||||
}
|
||||
}
|
||||
|
||||
return unmodifiableMap(preConfiguredTokenizers.getRegistry());
|
||||
}
|
||||
|
||||
    private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
        NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
        tokenizers.register("standard", StandardTokenizerFactory::new);
        tokenizers.register("uax_url_email", UAX29URLEmailTokenizerFactory::new);
        tokenizers.register("path_hierarchy", PathHierarchyTokenizerFactory::new);
        tokenizers.register("PathHierarchy", PathHierarchyTokenizerFactory::new);
        tokenizers.register("keyword", KeywordTokenizerFactory::new);
        tokenizers.register("letter", LetterTokenizerFactory::new);
        tokenizers.register("lowercase", LowerCaseTokenizerFactory::new);
        tokenizers.register("whitespace", WhitespaceTokenizerFactory::new);
        tokenizers.register("nGram", NGramTokenizerFactory::new);
        tokenizers.register("ngram", NGramTokenizerFactory::new);
        tokenizers.register("edgeNGram", EdgeNGramTokenizerFactory::new);
        tokenizers.register("edge_ngram", EdgeNGramTokenizerFactory::new);
        tokenizers.register("pattern", PatternTokenizerFactory::new);
        tokenizers.register("classic", ClassicTokenizerFactory::new);
        tokenizers.register("thai", ThaiTokenizerFactory::new);
        tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
        return tokenizers;
    }
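After this hunk, only the tokenizers that remain in the server are registered here; the rest are expected to arrive through the plugin hook. A sketch of the resulting method, assuming that standard and keyword are what stays behind (the commit message only guarantees keyword):

    // Assumed post-change shape of AnalysisModule#setupTokenizers; built only from lines shown in this hunk.
    private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
        NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
        tokenizers.register("standard", StandardTokenizerFactory::new);
        tokenizers.register("keyword", KeywordTokenizerFactory::new); // stays in server: normalizers depend on it
        tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers); // analysis-common contributes the rest
        return tokenizers;
    }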
@ -19,18 +19,8 @@
|
||||
package org.elasticsearch.indices.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
|
||||
import org.apache.lucene.analysis.ngram.NGramTokenizer;
|
||||
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
|
||||
import org.apache.lucene.analysis.pattern.PatternTokenizer;
|
||||
import org.apache.lucene.analysis.standard.ClassicTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
|
||||
import org.apache.lucene.analysis.th.ThaiTokenizer;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.regex.Regex;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;
|
||||
|
||||
@ -41,69 +31,6 @@ public enum PreBuiltTokenizers {
protected Tokenizer create(Version version) {
return new StandardTokenizer();
}
},

CLASSIC(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new ClassicTokenizer();
}
},

UAX_URL_EMAIL(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new UAX29URLEmailTokenizer();
}
},

PATH_HIERARCHY(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new PathHierarchyTokenizer();
}
},

LETTER(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new LetterTokenizer();
}
},

WHITESPACE(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new WhitespaceTokenizer();
}
},

NGRAM(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new NGramTokenizer();
}
},

EDGE_NGRAM(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
}
},

PATTERN(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new PatternTokenizer(Regex.compile("\\W+", null), -1);
}
},

THAI(CachingStrategy.ONE) {
@Override
protected Tokenizer create(Version version) {
return new ThaiTokenizer();
}
}

;
@ -287,7 +287,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
e = expectThrows(IllegalArgumentException.class,
() -> TransportAnalyzeAction.analyze(
new AnalyzeRequest()
.tokenizer("whitespace")
.tokenizer("standard")
.addTokenFilter("foobar")
.text("the qu1ck brown fox"),
"text", null, notGlobal ? indexAnalyzers : null, registry, environment, maxTokenCount));
@ -300,7 +300,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {
e = expectThrows(IllegalArgumentException.class,
() -> TransportAnalyzeAction.analyze(
new AnalyzeRequest()
.tokenizer("whitespace")
.tokenizer("standard")
.addTokenFilter("lowercase")
.addCharFilter("foobar")
.text("the qu1ck brown fox"),
@ -322,7 +322,7 @@ public class TransportAnalyzeActionTests extends ESTestCase {

public void testNonPreBuildTokenFilter() throws IOException {
AnalyzeRequest request = new AnalyzeRequest();
request.tokenizer("whitespace");
request.tokenizer("standard");
request.addTokenFilter("stop"); // stop token filter is not prebuilt in AnalysisModule#setupPreConfiguredTokenFilters()
request.text("the quick brown fox");
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, "text", null, indexAnalyzers, registry, environment, maxTokenCount);
@ -188,7 +188,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.addAlias(new Alias("alias"))
.setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
@ -260,7 +260,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping)
.setSettings(Settings.builder()
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putList("index.analysis.analyzer.tv_test.filter", "lowercase")));
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
@ -394,7 +394,7 @@ public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
.addMapping("type1", mapping)
.setSettings(Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putList("index.analysis.analyzer.tv_test.filter", "lowercase")));

ensureGreen();
@ -18,6 +18,7 @@
*/
package org.elasticsearch.action.termvectors;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.payloads.FloatEncoder;
@ -35,6 +36,7 @@ import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
@ -93,6 +95,12 @@ public class GetTermVectorsTests extends ESSingleNodeTestCase {
});
}

@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
return Collections.singletonList(PreConfiguredTokenizer.singleton("mock-whitespace",
() -> new MockTokenizer(MockTokenizer.WHITESPACE, false), null));
}

// Based on DelimitedPayloadTokenFilter:
final class MockPayloadTokenFilter extends TokenFilter {
private final char delimiter;
@ -151,7 +159,7 @@ public class GetTermVectorsTests extends ESSingleNodeTestCase {
.startObject("field").field("type", "text").field("term_vector", "with_positions_offsets_payloads")
.field("analyzer", "payload_test").endObject().endObject().endObject().endObject();
Settings setting = Settings.builder()
.put("index.analysis.analyzer.payload_test.tokenizer", "whitespace")
.put("index.analysis.analyzer.payload_test.tokenizer", "mock-whitespace")
.putList("index.analysis.analyzer.payload_test.filter", "my_delimited_payload")
.put("index.analysis.filter.my_delimited_payload.delimiter", delimiter)
.put("index.analysis.filter.my_delimited_payload.encoding", encodingString)
@ -35,10 +35,8 @@ import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcke
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.startsWith;

public class AnalyzeActionIT extends ESIntegTestCase {
public void testSimpleAnalyzerTests() throws Exception {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
@ -333,14 +331,14 @@ public class AnalyzeActionIT extends ESIntegTestCase {
AnalyzeResponse analyzeResponse = client().admin().indices()
.prepareAnalyze()
.setText("Foo buzz test")
.setTokenizer("whitespace")
.setTokenizer("standard")
.addTokenFilter("lowercase")
.addTokenFilter(stopFilterSettings)
.setExplain(true)
.get();

//tokenizer
assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("whitespace"));
assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("standard"));
assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("Foo"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
@ -393,41 +391,6 @@ public class AnalyzeActionIT extends ESIntegTestCase {
assertThat(analyzeResponse.detail().tokenfilters()[1].getTokens()[0].getPositionLength(), equalTo(1));
}

public void testCustomTokenizerInRequest() throws Exception {
Map<String, Object> tokenizerSettings = new HashMap<>();
tokenizerSettings.put("type", "nGram");
tokenizerSettings.put("min_gram", 2);
tokenizerSettings.put("max_gram", 2);

AnalyzeResponse analyzeResponse = client().admin().indices()
.prepareAnalyze()
.setText("good")
.setTokenizer(tokenizerSettings)
.setExplain(true)
.get();

//tokenizer
assertThat(analyzeResponse.detail().tokenizer().getName(), equalTo("_anonymous_tokenizer"));
assertThat(analyzeResponse.detail().tokenizer().getTokens().length, equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getTerm(), equalTo("go"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getStartOffset(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getEndOffset(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPosition(), equalTo(0));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[0].getPositionLength(), equalTo(1));

assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getTerm(), equalTo("oo"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getStartOffset(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getEndOffset(), equalTo(3));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPosition(), equalTo(1));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[1].getPositionLength(), equalTo(1));

assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getTerm(), equalTo("od"));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getStartOffset(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getEndOffset(), equalTo(4));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPosition(), equalTo(2));
assertThat(analyzeResponse.detail().tokenizer().getTokens()[2].getPositionLength(), equalTo(1));
}

public void testAnalyzeKeywordField() throws IOException {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")).addMapping("test", "keyword", "type=keyword"));
ensureGreen("test");
@ -677,7 +677,7 @@ public class SimpleIndexTemplateIT extends ESIntegTestCase {
" \"analysis\" : {\n" +
" \"analyzer\" : {\n" +
" \"custom_1\" : {\n" +
" \"tokenizer\" : \"whitespace\"\n" +
" \"tokenizer\" : \"standard\"\n" +
" }\n" +
" }\n" +
" }\n" +
@ -1359,7 +1359,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
public void testPhrasePrefix() throws IOException {
Builder builder = Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.synonym.tokenizer", "whitespace")
.put("index.analysis.analyzer.synonym.tokenizer", "standard")
.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
.put("index.analysis.filter.synonym.type", "synonym")
.putList("index.analysis.filter.synonym.synonyms", "quick => fast");
@ -2804,7 +2804,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {
public void testSynonyms() throws IOException {
Builder builder = Settings.builder()
.put(indexSettings())
.put("index.analysis.analyzer.synonym.tokenizer", "whitespace")
.put("index.analysis.analyzer.synonym.tokenizer", "standard")
.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase")
.put("index.analysis.filter.synonym.type", "synonym")
.putList("index.analysis.filter.synonym.synonyms", "fast,quick");
@ -156,7 +156,7 @@ public class QueryRescorerIT extends ESIntegTestCase {

public void testMoreDocs() throws Exception {
Builder builder = Settings.builder();
builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
builder.put("index.analysis.filter.synonym.type", "synonym");
builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@ -234,7 +234,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
// Tests a rescore window smaller than number of hits:
public void testSmallRescoreWindow() throws Exception {
Builder builder = Settings.builder();
builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
builder.put("index.analysis.filter.synonym.type", "synonym");
builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@ -306,7 +306,7 @@ public class QueryRescorerIT extends ESIntegTestCase {
// Tests a rescorer that penalizes the scores:
public void testRescorerMadeScoresWorse() throws Exception {
Builder builder = Settings.builder();
builder.put("index.analysis.analyzer.synonym.tokenizer", "whitespace");
builder.put("index.analysis.analyzer.synonym.tokenizer", "standard");
builder.putList("index.analysis.analyzer.synonym.filter", "synonym", "lowercase");
builder.put("index.analysis.filter.synonym.type", "synonym");
builder.putList("index.analysis.filter.synonym.synonyms", "ave => ave, avenue", "street => str, street");
@ -82,7 +82,7 @@ public class MultiMatchQueryIT extends ESIntegTestCase {
.put("index.analysis.analyzer.perfect_match.tokenizer", "keyword")
.put("index.analysis.analyzer.perfect_match.filter", "lowercase")
.put("index.analysis.analyzer.category.type", "custom")
.put("index.analysis.analyzer.category.tokenizer", "whitespace")
.put("index.analysis.analyzer.category.tokenizer", "standard")
.put("index.analysis.analyzer.category.filter", "lowercase")
);
assertAcked(builder.addMapping("test", createMapping()));
@ -20,7 +20,6 @@
package org.elasticsearch.search.query;

import org.apache.lucene.util.English;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchPhaseExecutionException;
@ -30,7 +29,6 @@ import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MatchQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
@ -351,7 +349,7 @@ public class SearchQueryIT extends ESIntegTestCase {
.put(SETTING_NUMBER_OF_SHARDS,1)
.put("index.analysis.filter.syns.type","synonym")
.putList("index.analysis.filter.syns.synonyms","quick,fast")
.put("index.analysis.analyzer.syns.tokenizer","whitespace")
.put("index.analysis.analyzer.syns.tokenizer","standard")
.put("index.analysis.analyzer.syns.filter","syns")
)
.addMapping("type1", "field1", "type=text,analyzer=syns", "field2", "type=text,analyzer=syns"));
@ -1764,56 +1762,6 @@ public class SearchQueryIT extends ESIntegTestCase {
assertHitCount(client().prepareSearch().setQuery(matchAllQuery()).get(), 1L);
}

// see #5120
public void testNGramCopyField() {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()
.put(indexSettings())
.put(IndexSettings.MAX_NGRAM_DIFF_SETTING.getKey(), 9)
.put("index.analysis.analyzer.my_ngram_analyzer.type", "custom")
.put("index.analysis.analyzer.my_ngram_analyzer.tokenizer", "my_ngram_tokenizer")
.put("index.analysis.tokenizer.my_ngram_tokenizer.type", "nGram")
.put("index.analysis.tokenizer.my_ngram_tokenizer.min_gram", "1")
.put("index.analysis.tokenizer.my_ngram_tokenizer.max_gram", "10")
.putList("index.analysis.tokenizer.my_ngram_tokenizer.token_chars", new String[0]));
assertAcked(builder.addMapping("test", "origin", "type=text,copy_to=meta", "meta", "type=text,analyzer=my_ngram_analyzer"));
// we only have ngrams as the index analyzer so searches will get standard analyzer

client().prepareIndex("test", "test", "1").setSource("origin", "C.A1234.5678")
.setRefreshPolicy(IMMEDIATE)
.get();

SearchResponse searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "1234"))
.get();
assertHitCount(searchResponse, 1L);

searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "1234.56"))
.get();
assertHitCount(searchResponse, 1L);

searchResponse = client().prepareSearch("test")
.setQuery(termQuery("meta", "A1234"))
.get();
assertHitCount(searchResponse, 1L);

searchResponse = client().prepareSearch("test")
.setQuery(termQuery("meta", "a1234"))
.get();
assertHitCount(searchResponse, 0L); // it's upper case

searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "A1234").analyzer("my_ngram_analyzer"))
.get(); // force ngram analyzer
assertHitCount(searchResponse, 1L);

searchResponse = client().prepareSearch("test")
.setQuery(matchQuery("meta", "a1234").analyzer("my_ngram_analyzer"))
.get(); // this one returns a hit since it's default operator is OR
assertHitCount(searchResponse, 1L);
}

public void testMatchPhrasePrefixQuery() throws ExecutionException, InterruptedException {
createIndex("test1");
indexRandom(true, client().prepareIndex("test1", "type1", "1").setSource("field", "Johnnie Walker Black Label"),
@ -427,7 +427,7 @@ public class SuggestSearchIT extends ESIntegTestCase {
public void testStopwordsOnlyPhraseSuggest() throws IOException {
assertAcked(prepareCreate("test").addMapping("typ1", "body", "type=text,analyzer=stopwd").setSettings(
Settings.builder()
.put("index.analysis.analyzer.stopwd.tokenizer", "whitespace")
.put("index.analysis.analyzer.stopwd.tokenizer", "standard")
.putList("index.analysis.analyzer.stopwd.filter", "stop")
));
ensureGreen();
@ -22,18 +22,10 @@ package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.Version;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
import org.elasticsearch.index.analysis.LetterTokenizerFactory;
import org.elasticsearch.index.analysis.LowerCaseTokenizerFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
import org.elasticsearch.index.analysis.PatternTokenizerFactory;
import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
@ -43,9 +35,6 @@ import org.elasticsearch.index.analysis.StandardTokenizerFactory;
import org.elasticsearch.index.analysis.StopTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
import org.elasticsearch.index.analysis.ThaiTokenizerFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
@ -88,20 +77,20 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {

static final Map<String,Class<?>> KNOWN_TOKENIZERS = new MapBuilder<String,Class<?>>()
// exposed in ES
.put("classic", ClassicTokenizerFactory.class)
.put("edgengram", EdgeNGramTokenizerFactory.class)
.put("classic", MovedToAnalysisCommon.class)
.put("edgengram", MovedToAnalysisCommon.class)
.put("keyword", KeywordTokenizerFactory.class)
.put("letter", LetterTokenizerFactory.class)
.put("lowercase", LowerCaseTokenizerFactory.class)
.put("ngram", NGramTokenizerFactory.class)
.put("pathhierarchy", PathHierarchyTokenizerFactory.class)
.put("pattern", PatternTokenizerFactory.class)
.put("letter", MovedToAnalysisCommon.class)
.put("lowercase", MovedToAnalysisCommon.class)
.put("ngram", MovedToAnalysisCommon.class)
.put("pathhierarchy", MovedToAnalysisCommon.class)
.put("pattern", MovedToAnalysisCommon.class)
.put("simplepattern", MovedToAnalysisCommon.class)
.put("simplepatternsplit", MovedToAnalysisCommon.class)
.put("standard", StandardTokenizerFactory.class)
.put("thai", ThaiTokenizerFactory.class)
.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class)
.put("whitespace", WhitespaceTokenizerFactory.class)
.put("thai", MovedToAnalysisCommon.class)
.put("uax29urlemail", MovedToAnalysisCommon.class)
.put("whitespace", MovedToAnalysisCommon.class)

// this one "seems to mess up offsets". probably shouldn't be a tokenizer...
.put("wikipedia", Void.class)
@ -292,23 +281,8 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
Map<String, Class<?>> tokenizers = new HashMap<>();
// TODO drop this temporary shim when all the old style tokenizers have been migrated to new style
for (PreBuiltTokenizers tokenizer : PreBuiltTokenizers.values()) {
final Class<?> luceneFactoryClazz;
switch (tokenizer) {
case UAX_URL_EMAIL:
luceneFactoryClazz = org.apache.lucene.analysis.standard.UAX29URLEmailTokenizerFactory.class;
break;
case PATH_HIERARCHY:
luceneFactoryClazz = Void.class;
break;
default:
luceneFactoryClazz = null;
}
tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), luceneFactoryClazz);
tokenizers.put(tokenizer.name().toLowerCase(Locale.ROOT), null);
}
// TODO drop aliases once they are moved to module
tokenizers.put("nGram", tokenizers.get("ngram"));
tokenizers.put("edgeNGram", tokenizers.get("edge_ngram"));
tokenizers.put("PathHierarchy", tokenizers.get("path_hierarchy"));
return tokenizers;
}
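
The MovedToAnalysisCommon entries in KNOWN_TOKENIZERS above are a placeholder, not a real factory: they flag names whose factories (and their tests) now belong to the analysis-common module. The marker is presumably just an empty, uninstantiable class along these lines (a sketch, not the verbatim declaration from AnalysisFactoryTestCase):

// Sketch of the marker type referenced as MovedToAnalysisCommon.class above; it only
// serves as a Class<?> value meaning "covered by the analysis-common module's own tests".
protected static final class MovedToAnalysisCommon {
    private MovedToAnalysisCommon() {
        // never instantiated
    }
}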