Moved keyword tokenizer to analysis-common module (#30642)

Relates to #23658
Author: Martijn van Groningen, 2018-05-29 19:22:28 +02:00 (committed by GitHub)
parent 363f1e84ca
commit 544822c78b
21 changed files with 228 additions and 75 deletions
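
In short: KeywordTokenizerFactory moves from core's org.elasticsearch.index.analysis package into the analysis-common module, CommonAnalysisPlugin takes over registering the "keyword" tokenizer, AnalysisRegistry now fails fast when normalizers cannot resolve it, and tests that merely needed some tokenizer switch to "standard" or load a mock through the new MockKeywordPlugin. As an aid to reading the hunks below, here is a minimal sketch of the plugin-side registration pattern this commit leans on; the method is shown in isolation, the surrounding plugin class and imports are elided, and the TreeMap choice is an assumption:

    // Sketch: how a plugin advertises tokenizers to core. AnalysisModule picks
    // these up via tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers),
    // so after this commit "keyword" resolves only when analysis-common
    // (or a mock plugin) is installed.
    @Override
    public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
        tokenizers.put("keyword", KeywordTokenizerFactory::new);
        return tokenizers;
    }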

View File

@@ -193,6 +193,7 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
         tokenizers.put("pattern", PatternTokenizerFactory::new);
         tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
         tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
+        tokenizers.put("keyword", KeywordTokenizerFactory::new);
         return tokenizers;
     }

View File

@@ -17,7 +17,7 @@
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
@@ -30,7 +30,7 @@ public class KeywordTokenizerFactory extends AbstractTokenizerFactory {
     private final int bufferSize;

-    public KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    KeywordTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         bufferSize = settings.getAsInt("buffer_size", 256);
     }

View File

@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.en.PorterStemFilterFactory;
 import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
-import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;
@@ -56,6 +55,7 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         tokenizers.put("pattern", PatternTokenizerFactory.class);
         tokenizers.put("uax29urlemail", UAX29URLEmailTokenizerFactory.class);
         tokenizers.put("whitespace", WhitespaceTokenizerFactory.class);
+        tokenizers.put("keyword", KeywordTokenizerFactory.class);
         return tokenizers;
     }

View File

@@ -5,9 +5,22 @@
     - do:
         indices.analyze:
           body:
             text: Foo Bar!
+            explain: true
             tokenizer: keyword
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: Foo Bar! }
+    - length: { detail.tokenizer.tokens: 1 }
+    - match: { detail.tokenizer.name: keyword }
+    - match: { detail.tokenizer.tokens.0.token: Foo Bar! }
+
+    - do:
+        indices.analyze:
+          body:
+            text: Foo Bar!
+            explain: true
+            tokenizer:
+              type: keyword
+    - length: { detail.tokenizer.tokens: 1 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: Foo Bar! }
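
Note the naming in the explain output: a tokenizer referenced by name reports itself as keyword, while the same tokenizer defined inline in the request body is reported as _anonymous_tokenizer.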
 ---
 "nGram":

View File

@@ -97,3 +97,19 @@
     - length: { tokens: 2 }
     - match: { tokens.0.token: sha }
     - match: { tokens.1.token: hay }
+
+---
+"Custom normalizer in request":
+    - do:
+        indices.analyze:
+          body:
+            text: ABc
+            explain: true
+            filter: ["lowercase"]
+
+    - length: { detail.tokenizer.tokens: 1 }
+    - length: { detail.tokenfilters.0.tokens: 1 }
+    - match: { detail.tokenizer.name: keyword_for_normalizer }
+    - match: { detail.tokenizer.tokens.0.token: ABc }
+    - match: { detail.tokenfilters.0.name: lowercase }
+    - match: { detail.tokenfilters.0.tokens.0.token: abc }
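
This test is not new: it moves here from the core REST suite (it is deleted from there in a later hunk of this commit), because keyword_for_normalizer is built on the keyword tokenizer that only analysis-common now provides.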

View File

@@ -16,9 +16,11 @@
           body:
             filter: [icu_normalizer]
             text: Foo Bar Ruß
-            tokenizer: keyword
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: foo bar russ }
+            tokenizer: standard
+    - length: { tokens: 3 }
+    - match: { tokens.0.token: foo }
+    - match: { tokens.1.token: bar }
+    - match: { tokens.2.token: russ }
 ---
 "Normalization charfilter":
     - do:
@@ -26,9 +28,11 @@
           body:
             char_filter: [icu_normalizer]
             text: Foo Bar Ruß
-            tokenizer: keyword
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: foo bar russ }
+            tokenizer: standard
+    - length: { tokens: 3 }
+    - match: { tokens.0.token: foo }
+    - match: { tokens.1.token: bar }
+    - match: { tokens.2.token: russ }
 ---
 "Folding filter":
     - do:
@@ -36,9 +40,11 @@
           body:
             filter: [icu_folding]
             text: Foo Bar résumé
-            tokenizer: keyword
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: foo bar resume }
+            tokenizer: standard
+    - length: { tokens: 3 }
+    - match: { tokens.0.token: foo }
+    - match: { tokens.1.token: bar }
+    - match: { tokens.2.token: resume }
 ---
 "Normalization with a UnicodeSet Filter":
     - do:
@@ -64,25 +70,34 @@
           index: test
           body:
             char_filter: ["charfilter_icu_normalizer"]
-            tokenizer: keyword
+            tokenizer: standard
             text: charfilter Föo Bâr Ruß
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: charfilter föo bâr ruß }
+    - length: { tokens: 4 }
+    - match: { tokens.0.token: charfilter }
+    - match: { tokens.1.token: föo }
+    - match: { tokens.2.token: bâr }
+    - match: { tokens.3.token: ruß }
     - do:
         indices.analyze:
           index: test
           body:
-            tokenizer: keyword
+            tokenizer: standard
             filter: ["tokenfilter_icu_normalizer"]
             text: tokenfilter Föo Bâr Ruß
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: tokenfilter föo Bâr ruß }
+    - length: { tokens: 4 }
+    - match: { tokens.0.token: tokenfilter }
+    - match: { tokens.1.token: föo }
+    - match: { tokens.2.token: Bâr }
+    - match: { tokens.3.token: ruß }
     - do:
         indices.analyze:
           index: test
          body:
-            tokenizer: keyword
+            tokenizer: standard
             filter: ["tokenfilter_icu_folding"]
             text: icufolding Föo Bâr Ruß
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: icufolding foo bâr russ }
+    - length: { tokens: 4 }
+    - match: { tokens.0.token: icufolding }
+    - match: { tokens.1.token: foo }
+    - match: { tokens.2.token: bâr }
+    - match: { tokens.3.token: russ }

View File

@@ -5,7 +5,7 @@
         indices.analyze:
           body:
             text: studenci
-            tokenizer: keyword
+            tokenizer: standard
             filter: [polish_stem]
     - length: { tokens: 1 }
     - match: { tokens.0.token: student }

View File

@@ -75,19 +75,3 @@
     - match: { detail.tokenizer.tokens.2.token: buzz }
     - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter" }
     - match: { detail.tokenfilters.0.tokens.0.token: bar }
-
----
-"Custom normalizer in request":
-    - do:
-        indices.analyze:
-          body:
-            text: ABc
-            explain: true
-            filter: ["lowercase"]
-
-    - length: { detail.tokenizer.tokens: 1 }
-    - length: { detail.tokenfilters.0.tokens: 1 }
-    - match: { detail.tokenizer.name: keyword_for_normalizer }
-    - match: { detail.tokenizer.tokens.0.token: ABc }
-    - match: { detail.tokenfilters.0.name: lowercase }
-    - match: { detail.tokenfilters.0.tokens.0.token: abc }

View File

@@ -548,6 +548,10 @@ public final class AnalysisRegistry implements Closeable {
                                   TokenizerFactory keywordTokenizerFactory,
                                   Map<String, TokenFilterFactory> tokenFilters,
                                   Map<String, CharFilterFactory> charFilters) {
+        if (keywordTokenizerFactory == null) {
+            throw new IllegalStateException("keyword tokenizer factory is null, normalizers require analysis-common module");
+        }
         if (normalizerFactory instanceof CustomNormalizerProvider) {
             ((CustomNormalizerProvider) normalizerFactory).build(keywordTokenizerFactory, charFilters, tokenFilters);
         }
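
This guard turns what used to surface as a NullPointerException deep inside normalizer construction into an immediate, actionable message: normalizers are always built on top of the keyword tokenizer, and core can no longer supply it on its own. A hedged sketch of settings that reach this path; the index and normalizer names are hypothetical, and putList assumes the 6.x Settings builder API:

    // Sketch: any custom normalizer forces AnalysisRegistry to resolve the
    // keyword tokenizer, which now exists only if analysis-common is loaded.
    Settings settings = Settings.builder()
        .put("index.analysis.normalizer.my_normalizer.type", "custom")
        .putList("index.analysis.normalizer.my_normalizer.filter", "lowercase")
        .build();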

View File

@@ -56,7 +56,6 @@ import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
 import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
 import org.elasticsearch.index.analysis.KeywordAnalyzerProvider;
-import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.LatvianAnalyzerProvider;
 import org.elasticsearch.index.analysis.LithuanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.NorwegianAnalyzerProvider;
@@ -225,7 +224,6 @@ public final class AnalysisModule {
     private NamedRegistry<AnalysisProvider<TokenizerFactory>> setupTokenizers(List<AnalysisPlugin> plugins) {
         NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = new NamedRegistry<>("tokenizer");
         tokenizers.register("standard", StandardTokenizerFactory::new);
-        tokenizers.register("keyword", KeywordTokenizerFactory::new);
         tokenizers.extractAndRegister(plugins, AnalysisPlugin::getTokenizers);
         return tokenizers;
     }
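
With this removal, core registers only the standard tokenizer directly; every other tokenizer, keyword now included, arrives through extractAndRegister from plugins such as CommonAnalysisPlugin.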

View File

@@ -19,6 +19,7 @@
 package org.elasticsearch.action.admin.indices;

 import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
@@ -37,6 +38,7 @@ import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.AnalysisModuleTests.AppendCharFilter;
@@ -107,6 +109,12 @@ public class TransportAnalyzeActionTests extends ESTestCase {
             return singletonMap("append", AppendCharFilterFactory::new);
         }

+        @Override
+        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+            return singletonMap("keyword", (indexSettings, environment, name, settings) ->
+                () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
+        }
+
         @Override
         public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
             return singletonMap("mock", MockFactory::new);

View File

@@ -37,10 +37,13 @@ import org.elasticsearch.common.xcontent.ToXContent;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.index.engine.VersionConflictEngineException;
 import org.elasticsearch.index.mapper.FieldMapper;
+import org.elasticsearch.plugins.Plugin;
+import org.elasticsearch.test.MockKeywordPlugin;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -58,6 +61,12 @@ import static org.hamcrest.Matchers.notNullValue;
 import static org.hamcrest.Matchers.nullValue;

 public class GetTermVectorsIT extends AbstractTermVectorsTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return Collections.singleton(MockKeywordPlugin.class);
+    }
+
     public void testNoSuchDoc() throws Exception {
         XContentBuilder mapping = jsonBuilder().startObject().startObject("type1")
                 .startObject("properties")

View File

@@ -432,7 +432,7 @@ public class GatewayIndexStateIT extends ESIntegTestCase {
         logger.info("--> starting one node");
         internalCluster().startNode();
         prepareCreate("test").setSettings(Settings.builder()
-                .put("index.analysis.analyzer.test.tokenizer", "keyword")
+                .put("index.analysis.analyzer.test.tokenizer", "standard")
                 .put("index.number_of_shards", "1"))
                 .addMapping("type1", "{\n" +
                     "  \"type1\": {\n" +

View File

@@ -20,6 +20,8 @@
 package org.elasticsearch.index.analysis;

 import org.apache.lucene.analysis.MockLowerCaseFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
@@ -71,7 +73,7 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
             .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
             .build();
         IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
-            () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
+            () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings, MOCK_ANALYSIS_PLUGIN));
         assertEquals("Custom normalizer [my_normalizer] cannot configure a tokenizer", e.getMessage());
     }
@@ -135,7 +137,7 @@ public class CustomNormalizerTests extends ESTokenStreamTestCase {
             @Override
             public int read(char[] cbuf, int off, int len) throws IOException {
                 int result = reader.read(cbuf, off, len);
-                for (int i = off; i < result; i++) {
+                for (int i = off; i < off + len; i++) {
                     if (cbuf[i] == 'a') {
                         cbuf[i] = 'z';
                     }
@@ -157,5 +159,11 @@
                 return new Factory();
             });
         }
+
+        @Override
+        public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+            return singletonMap("keyword", (indexSettings, environment, name, settings) ->
+                () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
+        }
     }
 }

View File

@@ -20,6 +20,8 @@
 package org.elasticsearch.index.mapper;

 import org.apache.lucene.analysis.MockLowerCaseFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.index.DocValuesType;
 import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexableField;
@@ -33,7 +35,9 @@ import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.index.IndexService;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.index.mapper.MapperService.MergeReason;
+import org.elasticsearch.indices.analysis.AnalysisModule;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESSingleNodeTestCase;
@@ -44,8 +48,10 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
+import java.util.Map;

 import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
@@ -58,6 +64,21 @@ public class KeywordFieldMapperTests extends ESSingleNodeTestCase {
         public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
             return singletonList(PreConfiguredTokenFilter.singleton("mock_other_lowercase", true, MockLowerCaseFilter::new));
         }

+        @Override
+        public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
+            return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
+                class Factory implements TokenizerFactory {
+                    @Override
+                    public Tokenizer create() {
+                        return new MockTokenizer(MockTokenizer.KEYWORD, false);
+                    }
+                }
+                return new Factory();
+            });
+        }
     };

     @Override

View File

@@ -21,6 +21,7 @@ package org.elasticsearch.indices.analysis;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharFilter;
+import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -31,6 +32,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.io.Streams;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.env.Environment;
@@ -49,6 +51,7 @@ import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.MyFilterTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
@@ -60,6 +63,8 @@ import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
+import java.io.StringReader;
+import java.io.UncheckedIOException;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -222,7 +227,7 @@ public class AnalysisModuleTests extends ESTestCase {

     public void testUnderscoreInAnalyzerName() throws IOException {
         Settings settings = Settings.builder()
-            .put("index.analysis.analyzer._invalid_name.tokenizer", "keyword")
+            .put("index.analysis.analyzer._invalid_name.tokenizer", "standard")
             .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
             .put(IndexMetaData.SETTING_VERSION_CREATED, "1")
             .build();
@@ -256,6 +261,13 @@ public class AnalysisModuleTests extends ESTestCase {
                     (tokenStream, esVersion) -> new AppendCharFilter(tokenStream, esVersion.toString()))
                 );
             }

+            @Override
+            public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
+                // Need mock keyword tokenizer here, because alpha / beta versions are broken up by the dash.
+                return singletonMap("keyword", (indexSettings, environment, name, settings) ->
+                    () -> new MockTokenizer(MockTokenizer.KEYWORD, false));
+            }
         })).getAnalysisRegistry();

         Version version = VersionUtils.randomVersion(random());
@@ -305,11 +317,11 @@
         Version version = VersionUtils.randomVersion(random());
         IndexAnalyzers analyzers = getIndexAnalyzers(registry, Settings.builder()
-            .put("index.analysis.analyzer.no_version.tokenizer", "keyword")
+            .put("index.analysis.analyzer.no_version.tokenizer", "standard")
             .put("index.analysis.analyzer.no_version.filter", "no_version")
-            .put("index.analysis.analyzer.lucene_version.tokenizer", "keyword")
+            .put("index.analysis.analyzer.lucene_version.tokenizer", "standard")
             .put("index.analysis.analyzer.lucene_version.filter", "lucene_version")
-            .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "keyword")
+            .put("index.analysis.analyzer.elasticsearch_version.tokenizer", "standard")
             .put("index.analysis.analyzer.elasticsearch_version.filter", "elasticsearch_version")
             .put(IndexMetaData.SETTING_VERSION_CREATED, version)
             .build());
@@ -425,12 +437,17 @@
     // Simple char filter that appends text to the term
     public static class AppendCharFilter extends CharFilter {
-        private final char[] appendMe;
-        private int offsetInAppendMe = -1;
+
+        static Reader append(Reader input, String appendMe) {
+            try {
+                return new StringReader(Streams.copyToString(input) + appendMe);
+            } catch (IOException e) {
+                throw new UncheckedIOException(e);
+            }
+        }

         public AppendCharFilter(Reader input, String appendMe) {
-            super(input);
-            this.appendMe = appendMe.toCharArray();
+            super(append(input, appendMe));
         }

         @Override
@@ -440,24 +457,7 @@

         @Override
         public int read(char[] cbuf, int off, int len) throws IOException {
-            if (offsetInAppendMe < 0) {
-                int read = input.read(cbuf, off, len);
-                if (read == len) {
-                    return read;
-                }
-                off += read;
-                len -= read;
-                int allowedLen = Math.min(len, appendMe.length);
-                System.arraycopy(appendMe, 0, cbuf, off, allowedLen);
-                offsetInAppendMe = allowedLen;
-                return read + allowedLen;
-            }
-            if (offsetInAppendMe >= appendMe.length) {
-                return -1;
-            }
-            int allowedLen = Math.max(len, appendMe.length - offsetInAppendMe);
-            System.arraycopy(appendMe, offsetInAppendMe, cbuf, off, allowedLen);
-            return allowedLen;
+            return input.read(cbuf, off, len);
         }
     }
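
The AppendCharFilter rewrite is both a simplification and a bug fix: the old read() spliced the suffix into the stream by hand and mishandled edge cases (a -1 return from the wrapped reader was added to off unchecked, and the second branch used Math.max where a minimum was evidently intended), while the new version materializes input plus suffix once in the constructor and simply delegates. A small usage sketch, assuming Streams.copyToString drains the reader:

    // Sketch: the suffix is appended eagerly in append(), so reads just
    // pass through the concatenated stream.
    Reader wrapped = new AppendCharFilter(new StringReader("foo"), "bar");
    assertEquals("foobar", Streams.copyToString(wrapped));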

View File

@@ -22,11 +22,18 @@ import org.elasticsearch.action.admin.indices.alias.Alias;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequestBuilder;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.analysis.CharFilterFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.MockKeywordPlugin;
 import org.hamcrest.core.IsNull;

 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
@@ -38,6 +45,12 @@ import static org.hamcrest.Matchers.is;
 import static org.hamcrest.Matchers.startsWith;

 public class AnalyzeActionIT extends ESIntegTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return Collections.singleton(MockKeywordPlugin.class);
+    }
+
     public void testSimpleAnalyzerTests() throws Exception {
         assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
         ensureGreen();

View File

@@ -50,14 +50,15 @@ import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder.Field;
 import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.test.InternalSettingsPlugin;
+import org.elasticsearch.test.MockKeywordPlugin;
 import org.hamcrest.Matcher;
 import org.hamcrest.Matchers;
 import org.joda.time.DateTime;
 import org.joda.time.chrono.ISOChronology;

 import java.io.IOException;
+import java.util.Arrays;
 import java.util.Collection;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
@@ -105,7 +106,7 @@ public class HighlighterSearchIT extends ESIntegTestCase {

     @Override
     protected Collection<Class<? extends Plugin>> nodePlugins() {
-        return Collections.singletonList(InternalSettingsPlugin.class);
+        return Arrays.asList(InternalSettingsPlugin.class, MockKeywordPlugin.class);
     }

     public void testHighlightingWithStoredKeyword() throws IOException {

View File

@@ -32,15 +32,19 @@ import org.elasticsearch.index.query.MultiMatchQueryBuilder;
 import org.elasticsearch.index.query.Operator;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.search.MatchQuery;
+import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.SearchHits;
 import org.elasticsearch.search.sort.SortBuilders;
 import org.elasticsearch.search.sort.SortOrder;
 import org.elasticsearch.test.ESIntegTestCase;
+import org.elasticsearch.test.MockKeywordPlugin;
 import org.junit.Before;

 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ExecutionException;
@@ -72,6 +76,11 @@ import static org.hamcrest.Matchers.lessThan;

 public class MultiMatchQueryIT extends ESIntegTestCase {
+
+    @Override
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return Collections.singleton(MockKeywordPlugin.class);
+    }
+
     @Before
     public void init() throws Exception {
         CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder()

View File

@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
-import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
 import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -79,7 +78,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         // exposed in ES
         .put("classic", MovedToAnalysisCommon.class)
         .put("edgengram", MovedToAnalysisCommon.class)
-        .put("keyword", KeywordTokenizerFactory.class)
+        .put("keyword", MovedToAnalysisCommon.class)
         .put("letter", MovedToAnalysisCommon.class)
         .put("lowercase", MovedToAnalysisCommon.class)
         .put("ngram", MovedToAnalysisCommon.class)

View File

@@ -0,0 +1,54 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.elasticsearch.test;
+
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
+import org.elasticsearch.plugins.Plugin;
+
+import java.util.Map;
+
+import static java.util.Collections.singletonMap;
+
+/**
+ * Some tests rely on the keyword tokenizer, but that tokenizer isn't part of lucene-core and is therefore not
+ * available in some modules. This test plugin registers the mock tokenizer and advertises it as the keyword
+ * tokenizer.
+ *
+ * Most tests that need this plugin use normalizers. When normalizers are constructed they try to resolve the
+ * keyword tokenizer; if it isn't available, constructing the normalizer fails.
+ */
+public class MockKeywordPlugin extends Plugin implements AnalysisPlugin {
+
+    @Override
+    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
+        return singletonMap("keyword", (indexSettings, environment, name, settings) -> {
+            class Factory implements TokenizerFactory {
+                @Override
+                public Tokenizer create() {
+                    return new MockTokenizer(MockTokenizer.KEYWORD, false);
+                }
+            }
+            return new Factory();
+        });
+    }
+}
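
Integration tests opt in to the mock by returning it from nodePlugins(), as GetTermVectorsIT, AnalyzeActionIT, HighlighterSearchIT, and MultiMatchQueryIT do in the hunks above; unit-level tests such as CustomNormalizerTests and AnalysisModuleTests register the same MockTokenizer lambda directly in their local AnalysisPlugin instead.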