Start building analysis-common module (#23614)

Start moving built-in analysis components into the new analysis-common
module. The goals of this project are:
1. Remove core's dependency on lucene-analyzers-common.jar, which should
shrink the dependencies of the transport client and the high level rest client.
2. Prove that analysis plugins can do all the "built in" things by moving all
"built in" behavior to a plugin.
3. Force tests not to depend on any oddball analyzer behavior. If tests
need anything more than the standard analyzer, they can use the mock
analyzer provided by Lucene's test infrastructure.
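
As a rough illustration of goal 2, an analysis plugin supplies its token filters through AnalysisPlugin#getTokenFilters. Below is a minimal sketch, modeled on the CommonAnalysisPlugin added in this commit; the plugin class name is hypothetical and it is assumed to sit in the same package as ASCIIFoldingTokenFilterFactory:

    import java.util.Map;

    import org.elasticsearch.index.analysis.TokenFilterFactory;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    import static java.util.Collections.singletonMap;

    // Hypothetical plugin; the registration shape mirrors CommonAnalysisPlugin below.
    public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            // The map key is the filter "type" name used in index analysis settings.
            return singletonMap("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        }
    }
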
Nik Everett 2017-04-19 18:51:34 -04:00 committed by GitHub
parent 151a65ed17
commit caf376c8af
33 changed files with 959 additions and 476 deletions


@@ -1096,7 +1096,6 @@
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]MergeSchedulerConfig.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SearchSlowLog.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionType.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractCharFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractIndexAnalyzerProvider.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractTokenFilterFactory.java" checks="LineLength" />
@@ -1225,8 +1224,6 @@
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]UpperCaseTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceAnalyzerProvider.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceTokenizerFactory.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactory.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]AbstractCompoundWordTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]DictionaryCompoundWordTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
@@ -2686,11 +2683,8 @@
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SettingsListenerIT.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionTypeTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]WaitUntilRefreshIT.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactoryTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisRegistryTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTests.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTestsHelper.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]BaseWordDelimiterTokenFilterFactoryTestCase.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CJKFilterFactoryTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CharFilterTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CompoundAnalysisTests.java" checks="LineLength" />
@@ -2709,8 +2703,6 @@
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerTokenFilterFactoryTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopAnalyzerTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopTokenFilterTests.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactoryTests.java" checks="LineLength" />
-<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactoryTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]commongrams[/\\]CommonGramsTokenFilterFactoryTests.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]filter1[/\\]MyFilterTokenFilterFactory.java" checks="LineLength" />
 <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]synonyms[/\\]SynonymsAnalysisTests.java" checks="LineLength" />


@@ -71,4 +71,9 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
         return result;
     }
 
+    @Override
+    public boolean breaksFastVectorHighlighter() {
+        return true;
+    }
 }


@@ -20,10 +20,20 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
 
 public interface TokenFilterFactory {
     String name();
 
     TokenStream create(TokenStream tokenStream);
+
+    /**
+     * Does this analyzer mess up the {@link OffsetAttribute}s in such as way as to break the
+     * {@link FastVectorHighlighter}? If this is {@code true} then the
+     * {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets.
+     */
+    default boolean breaksFastVectorHighlighter() {
+        return false;
+    }
 }
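
The new default method gives the FastVectorHighlighter a generic hook instead of hard-coded instanceof checks against specific factories. A minimal sketch of a factory opting in, assuming a filter that mangles offsets (the class name and the no-op create() are hypothetical; real factories, like the edge ngram factory above, wrap the stream in their actual filter):

    import org.apache.lucene.analysis.TokenStream;
    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.index.IndexSettings;
    import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

    public class OffsetManglingTokenFilterFactory extends AbstractTokenFilterFactory {
        OffsetManglingTokenFilterFactory(IndexSettings indexSettings, String name, Settings settings) {
            super(indexSettings, name, settings);
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return tokenStream; // a real factory would wrap the stream in its offset-mangling filter
        }

        @Override
        public boolean breaksFastVectorHighlighter() {
            return true; // tells the highlighter to re-sort fragments by offset
        }
    }

With this in place, FragmentBuilderHelper (below) only has to ask each factory in a custom analyzer's chain rather than know about individual implementations.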


@@ -25,7 +25,6 @@ import org.elasticsearch.common.NamedRegistry;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@@ -140,8 +139,6 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
 import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -205,7 +202,6 @@ public final class AnalysisModule {
         NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
         tokenFilters.register("stop", StopTokenFilterFactory::new);
         tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
-        tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new);
         tokenFilters.register("length", LengthTokenFilterFactory::new);
         tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
         tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
@@ -225,8 +221,6 @@
         tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
         tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
         tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
-        tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
-        tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
         tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
         tokenFilters.register("elision", ElisionTokenFilterFactory::new);
         tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);


@@ -26,15 +26,9 @@ import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
 import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
 import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
 import org.apache.lucene.util.CollectionUtil;
-import org.apache.lucene.util.Version;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
-import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
-import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.mapper.FieldMapper;
 
 import java.util.Comparator;
@@ -56,7 +50,7 @@ public final class FragmentBuilderHelper {
     public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) {
         assert fragInfo != null : "FragInfo must not be null";
         assert mapper.fieldType().name().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name();
-        if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.fieldType().indexAnalyzer()))) {
+        if (!fragInfo.getSubInfos().isEmpty() && containsBrokenAnalysis(mapper.fieldType().indexAnalyzer())) {
             /* This is a special case where broken analysis like WDF is used for term-vector creation at index-time
              * which can potentially mess up the offsets. To prevent a SAIIOBException we need to resort
              * the fragments based on their offsets rather than using soley the positions as it is done in
@@ -91,8 +85,7 @@ public final class FragmentBuilderHelper {
         final CustomAnalyzer a = (CustomAnalyzer) analyzer;
         TokenFilterFactory[] tokenFilters = a.tokenFilters();
         for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
-            if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory
-                    || tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) {
+            if (tokenFilterFactory.breaksFastVectorHighlighter()) {
                 return true;
             }
         }


@@ -18,6 +18,8 @@
  */
 package org.elasticsearch.action.admin.indices;
 
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
@@ -27,18 +29,28 @@ import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.mapper.AllFieldMapper;
 import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 
-import static java.util.Collections.emptyList;
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
 
+/**
+ * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the
+ * {@code common-analysis} module.
+ */
 public class TransportAnalyzeActionTests extends ESTestCase {
 
     private IndexAnalyzers indexAnalyzers;
@@ -53,23 +65,28 @@ public class TransportAnalyzeActionTests extends ESTestCase {
         Settings indexSettings = Settings.builder()
             .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
             .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
-            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
-            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
-            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
-            .put("index.analysis.tokenizer.trigram.type", "ngram")
-            .put("index.analysis.tokenizer.trigram.min_gram", 3)
-            .put("index.analysis.tokenizer.trigram.max_gram", 3)
-            .put("index.analysis.filter.synonym.type", "synonym")
-            .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay")
-            .put("index.analysis.filter.synonym.tokenizer", "trigram")
-            .put("index.analysis.filter.synonym.min_gram", 3)
-            .put("index.analysis.filter.synonym.max_gram", 3).build();
+            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
+            .put("index.analysis.analyzer.custom_analyzer.filter", "mock").build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
         environment = new Environment(settings);
-        registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry();
+        AnalysisPlugin plugin = new AnalysisPlugin() {
+            class MockFactory extends AbstractTokenFilterFactory {
+                MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+                    super(indexSettings, name, settings);
+                }
+
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
+                }
+            }
+
+            @Override
+            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+                return singletonMap("mock", MockFactory::new);
+            }
+        };
+        registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
         indexAnalyzers = registry.build(idxSettings);
     }
@@ -143,51 +160,44 @@
     }
 
     public void testWithIndexAnalyzers() throws IOException {
         AnalyzeRequest request = new AnalyzeRequest();
-        request.analyzer("standard");
         request.text("the quick brown fox");
+        request.analyzer("custom_analyzer");
         AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
-        assertEquals(4, tokens.size());
+        assertEquals(3, tokens.size());
+        assertEquals("quick", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
 
-        request.analyzer("whitespace");
-        request.text("the qu1ck brown fox-dog");
+        request.analyzer("standard");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         tokens = analyze.getTokens();
         assertEquals(4, tokens.size());
-
-        request.analyzer("custom_analyzer");
-        request.text("the qu1ck brown fox-dog");
-        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
-        tokens = analyze.getTokens();
-        assertEquals(5, tokens.size());
-
-        request.analyzer(null);
-        request.tokenizer("whitespace");
-        request.addTokenFilter("lowercase");
-        request.addTokenFilter("wordDelimiter");
-        request.text("the qu1ck brown fox-dog");
-        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
-        tokens = analyze.getTokens();
-        assertEquals(5, tokens.size());
         assertEquals("the", tokens.get(0).getTerm());
-        assertEquals("qu1ck", tokens.get(1).getTerm());
+        assertEquals("quick", tokens.get(1).getTerm());
         assertEquals("brown", tokens.get(2).getTerm());
         assertEquals("fox", tokens.get(3).getTerm());
-        assertEquals("dog", tokens.get(4).getTerm());
 
+        // Switch the analyzer out for just a tokenizer
         request.analyzer(null);
-        request.tokenizer("trigram");
-        request.addTokenFilter("synonym");
-        request.text("kimchy");
+        request.tokenizer("standard");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         tokens = analyze.getTokens();
-        assertEquals(2, tokens.size());
-        assertEquals("sha", tokens.get(0).getTerm());
-        assertEquals("hay", tokens.get(1).getTerm());
+        assertEquals(4, tokens.size());
+        assertEquals("the", tokens.get(0).getTerm());
+        assertEquals("quick", tokens.get(1).getTerm());
+        assertEquals("brown", tokens.get(2).getTerm());
+        assertEquals("fox", tokens.get(3).getTerm());
+
+        // Now try applying our token filter
+        request.addTokenFilter("mock");
+        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
+        tokens = analyze.getTokens();
+        assertEquals(3, tokens.size());
+        assertEquals("quick", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
     }
 
     public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException {


@@ -22,5 +22,5 @@ package org.elasticsearch.index.analysis;
 import org.elasticsearch.AnalysisFactoryTestCase;
 
 public class AnalysisFactoryTests extends AnalysisFactoryTestCase {
-    // tests are inherited
+    // tests are inherited and nothing needs to be defined here
 }


@@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -31,17 +32,20 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
+import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 import org.elasticsearch.test.VersionUtils;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 
-import static java.util.Collections.emptyList;
 import static java.util.Collections.emptyMap;
+import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
@@ -112,51 +116,73 @@ public class AnalysisRegistryTests extends ESTestCase {
         assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
     }
 
+    /**
+     * Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide.
+     */
     public void testConfigureCamelCaseTokenFilter() throws IOException {
         Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
         Settings indexSettings = Settings.builder()
             .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
-            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
-            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
-            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build();
+            .put("index.analysis.filter.testFilter.type", "mock")
+            .put("index.analysis.filter.test_filter.type", "mock")
+            .put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard")
+            .putArray("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter")
+            .put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard")
+            .putArray("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter").build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
-        IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry()
+
+        /* The snake_case version of the name should not filter out any stopwords while the
+         * camelCase version will filter out English stopwords. */
+        AnalysisPlugin plugin = new AnalysisPlugin() {
+            class MockFactory extends AbstractTokenFilterFactory {
+                MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+                    super(indexSettings, name, settings);
+                }
+
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    if (name().equals("test_filter")) {
+                        return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
+                    }
+                    return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
+                }
+            }
+
+            @Override
+            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+                return singletonMap("mock", MockFactory::new);
+            }
+        };
+        IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), singletonList(plugin)).getAnalysisRegistry()
             .build(idxSettings);
-        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
+
+        // This shouldn't contain English stopwords
+        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) {
             assertNotNull(custom_analyser);
-            TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
+            TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
             tokenStream.reset();
             CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-            List<String> token = new ArrayList<>();
-            while(tokenStream.incrementToken()) {
-                token.add(charTermAttribute.toString());
-            }
-            assertEquals(token.toString(), 2, token.size());
-            assertEquals("j2se", token.get(0));
-            assertEquals("j2ee", token.get(1));
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("has", charTermAttribute.toString());
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("foo", charTermAttribute.toString());
+            assertFalse(tokenStream.incrementToken());
         }
 
-        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
+        // This *should* contain English stopwords
+        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) {
             assertNotNull(custom_analyser);
-            TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
+            TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
             tokenStream.reset();
             CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-            List<String> token = new ArrayList<>();
-            while(tokenStream.incrementToken()) {
-                token.add(charTermAttribute.toString());
-            }
-            assertEquals(token.toString(), 6, token.size());
-            assertEquals("j", token.get(0));
-            assertEquals("2", token.get(1));
-            assertEquals("se", token.get(2));
-            assertEquals("j", token.get(3));
-            assertEquals("2", token.get(4));
-            assertEquals("ee", token.get(5));
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("has", charTermAttribute.toString());
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("a", charTermAttribute.toString());
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("foo", charTermAttribute.toString());
+            assertFalse(tokenStream.incrementToken());
         }
     }


@@ -19,6 +19,7 @@
 package org.elasticsearch.search.fetch.subphase.highlight;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+import org.apache.lucene.search.join.ScoreMode;
 import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.search.SearchRequestBuilder;
@@ -100,6 +101,7 @@ import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.startsWith;
 
 public class HighlighterSearchIT extends ESIntegTestCase {
+    // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
     private static final String[] ALL_TYPES = new String[] {"plain", "postings", "fvh", "unified"};
     private static final String[] UNIFIED_AND_NULL = new String[] {null, "unified"};
@@ -113,12 +115,11 @@
         mappings.startObject();
         mappings.startObject("type")
             .startObject("properties")
                 .startObject("text")
                     .field("type", "keyword")
                     .field("store", true)
                 .endObject()
-            .endObject()
-        .endObject();
+            .endObject().endObject();
         mappings.endObject();
         assertAcked(prepareCreate("test")
                 .addMapping("type", mappings));
@@ -139,14 +140,13 @@
         mappings.startObject();
         mappings.startObject("type")
             .startObject("properties")
                 .startObject("text")
                     .field("type", "text")
                     .field("analyzer", "keyword")
                    .field("index_options", "offsets")
                     .field("term_vector", "with_positions_offsets")
                 .endObject()
-            .endObject()
-        .endObject();
+            .endObject().endObject();
         mappings.endObject();
         assertAcked(prepareCreate("test")
                 .addMapping("type", mappings));
@@ -166,23 +166,22 @@
         mappings.startObject();
         mappings.startObject("type")
             .startObject("_source")
                 .field("enabled", false)
             .endObject()
             .startObject("properties")
                 .startObject("unstored_field")
                     .field("index_options", "offsets")
                     .field("term_vector", "with_positions_offsets")
                     .field("type", "text")
                     .field("store", false)
                 .endObject()
                 .startObject("text")
                     .field("index_options", "offsets")
                     .field("term_vector", "with_positions_offsets")
                     .field("type", "text")
                     .field("store", true)
                 .endObject()
-            .endObject()
-        .endObject();
+            .endObject().endObject();
         mappings.endObject();
         assertAcked(prepareCreate("test")
                 .addMapping("type", mappings));
@@ -218,103 +217,6 @@
         assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
     }
 
-    public void testNgramHighlightingWithBrokenPositions() throws IOException {
-        assertAcked(prepareCreate("test")
-                .addMapping("test", jsonBuilder()
-                        .startObject()
-                            .startObject("test")
-                                .startObject("properties")
-                                    .startObject("name")
-                                        .startObject("fields")
-                                            .startObject("autocomplete")
-                                                .field("type", "text")
-                                                .field("analyzer", "autocomplete")
-                                                .field("search_analyzer", "search_autocomplete")
-                                                .field("term_vector", "with_positions_offsets")
-                                            .endObject()
-                                        .endObject()
-                                        .field("type", "text")
-                                    .endObject()
-                                .endObject()
-                            .endObject()
-                        .endObject())
-                .setSettings(Settings.builder()
-                        .put(indexSettings())
-                        .put("analysis.tokenizer.autocomplete.max_gram", 20)
-                        .put("analysis.tokenizer.autocomplete.min_gram", 1)
-                        .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
-                        .put("analysis.tokenizer.autocomplete.type", "nGram")
-                        .put("analysis.filter.wordDelimiter.type", "word_delimiter")
-                        .putArray("analysis.filter.wordDelimiter.type_table",
-                                "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
-                                "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM",
-                                "+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM",
-                                "^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
-                                "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM")
-                        .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
-                        .put("analysis.filter.wordDelimiter.generate_word_parts", true)
-                        .put("analysis.filter.wordDelimiter.generate_number_parts", false)
-                        .put("analysis.filter.wordDelimiter.catenate_words", true)
-                        .put("analysis.filter.wordDelimiter.catenate_numbers", true)
-                        .put("analysis.filter.wordDelimiter.catenate_all", false)
-                        .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
-                        .putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
-                        .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
-                        .putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")));
-        client().prepareIndex("test", "test", "1")
-            .setSource("name", "ARCOTEL Hotels Deutschland").get();
-        refresh();
-        SearchResponse search = client().prepareSearch("test").setTypes("test")
-            .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
-            .highlighter(new HighlightBuilder().field("name.autocomplete")).execute().actionGet();
-        assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
-    }
-
-    public void testMultiPhraseCutoff() throws IOException {
-        /*
-         * MultiPhraseQuery can literally kill an entire node if there are too many terms in the
-         * query. We cut off and extract terms if there are more than 16 terms in the query
-         */
-        assertAcked(prepareCreate("test")
-                .addMapping("test",
-                        "body", "type=text,analyzer=custom_analyzer,search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
-                .setSettings(
-                        Settings.builder().put(indexSettings())
-                                .put("analysis.filter.wordDelimiter.type", "word_delimiter")
-                                .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
-                                .put("analysis.filter.wordDelimiter.generate_word_parts", true)
-                                .put("analysis.filter.wordDelimiter.generate_number_parts", true)
-                                .put("analysis.filter.wordDelimiter.catenate_words", true)
-                                .put("analysis.filter.wordDelimiter.catenate_numbers", true)
-                                .put("analysis.filter.wordDelimiter.catenate_all", false)
-                                .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-                                .putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter"))
-        );
-        ensureGreen();
-        client().prepareIndex("test", "test", "1")
-            .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
-                + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
-                + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
-                + "http://twitter.com this is a test for highlighting feature")
-            .get();
-        refresh();
-        SearchResponse search = client().prepareSearch().setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
-            .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
-        assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
-        search = client()
-            .prepareSearch()
-            .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
-                + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
-                + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
-                + "http://twitter.com this is a test for highlighting feature"))
-            .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
-        assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: <em>http://www.facebook.com</em> "
-            + "<em>http://elasticsearch.org</em> <em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
-    }
-
     public void testNgramHighlighting() throws IOException {
         assertAcked(prepareCreate("test")
             .addMapping("test",


@@ -1605,33 +1605,6 @@
         assertHitCount(searchResponse, 2);
     }
 
-    // see #3898
-    public void testCustomWordDelimiterQueryString() {
-        assertAcked(client().admin().indices().prepareCreate("test")
-                .setSettings("analysis.analyzer.my_analyzer.type", "custom",
-                        "analysis.analyzer.my_analyzer.tokenizer", "whitespace",
-                        "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
-                        "analysis.filter.custom_word_delimiter.type", "word_delimiter",
-                        "analysis.filter.custom_word_delimiter.generate_word_parts", "true",
-                        "analysis.filter.custom_word_delimiter.generate_number_parts", "false",
-                        "analysis.filter.custom_word_delimiter.catenate_numbers", "true",
-                        "analysis.filter.custom_word_delimiter.catenate_words", "false",
-                        "analysis.filter.custom_word_delimiter.split_on_case_change", "false",
-                        "analysis.filter.custom_word_delimiter.split_on_numerics", "false",
-                        "analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
-                .addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer"));
-        client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get();
-        refresh();
-
-        SearchResponse response = client()
-                .prepareSearch("test")
-                .setQuery(
-                        queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
-                                .field("field1").field("field2")).get();
-        assertHitCount(response, 1L);
-    }
-
     // see #3797
     public void testMultiMatchLenientIssue3797() {
         createIndex("test");


@ -0,0 +1,23 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
esplugin {
description 'Adds "built in" analyzers to Elasticsearch.'
classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
}


@@ -17,7 +17,7 @@
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@@ -25,20 +25,26 @@ import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 /**
  * Factory for ASCIIFoldingFilter.
 */
-public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
-    public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
-    public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
+public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
+        implements MultiTermAwareComponent {
+    public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
+    public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
 
     private final boolean preserveOriginal;
 
-    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
+            String name, Settings settings) {
         super(indexSettings, name, settings);
         preserveOriginal = settings.getAsBooleanLenientForPreEs6Indices(
-            indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
+            indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(),
+            DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
     }
 
     @Override


@ -0,0 +1,39 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import java.util.HashMap;
import java.util.Map;
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
return filters;
}
}
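
For context, a filter registered this way is wired into an analyzer through index analysis settings, in the same style as the test setups in this commit. A hypothetical example using the asciifolding filter registered above (the filter and analyzer names here are made up):

    import org.elasticsearch.common.settings.Settings;

    // Hypothetical index settings: wire the plugin-provided "asciifolding" filter into a custom analyzer.
    Settings indexSettings = Settings.builder()
            .put("index.analysis.filter.my_folding.type", "asciifolding")
            .put("index.analysis.filter.my_folding.preserve_original", true)
            .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
            .putArray("index.analysis.analyzer.my_analyzer.filter", "lowercase", "my_folding")
            .build();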


@@ -17,7 +17,7 @@
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
@@ -26,20 +26,22 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;
 
 import java.util.List;
 import java.util.Set;
 
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
-import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
+import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.parseTypes;
 
 public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
@@ -47,7 +49,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
     private final int flags;
     private final CharArraySet protoWords;
 
-    public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
+            String name, Settings settings) {
         super(indexSettings, name, settings);
 
         // Sample Format for the type table:
@@ -82,7 +85,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
-        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
+        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
+            settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
     }


@ -17,7 +17,7 @@
* under the License. * under the License.
*/ */
package org.elasticsearch.index.analysis; package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
@ -52,7 +54,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
private final int flags; private final int flags;
private final CharArraySet protoWords; private final CharArraySet protoWords;
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
super(indexSettings, name, settings); super(indexSettings, name, settings);
// Sample Format for the type table: // Sample Format for the type table:
@ -87,7 +90,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If not null, this is the set of tokens to protect from being delimited // If not null, this is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags; this.flags = flags;
} }
@ -101,7 +105,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
} }
public int getFlag(int flag, Settings settings, String key, boolean defaultValue) { public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), key, defaultValue, deprecationLogger)) { if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(),
key, defaultValue, deprecationLogger)) {
return flag; return flag;
} }
return 0; return 0;
@ -122,14 +127,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
String lhs = parseString(m.group(1).trim()); String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim()); Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1) if (lhs.length() != 1)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); throw new RuntimeException("Invalid Mapping Rule : ["
+ rule + "]. Only a single character is allowed.");
if (rhs == null) if (rhs == null)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type."); throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs); typeMap.put(lhs.charAt(0), rhs);
} }
// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; byte types[] = new byte[Math.max(
typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
for (int i = 0; i < types.length; i++) for (int i = 0; i < types.length; i++)
types[i] = WordDelimiterIterator.getType(i); types[i] = WordDelimiterIterator.getType(i);
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet()) for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
@ -196,4 +203,9 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
} }
return new String(out, 0, writePos); return new String(out, 0, writePos);
} }
@Override
public boolean breaksFastVectorHighlighter() {
return true;
}
} }
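The factories above translate each boolean setting into one bit of a Lucene flag bitmask that is handed to the filter at create() time. A minimal standalone sketch of that same pattern, applied directly to Lucene's WordDelimiterGraphFilter (illustrative only, not part of this change; the class name and sample input are invented, the Lucene calls are the ones the factory relies on):

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;

public class WordDelimiterFlagsSketch {
    public static void main(String[] args) throws Exception {
        // Each index setting the factory reads (generate_word_parts, split_on_case_change, ...)
        // contributes exactly one bit to this mask, mirroring getFlag() above.
        int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS
                | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot 500-42 O'Neil's"));
        // null protected-words set: no token is shielded from delimiting
        WordDelimiterGraphFilter filter = new WordDelimiterGraphFilter(tokenizer, flags, null);
        CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            System.out.println(term); // expected: Power, Shot, 500, 42, O, Neil
        }
        filter.end();
        filter.close();
    }
}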

View File

@ -17,12 +17,15 @@
* under the License. * under the License.
*/ */
package org.elasticsearch.index.analysis; package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase; import org.elasticsearch.test.ESTokenStreamTestCase;
@ -31,10 +34,12 @@ import java.io.StringReader;
public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase { public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException { public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build()); .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
String source = "Ansprüche"; String source = "Ansprüche";
String[] expected = new String[]{"Anspruche"}; String[] expected = new String[]{"Anspruche"};
@ -44,11 +49,13 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
} }
public void testPreserveOriginal() throws IOException { public void testPreserveOriginal() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding") .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_ascii_folding.preserve_original", true) .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
.build()); .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
String source = "Ansprüche"; String source = "Ansprüche";
String[] expected = new String[]{"Anspruche", "Ansprüche"}; String[] expected = new String[]{"Anspruche", "Ansprüche"};
@ -57,7 +64,8 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
// but the multi-term aware component still emits a single token // but the multi-term aware component still emits a single token
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent(); tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter)
.getMultiTermComponent();
tokenizer = new WhitespaceTokenizer(); tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
expected = new String[]{"Anspruche"}; expected = new String[]{"Anspruche"};
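These assertions map directly onto Lucene's ASCIIFoldingFilter. A minimal standalone sketch of the preserve_original behaviour (illustrative only, not part of this change; the class name is invented):

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class AsciiFoldingSketch {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Ansprüche"));
        // preserveOriginal = true keeps the unfolded token alongside the folded one,
        // matching the {"Anspruche", "Ansprüche"} expectation in testPreserveOriginal()
        ASCIIFoldingFilter filter = new ASCIIFoldingFilter(tokenizer, true);
        CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
        filter.reset();
        while (filter.incrementToken()) {
            System.out.println(term); // prints "Anspruche", then "Ansprüche"
        }
        filter.end();
        filter.close();
    }
}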

View File

@ -16,13 +16,15 @@
* specific language governing permissions and limitations * specific language governing permissions and limitations
* under the License. * under the License.
*/ */
package org.elasticsearch.index.analysis; package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase; import org.elasticsearch.test.ESTokenStreamTestCase;
@ -30,7 +32,8 @@ import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
/** /**
* Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} * Base class to test {@link WordDelimiterTokenFilterFactory} and
* {@link WordDelimiterGraphTokenFilterFactory}.
*/ */
public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
final String type; final String type;
@ -40,10 +43,12 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
} }
public void testDefault() throws IOException { public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build()); .put("index.analysis.filter.my_word_delimiter.type", type)
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
@ -54,44 +59,51 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
} }
public void testCatenateWords() throws IOException { public void testCatenateWords() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true") .put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.build()); .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j",
"2", "se", "ONeil" };
Tokenizer tokenizer = new WhitespaceTokenizer(); Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
} }
public void testCatenateNumbers() throws IOException { public void testCatenateNumbers() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") .put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.build()); .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000",
"se", "O", "Neil"}; "j", "2", "se", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer(); Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
} }
public void testCatenateAll() throws IOException { public void testCatenateAll() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") .put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true") .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.build()); .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
@ -101,11 +113,13 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
} }
public void testSplitOnCaseChange() throws IOException { public void testSplitOnCaseChange() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") .put("index.analysis.filter.my_word_delimiter.type", type)
.build()); .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot"; String source = "PowerShot";
String[] expected = new String[]{"PowerShot"}; String[] expected = new String[]{"PowerShot"};
@ -115,30 +129,35 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
} }
public void testPreserveOriginal() throws IOException { public void testPreserveOriginal() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true") .put("index.analysis.filter.my_word_delimiter.type", type)
.build()); .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42",
"wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
"O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer(); Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
} }
public void testStemEnglishPossessive() throws IOException { public void testStemEnglishPossessive() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") .put("index.analysis.filter.my_word_delimiter.type", type)
.build()); .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
"se", "O", "Neil", "s"}; "4000", "j", "2", "se", "O", "Neil", "s" };
Tokenizer tokenizer = new WhitespaceTokenizer(); Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected); assertTokenStreamContents(tokenFilter.create(tokenizer), expected);

View File

@ -0,0 +1,36 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
public class CommonAnalysisClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
public CommonAnalysisClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
super(testCandidate);
}
@ParametersFactory
public static Iterable<Object[]> parameters() throws Exception {
return ESClientYamlSuiteTestCase.createParameters();
}
}

View File

@ -0,0 +1,86 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.AnalysisFactoryTestCase;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;
public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
@Override
protected Map<String, Class<?>> getTokenizers() {
Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
return tokenizers;
}
@Override
protected Map<String, Class<?>> getTokenFilters() {
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class);
filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class);
return filters;
}
@Override
protected Map<String, Class<?>> getCharFilters() {
Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
return filters;
}
/**
* Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
* hasn't been marked in this class with its proper factory.
*/
public void testAllTokenizersMarked() {
markedTestCase("char filter", getTokenizers());
}
/**
* Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but
* hasn't been marked in this class with its proper factory.
*/
public void testAllCharFiltersMarked() {
markedTestCase("char filter", getCharFilters());
}
/**
* Fails if a token filter is marked in the superclass with {@link MovedToAnalysisCommon} but
* hasn't been marked in this class with its proper factory.
*/
public void testAllTokenFiltersMarked() {
markedTestCase("token filter", getTokenFilters());
}
private void markedTestCase(String name, Map<String, Class<?>> map) {
List<String> unmarked = map.entrySet().stream()
.filter(e -> e.getValue() == MovedToAnalysisCommon.class)
.map(Map.Entry::getKey)
.sorted()
.collect(toList());
assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common "
+ "but not mapped here", emptyList(), unmarked);
}
}
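The factory classes mapped above are the ones the new module exposes through the standard AnalysisPlugin extension point. A schematic sketch of that registration follows (the concrete CommonAnalysisPlugin in this change may differ in detail; the example class name is invented, the extension-point signatures are the existing ones):

package org.elasticsearch.analysis.common;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;

public class ExampleAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        // The factory constructors already match AnalysisProvider#get, so plain
        // constructor references are enough to register them under their setting names.
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        return filters;
    }
}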

View File

@ -0,0 +1,154 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.test.ESIntegTestCase;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.startsWith;
public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Arrays.asList(CommonAnalysisPlugin.class);
}
public void testNgramHighlightingWithBrokenPositions() throws IOException {
assertAcked(prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.field("type", "text")
.startObject("fields")
.startObject("autocomplete")
.field("type", "text")
.field("analyzer", "autocomplete")
.field("search_analyzer", "search_autocomplete")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
.endObject())
.setSettings(Settings.builder()
.put(indexSettings())
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
.put("analysis.tokenizer.autocomplete.type", "nGram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putArray("analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM",
"# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM",
", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM",
"/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM",
"@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM",
"{ => ALPHANUM")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", false)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
.putArray("analysis.analyzer.autocomplete.filter",
"lowercase", "wordDelimiter")
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
.putArray("analysis.analyzer.search_autocomplete.filter",
"lowercase", "wordDelimiter")));
client().prepareIndex("test", "test", "1")
.setSource("name", "ARCOTEL Hotels Deutschland").get();
refresh();
SearchResponse search = client().prepareSearch("test").setTypes("test")
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
.highlighter(new HighlightBuilder().field("name.autocomplete")).get();
assertHighlight(search, 0, "name.autocomplete", 0,
equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
}
public void testMultiPhraseCutoff() throws IOException {
/*
* MultiPhraseQuery can literally kill an entire node if there are too many terms in the
* query. We cut off and extract terms if there are more than 16 terms in the query
*/
assertAcked(prepareCreate("test")
.addMapping("test", "body", "type=text,analyzer=custom_analyzer,"
+ "search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
.setSettings(
Settings.builder().put(indexSettings())
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
.putArray("analysis.analyzer.custom_analyzer.filter",
"lowercase", "wordDelimiter"))
);
ensureGreen();
client().prepareIndex("test", "test", "1")
.setSource("body", "Test: http://www.facebook.com http://elasticsearch.org "
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this is "
+ "a test for highlighting feature Test: http://www.facebook.com "
+ "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
+ "http://twitter.com this is a test for highlighting feature")
.get();
refresh();
SearchResponse search = client().prepareSearch()
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
.highlighter(new HighlightBuilder().field("body")).get();
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
search = client()
.prepareSearch()
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "
+ "http://elasticsearch.org http://xing.com http://cnn.com "
+ "http://quora.com http://twitter.com this is a test for highlighting "
+ "feature Test: http://www.facebook.com http://elasticsearch.org "
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
+ "is a test for highlighting feature"))
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
}
}

View File

@ -0,0 +1,72 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import java.util.Arrays;
import java.util.Collection;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
public class QueryStringWithAnalyzersTests extends ESIntegTestCase {
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Arrays.asList(CommonAnalysisPlugin.class);
}
/**
* Validates that we properly split fields using the word delimiter filter in query_string.
*/
public void testCustomWordDelimiterQueryString() {
assertAcked(client().admin().indices().prepareCreate("test")
.setSettings("analysis.analyzer.my_analyzer.type", "custom",
"analysis.analyzer.my_analyzer.tokenizer", "whitespace",
"analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
"analysis.filter.custom_word_delimiter.type", "word_delimiter",
"analysis.filter.custom_word_delimiter.generate_word_parts", "true",
"analysis.filter.custom_word_delimiter.generate_number_parts", "false",
"analysis.filter.custom_word_delimiter.catenate_numbers", "true",
"analysis.filter.custom_word_delimiter.catenate_words", "false",
"analysis.filter.custom_word_delimiter.split_on_case_change", "false",
"analysis.filter.custom_word_delimiter.split_on_numerics", "false",
"analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
.addMapping("type1",
"field1", "type=text,analyzer=my_analyzer",
"field2", "type=text,analyzer=my_analyzer"));
client().prepareIndex("test", "type1", "1").setSource(
"field1", "foo bar baz",
"field2", "not needed").get();
refresh();
SearchResponse response = client()
.prepareSearch("test")
.setQuery(
queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
.field("field1").field("field2")).get();
assertHitCount(response, 1L);
}
}

View File

@ -16,52 +16,62 @@
* specific language governing permissions and limitations * specific language governing permissions and limitations
* under the License. * under the License.
*/ */
package org.elasticsearch.index.analysis; package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { public class WordDelimiterGraphTokenFilterFactoryTests
extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterGraphTokenFilterFactoryTests() { public WordDelimiterGraphTokenFilterFactoryTests() {
super("word_delimiter_graph"); super("word_delimiter_graph");
} }
public void testMultiTerms() throws IOException { public void testMultiTerms() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true") .put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true") .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.build()); .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42", String[] expected = new String[] { "PowerShot", "PowerShot", "Power", "Shot", "50042",
"wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se", "500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi",
"ONeil", "O'Neil's", "O", "Neil" }; "fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer(); Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source)); tokenizer.setReader(new StringReader(source));
int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}; int[] expectedIncr = new int[] { 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1}; 1, 1, 1, 0, 0, 1 };
int[] expectedPosLen = new int[] { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3,
1, 1, 1, 2, 2, 1, 1 };
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
expectedIncr, expectedPosLen, null); expectedIncr, expectedPosLen, null);
} }
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ /**
* Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
*/
public void testPartsAndCatenate() throws IOException { public void testPartsAndCatenate() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true") .put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.build()); .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot"; String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1}; int[] expectedIncr = new int[]{1, 0, 1};

View File

@ -16,31 +16,38 @@
* specific language governing permissions and limitations * specific language governing permissions and limitations
* under the License. * under the License.
*/ */
package org.elasticsearch.index.analysis; package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { public class WordDelimiterTokenFilterFactoryTests
extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterTokenFilterFactoryTests() { public WordDelimiterTokenFilterFactoryTests() {
super("word_delimiter"); super("word_delimiter");
} }
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ /**
* Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
*/
public void testPartsAndCatenate() throws IOException { public void testPartsAndCatenate() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) Settings.builder()
.put("index.analysis.filter.my_word_delimiter.type", type) .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true") .put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.build()); .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot"; String source = "PowerShot";
String[] expected = new String[]{"Power", "PowerShot", "Shot" }; String[] expected = new String[]{"Power", "PowerShot", "Shot" };

View File

@ -0,0 +1,11 @@
"Module loaded":
- do:
cluster.state: {}
# Get master node id
- set: { master_node: master }
- do:
nodes.info: {}
- match: { nodes.$master.modules.0.name: analysis-common }

View File

@ -0,0 +1,11 @@
## Smoke tests for analyzers included in the analysis-common module
"whitespace":
- do:
indices.analyze:
body:
text: Foo Bar!
analyzer: whitespace
- length: { tokens: 2 }
- match: { tokens.0.token: Foo }
- match: { tokens.1.token: Bar! }

View File

@ -0,0 +1,27 @@
## Smoke tests for tokenizers included in the analysis-common module
"keyword":
- do:
indices.analyze:
body:
text: Foo Bar!
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: Foo Bar! }
---
"nGram":
- do:
indices.analyze:
body:
text: good
explain: true
tokenizer:
type: nGram
min_gram: 2
max_gram: 2
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: go }
- match: { detail.tokenizer.tokens.1.token: oo }
- match: { detail.tokenizer.tokens.2.token: od }
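The anonymous nGram definition above boils down to Lucene's NGramTokenizer with min_gram = max_gram = 2. A standalone sketch producing the same three bigrams (illustrative only, not part of this change; the class name is invented):

import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.StringReader;

public class NGramTokenizerSketch {
    public static void main(String[] args) throws Exception {
        NGramTokenizer tokenizer = new NGramTokenizer(2, 2); // min_gram, max_gram
        tokenizer.setReader(new StringReader("good"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term); // go, oo, od
        }
        tokenizer.end();
        tokenizer.close();
    }
}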

View File

@ -0,0 +1,82 @@
## Smoke tests for token filters included in the analysis-common module
"asciifolding":
- do:
indices.analyze:
body:
text: Musée d'Orsay
tokenizer: keyword
filter: [asciifolding]
- length: { tokens: 1 }
- match: { tokens.0.token: Musee d'Orsay }
---
"lowercase":
- do:
indices.analyze:
body:
text: Foo Bar!
tokenizer: keyword
filter: [lowercase]
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar! }
---
"word_delimiter":
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter: [word_delimiter]
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu }
- match: { tokens.2.token: "1" }
- match: { tokens.3.token: ck }
- match: { tokens.4.token: brown }
- match: { tokens.5.token: fox }
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter:
- type: word_delimiter
split_on_numerics: false
- length: { tokens: 4 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu1ck }
- match: { tokens.2.token: brown }
- match: { tokens.3.token: fox }
---
"word_delimiter_graph":
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter: [word_delimiter_graph]
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu }
- match: { tokens.2.token: "1" }
- match: { tokens.3.token: ck }
- match: { tokens.4.token: brown }
- match: { tokens.5.token: fox }
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter:
- type: word_delimiter_graph
split_on_numerics: false
- length: { tokens: 4 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu1ck }
- match: { tokens.2.token: brown }
- match: { tokens.3.token: fox }

View File

@ -0,0 +1,13 @@
## Smoke tests for char filters included in the analysis-common module
"mapping":
- do:
indices.analyze:
body:
text: jeff quit phish
tokenizer: keyword
char_filter:
- type: mapping
mappings: ["ph => f", "qu => q"]
- length: { tokens: 1 }
- match: { tokens.0.token: "jeff qit fish" }

View File

@ -19,14 +19,9 @@
package org.elasticsearch.index.analysis; package org.elasticsearch.index.analysis;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.AnalysisFactoryTestCase; import org.elasticsearch.AnalysisFactoryTestCase;
import org.elasticsearch.Version; import org.elasticsearch.Version;
@ -37,6 +32,10 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory; import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase { public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase {
@Override @Override

View File

@ -1,29 +1,11 @@
# Will be performed before each test as a part of the test setup
#
setup:
- do:
ping: {}
---
"Basic test": "Basic test":
- do: - do:
indices.analyze: indices.analyze:
body: body:
text: Foo Bar text: Foo Bar
- length: { tokens: 2 } - length: { tokens: 2 }
- match: { tokens.0.token: foo } - match: { tokens.0.token: foo }
- match: { tokens.1.token: bar } - match: { tokens.1.token: bar }
---
"Tokenizer and filter":
- do:
indices.analyze:
body:
filter: [lowercase]
text: Foo Bar
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar }
--- ---
"Index and field": "Index and field":
@ -36,7 +18,7 @@ setup:
properties: properties:
text: text:
type: text type: text
analyzer: whitespace analyzer: standard
- do: - do:
indices.analyze: indices.analyze:
@ -45,84 +27,51 @@ setup:
field: text field: text
text: Foo Bar! text: Foo Bar!
- length: { tokens: 2 } - length: { tokens: 2 }
- match: { tokens.0.token: Foo } - match: { tokens.0.token: foo }
- match: { tokens.1.token: Bar! } - match: { tokens.1.token: bar }
---
"JSON in Body":
- do:
indices.analyze:
body: { "text": "Foo Bar", "filter": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: foo bar }
--- ---
"Array text": "Array text":
- do: - do:
indices.analyze: indices.analyze:
body: { "text": ["Foo Bar", "Baz"], "filter": ["lowercase"], "tokenizer": keyword } body:
- length: {tokens: 2 } text: ["Foo Bar", "Baz"]
- match: { tokens.0.token: foo bar } tokenizer: standard
- match: { tokens.1.token: baz } - length: { tokens: 3 }
- match: { tokens.0.token: Foo }
- match: { tokens.1.token: Bar }
- match: { tokens.2.token: Baz }
--- ---
"Detail response with Analyzer": "Detail response with Analyzer":
- do: - do:
indices.analyze: indices.analyze:
body: {"text": "This is troubled", "analyzer": standard, "explain": "true"} body:
text: This is troubled
analyzer: standard
explain: true
- length: { detail.analyzer.tokens: 3 } - length: { detail.analyzer.tokens: 3 }
- match: { detail.analyzer.name: standard } - match: { detail.analyzer.name: standard }
- match: { detail.analyzer.tokens.0.token: this } - match: { detail.analyzer.tokens.0.token: this }
- match: { detail.analyzer.tokens.1.token: is } - match: { detail.analyzer.tokens.1.token: is }
- match: { detail.analyzer.tokens.2.token: troubled } - match: { detail.analyzer.tokens.2.token: troubled }
---
"Detail output spcified attribute":
- do:
indices.analyze:
body: {"text": "<text>This is troubled</text>", "char_filter": ["html_strip"], "filter": ["snowball"], "tokenizer": standard, "explain": true, "attributes": ["keyword"]}
- length: { detail.charfilters: 1 }
- length: { detail.tokenizer.tokens: 3 }
- length: { detail.tokenfilters.0.tokens: 3 }
- match: { detail.tokenizer.name: standard }
- match: { detail.tokenizer.tokens.0.token: This }
- match: { detail.tokenizer.tokens.1.token: is }
- match: { detail.tokenizer.tokens.2.token: troubled }
- match: { detail.tokenfilters.0.name: snowball }
- match: { detail.tokenfilters.0.tokens.0.token: This }
- match: { detail.tokenfilters.0.tokens.1.token: is }
- match: { detail.tokenfilters.0.tokens.2.token: troubl }
- match: { detail.tokenfilters.0.tokens.2.keyword: false }
--- ---
"Custom filter in request": "Custom filter in request":
- do: - do:
indices.analyze: indices.analyze:
body: { "text": "Foo Bar Buzz", "filter": ["lowercase", { "type": "stop", "stopwords": ["foo", "buzz"]}], "tokenizer": whitespace, "explain": true } body:
- length: {detail.tokenizer.tokens: 3 } text: foo bar buzz
- length: {detail.tokenfilters.0.tokens: 3 } tokenizer: standard
- length: {detail.tokenfilters.1.tokens: 1 } explain: true
- match: { detail.tokenizer.name: whitespace } filter:
- match: { detail.tokenizer.tokens.0.token: Foo } - type: stop
- match: { detail.tokenizer.tokens.1.token: Bar } stopwords: ["foo", "buzz"]
- match: { detail.tokenizer.tokens.2.token: Buzz } - length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenfilters.0.name: lowercase } - length: { detail.tokenfilters.0.tokens: 1 }
- match: { detail.tokenfilters.0.tokens.0.token: foo } - match: { detail.tokenizer.name: standard }
- match: { detail.tokenfilters.0.tokens.1.token: bar } - match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenfilters.0.tokens.2.token: buzz } - match: { detail.tokenizer.tokens.1.token: bar }
- match: { detail.tokenfilters.1.name: "_anonymous_tokenfilter_[1]" } - match: { detail.tokenizer.tokens.2.token: buzz }
- match: { detail.tokenfilters.1.tokens.0.token: bar } - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
--- - match: { detail.tokenfilters.0.tokens.0.token: bar }
"Custom char_filter in request":
- do:
indices.analyze:
body: { "text": "jeff quit phish", "char_filter": [{"type": "mapping", "mappings": ["ph => f", "qu => q"]}], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: "jeff qit fish" }
---
"Custom tokenizer in request":
- do:
indices.analyze:
body: { "text": "good", "tokenizer": {"type": "nGram", "min_gram": 2, "max_gram": 2}, "explain": true }
- length: {detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: go }
- match: { detail.tokenizer.tokens.1.token: oo }
- match: { detail.tokenizer.tokens.2.token: od }

View File

@ -26,14 +26,15 @@ List projects = [
'test:fixtures:hdfs-fixture', 'test:fixtures:hdfs-fixture',
'test:logger-usage', 'test:logger-usage',
'modules:aggs-matrix-stats', 'modules:aggs-matrix-stats',
'modules:analysis-common',
'modules:ingest-common', 'modules:ingest-common',
'modules:lang-expression', 'modules:lang-expression',
'modules:lang-mustache', 'modules:lang-mustache',
'modules:lang-painless', 'modules:lang-painless',
'modules:transport-netty4',
'modules:reindex',
'modules:percolator', 'modules:percolator',
'modules:reindex',
'modules:repository-url', 'modules:repository-url',
'modules:transport-netty4',
'plugins:analysis-icu', 'plugins:analysis-icu',
'plugins:analysis-kuromoji', 'plugins:analysis-kuromoji',
'plugins:analysis-phonetic', 'plugins:analysis-phonetic',

View File

@ -20,14 +20,12 @@
package org.elasticsearch; package org.elasticsearch;
import org.apache.lucene.analysis.en.PorterStemFilterFactory; import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ApostropheFilterFactory;
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
@ -92,7 +90,6 @@ import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory; import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCharFilters; import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
@ -110,7 +107,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
/** /**
* Alerts us if new analyzers are added to lucene, so we don't miss them. * Alerts us if new analysis components are added to Lucene, so we don't miss them.
* <p> * <p>
* If we don't want to expose one for a specific reason, just map it to Void. * If we don't want to expose one for a specific reason, just map it to Void.
* The deprecated ones can be mapped to Deprecated.class. * The deprecated ones can be mapped to Deprecated.class.
@ -178,7 +175,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
.put("apostrophe", ApostropheFilterFactory.class) .put("apostrophe", ApostropheFilterFactory.class)
.put("arabicnormalization", ArabicNormalizationFilterFactory.class) .put("arabicnormalization", ArabicNormalizationFilterFactory.class)
.put("arabicstem", ArabicStemTokenFilterFactory.class) .put("arabicstem", ArabicStemTokenFilterFactory.class)
.put("asciifolding", ASCIIFoldingTokenFilterFactory.class) .put("asciifolding", MovedToAnalysisCommon.class)
.put("brazilianstem", BrazilianStemTokenFilterFactory.class) .put("brazilianstem", BrazilianStemTokenFilterFactory.class)
.put("bulgarianstem", StemmerTokenFilterFactory.class) .put("bulgarianstem", StemmerTokenFilterFactory.class)
.put("cjkbigram", CJKBigramFilterFactory.class) .put("cjkbigram", CJKBigramFilterFactory.class)
@ -253,8 +250,8 @@ public class AnalysisFactoryTestCase extends ESTestCase {
.put("turkishlowercase", LowerCaseTokenFilterFactory.class) .put("turkishlowercase", LowerCaseTokenFilterFactory.class)
.put("type", KeepTypesFilterFactory.class) .put("type", KeepTypesFilterFactory.class)
.put("uppercase", UpperCaseTokenFilterFactory.class) .put("uppercase", UpperCaseTokenFilterFactory.class)
.put("worddelimiter", WordDelimiterTokenFilterFactory.class) .put("worddelimiter", MovedToAnalysisCommon.class)
.put("worddelimitergraph", WordDelimiterGraphFilterFactory.class) .put("worddelimitergraph", MovedToAnalysisCommon.class)
.put("flattengraph", FlattenGraphTokenFilterFactory.class) .put("flattengraph", FlattenGraphTokenFilterFactory.class)
// TODO: these tokenfilters are not yet exposed: useful? // TODO: these tokenfilters are not yet exposed: useful?
@ -401,6 +398,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
} }
} }
expected.remove(Void.class); expected.remove(Void.class);
expected.remove(MovedToAnalysisCommon.class);
expected.remove(Deprecated.class); expected.remove(Deprecated.class);
Collection<Class<?>> actual = new HashSet<>(); Collection<Class<?>> actual = new HashSet<>();
@ -489,4 +487,11 @@ public class AnalysisFactoryTestCase extends ESTestCase {
classesThatShouldNotHaveMultiTermSupport.isEmpty()); classesThatShouldNotHaveMultiTermSupport.isEmpty());
} }
/**
* Marker class for components that have moved to the analysis-common module. This will be
* removed when the module is complete and these analysis components are no longer available to core.
*/
protected static final class MovedToAnalysisCommon {
private MovedToAnalysisCommon() {}
}
} }
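As a reading aid for the marker-class approach described in the comments above, the following is a simplified, self-contained sketch of how entries mapped to Void, Deprecated, or MovedToAnalysisCommon are dropped from the expected set before it is compared with what core actually provides; the map contents are illustrative stand-ins, not the real test's full tables:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class MovedMarkerSketch {
    /** Mirrors the private marker class added above. */
    static final class MovedToAnalysisCommon {
        private MovedToAnalysisCommon() {}
    }

    public static void main(String[] args) {
        // Tiny stand-in for the "known token filters" table; the real test maps Lucene SPI names
        // to Elasticsearch factory classes.
        Map<String, Class<?>> known = new HashMap<>();
        known.put("asciifolding", MovedToAnalysisCommon.class);  // now lives in analysis-common
        known.put("worddelimiter", MovedToAnalysisCommon.class); // now lives in analysis-common
        known.put("uppercase", Object.class);                    // placeholder for a factory still in core
        known.put("hypotheticalfilter", Void.class);             // hypothetical entry, deliberately not exposed

        // Only classes that core itself must still provide survive the filtering.
        Set<Class<?>> expected = new HashSet<>(known.values());
        expected.remove(Void.class);
        expected.remove(Deprecated.class);
        expected.remove(MovedToAnalysisCommon.class);
        System.out.println(expected); // here only the "still in core" placeholder remains
    }
}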

View File

@ -25,17 +25,18 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment; import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule; import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays;
import static java.util.Collections.emptyList;
public class AnalysisTestsHelper { public class AnalysisTestsHelper {
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, String resource) throws IOException { public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir,
String resource) throws IOException {
Settings settings = Settings.builder() Settings settings = Settings.builder()
.loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource)) .loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
.put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString()) .put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
@ -45,12 +46,15 @@ public class AnalysisTestsHelper {
} }
public static ESTestCase.TestAnalysis createTestAnalysisFromSettings( public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(
Settings settings) throws IOException { Settings settings, AnalysisPlugin... plugins) throws IOException {
if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) { if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) {
settings = Settings.builder().put(settings).put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); settings = Settings.builder().put(settings)
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
} }
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisRegistry analysisRegistry = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry(); AnalysisRegistry analysisRegistry =
new AnalysisModule(new Environment(settings), Arrays.asList(plugins))
.getAnalysisRegistry();
return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings), return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings),
analysisRegistry.buildTokenFilterFactories(indexSettings), analysisRegistry.buildTokenFilterFactories(indexSettings),
analysisRegistry.buildTokenizerFactories(indexSettings), analysisRegistry.buildTokenizerFactories(indexSettings),