Mirror of https://github.com/honeymoose/OpenSearch.git (synced 2025-03-25 09:28:27 +00:00)

Start building analysis-common module (#23614)

Start moving built-in analysis components into the new analysis-common module. The goals of this project are:
1. Remove core's dependency on lucene-analyzers-common.jar, which should shrink the dependencies for the transport client and the high level rest client.
2. Prove that analysis plugins can do all the "built in" things by moving all "built in" behavior to a plugin.
3. Force tests not to depend on any oddball analyzer behavior. If tests need anything more than the standard analyzer they can use the mock analyzer provided by Lucene's test infrastructure.

This commit is contained in:
parent 151a65ed17
commit caf376c8af
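The extension point this work relies on is AnalysisPlugin.getTokenFilters(), which the new CommonAnalysisPlugin below implements. As orientation before the diff, here is a minimal sketch of a hypothetical analysis plugin registering one token filter; the package, class, and filter name are invented for illustration, and only the extension point itself comes from this commit.

package org.example.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.Map;

import static java.util.Collections.singletonMap;

/** Hypothetical plugin used only to illustrate the AnalysisPlugin extension point. */
public class DemoAnalysisPlugin extends Plugin implements AnalysisPlugin {

    /** Factory built from index settings, following the same pattern as the factories in this commit. */
    static class DemoLowercaseFactory extends AbstractTokenFilterFactory {
        DemoLowercaseFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
            super(indexSettings, name, settings);
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            // Wrap the stream in any Lucene filter; the package location assumes the Lucene version of this era.
            return new LowerCaseFilter(tokenStream);
        }
    }

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        // "demo_lowercase" becomes usable as a token filter type in index analysis settings.
        return singletonMap("demo_lowercase", DemoLowercaseFactory::new);
    }
}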
@@ -1096,7 +1096,6 @@
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]MergeSchedulerConfig.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SearchSlowLog.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionType.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractCharFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractIndexAnalyzerProvider.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractTokenFilterFactory.java" checks="LineLength" />
@@ -1225,8 +1224,6 @@
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]UpperCaseTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceAnalyzerProvider.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceTokenizerFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]AbstractCompoundWordTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]DictionaryCompoundWordTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
@@ -2686,11 +2683,8 @@
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SettingsListenerIT.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionTypeTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]WaitUntilRefreshIT.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactoryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisRegistryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTestsHelper.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]BaseWordDelimiterTokenFilterFactoryTestCase.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CJKFilterFactoryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CharFilterTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CompoundAnalysisTests.java" checks="LineLength" />
@@ -2709,8 +2703,6 @@
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerTokenFilterFactoryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopAnalyzerTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopTokenFilterTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactoryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactoryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]commongrams[/\\]CommonGramsTokenFilterFactoryTests.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]filter1[/\\]MyFilterTokenFilterFactory.java" checks="LineLength" />
  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]synonyms[/\\]SynonymsAnalysisTests.java" checks="LineLength" />
@@ -71,4 +71,9 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
        return result;
    }

    @Override
    public boolean breaksFastVectorHighlighter() {
        return true;
    }
}
@@ -20,10 +20,20 @@
package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;

public interface TokenFilterFactory {

    String name();

    TokenStream create(TokenStream tokenStream);

    /**
     * Does this analyzer mess up the {@link OffsetAttribute}s in such as way as to break the
     * {@link FastVectorHighlighter}? If this is {@code true} then the
     * {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets.
     */
    default boolean breaksFastVectorHighlighter() {
        return false;
    }
}
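For reference, a factory opts into this new hook by overriding the default method, exactly as the edge-ngram and word-delimiter factories do elsewhere in this commit. A small hypothetical factory (names invented for illustration) would look like:

class OffsetManglingTokenFilterFactory extends AbstractTokenFilterFactory {
    OffsetManglingTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return tokenStream; // imagine a filter here that produces non-increasing offsets
    }

    @Override
    public boolean breaksFastVectorHighlighter() {
        // Signals the FastVectorHighlighter to re-sort fragments by offset instead of position.
        return true;
    }
}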
@@ -25,7 +25,6 @@ import org.elasticsearch.common.NamedRegistry;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@@ -140,8 +139,6 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
@@ -205,7 +202,6 @@ public final class AnalysisModule {
        NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
        tokenFilters.register("stop", StopTokenFilterFactory::new);
        tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
        tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        tokenFilters.register("length", LengthTokenFilterFactory::new);
        tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
        tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
@@ -225,8 +221,6 @@ public final class AnalysisModule {
        tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
        tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
        tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
        tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
        tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
        tokenFilters.register("elision", ElisionTokenFilterFactory::new);
        tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);
@@ -26,15 +26,9 @@ import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.Version;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.mapper.FieldMapper;

import java.util.Comparator;
@@ -56,7 +50,7 @@ public final class FragmentBuilderHelper {
    public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) {
        assert fragInfo != null : "FragInfo must not be null";
        assert mapper.fieldType().name().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name();
        if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.fieldType().indexAnalyzer()))) {
        if (!fragInfo.getSubInfos().isEmpty() && containsBrokenAnalysis(mapper.fieldType().indexAnalyzer())) {
            /* This is a special case where broken analysis like WDF is used for term-vector creation at index-time
             * which can potentially mess up the offsets. To prevent a SAIIOBException we need to resort
             * the fragments based on their offsets rather than using soley the positions as it is done in
@@ -91,8 +85,7 @@ public final class FragmentBuilderHelper {
            final CustomAnalyzer a = (CustomAnalyzer) analyzer;
            TokenFilterFactory[] tokenFilters = a.tokenFilters();
            for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
                if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory
                        || tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) {
                if (tokenFilterFactory.breaksFastVectorHighlighter()) {
                    return true;
                }
            }
@ -18,6 +18,8 @@
|
||||
*/
|
||||
package org.elasticsearch.action.admin.indices;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
|
||||
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
|
||||
@ -27,18 +29,28 @@ import org.elasticsearch.common.UUIDs;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.AnalysisRegistry;
|
||||
import org.elasticsearch.index.analysis.IndexAnalyzers;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.index.mapper.AllFieldMapper;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.util.Collections.emptyList;
|
||||
import static java.util.Collections.singletonList;
|
||||
import static java.util.Collections.singletonMap;
|
||||
|
||||
/**
|
||||
* Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the
|
||||
* {@code common-analysis} module.
|
||||
*/
|
||||
public class TransportAnalyzeActionTests extends ESTestCase {
|
||||
|
||||
private IndexAnalyzers indexAnalyzers;
|
||||
@ -53,23 +65,28 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
|
||||
.put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
|
||||
.put("index.analysis.tokenizer.trigram.type", "ngram")
|
||||
.put("index.analysis.tokenizer.trigram.min_gram", 3)
|
||||
.put("index.analysis.tokenizer.trigram.max_gram", 3)
|
||||
.put("index.analysis.filter.synonym.type", "synonym")
|
||||
.putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay")
|
||||
.put("index.analysis.filter.synonym.tokenizer", "trigram")
|
||||
.put("index.analysis.filter.synonym.min_gram", 3)
|
||||
.put("index.analysis.filter.synonym.max_gram", 3).build();
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
|
||||
.put("index.analysis.analyzer.custom_analyzer.filter", "mock").build();
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
environment = new Environment(settings);
|
||||
registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry();
|
||||
AnalysisPlugin plugin = new AnalysisPlugin() {
|
||||
class MockFactory extends AbstractTokenFilterFactory {
|
||||
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
|
||||
return singletonMap("mock", MockFactory::new);
|
||||
}
|
||||
};
|
||||
registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
|
||||
indexAnalyzers = registry.build(idxSettings);
|
||||
}
|
||||
|
||||
@ -143,51 +160,44 @@ public class TransportAnalyzeActionTests extends ESTestCase {
|
||||
}
|
||||
|
||||
public void testWithIndexAnalyzers() throws IOException {
|
||||
|
||||
AnalyzeRequest request = new AnalyzeRequest();
|
||||
request.analyzer("standard");
|
||||
request.text("the quick brown fox");
|
||||
request.analyzer("custom_analyzer");
|
||||
request.text("the qu1ck brown fox");
|
||||
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
|
||||
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("quick", tokens.get(0).getTerm());
|
||||
assertEquals("brown", tokens.get(1).getTerm());
|
||||
assertEquals("fox", tokens.get(2).getTerm());
|
||||
|
||||
request.analyzer("whitespace");
|
||||
request.text("the qu1ck brown fox-dog");
|
||||
request.analyzer("standard");
|
||||
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(4, tokens.size());
|
||||
|
||||
request.analyzer("custom_analyzer");
|
||||
request.text("the qu1ck brown fox-dog");
|
||||
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(5, tokens.size());
|
||||
|
||||
request.analyzer(null);
|
||||
request.tokenizer("whitespace");
|
||||
request.addTokenFilter("lowercase");
|
||||
request.addTokenFilter("wordDelimiter");
|
||||
request.text("the qu1ck brown fox-dog");
|
||||
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(5, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
assertEquals("qu1ck", tokens.get(1).getTerm());
|
||||
assertEquals("quick", tokens.get(1).getTerm());
|
||||
assertEquals("brown", tokens.get(2).getTerm());
|
||||
assertEquals("fox", tokens.get(3).getTerm());
|
||||
assertEquals("dog", tokens.get(4).getTerm());
|
||||
|
||||
// Switch the analyzer out for just a tokenizer
|
||||
request.analyzer(null);
|
||||
request.tokenizer("trigram");
|
||||
request.addTokenFilter("synonym");
|
||||
request.text("kimchy");
|
||||
request.tokenizer("standard");
|
||||
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(2, tokens.size());
|
||||
assertEquals("sha", tokens.get(0).getTerm());
|
||||
assertEquals("hay", tokens.get(1).getTerm());
|
||||
assertEquals(4, tokens.size());
|
||||
assertEquals("the", tokens.get(0).getTerm());
|
||||
assertEquals("quick", tokens.get(1).getTerm());
|
||||
assertEquals("brown", tokens.get(2).getTerm());
|
||||
assertEquals("fox", tokens.get(3).getTerm());
|
||||
|
||||
// Now try applying our token filter
|
||||
request.addTokenFilter("mock");
|
||||
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
|
||||
tokens = analyze.getTokens();
|
||||
assertEquals(3, tokens.size());
|
||||
assertEquals("quick", tokens.get(0).getTerm());
|
||||
assertEquals("brown", tokens.get(1).getTerm());
|
||||
assertEquals("fox", tokens.get(2).getTerm());
|
||||
}
|
||||
|
||||
public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException {
|
||||
|
@@ -22,5 +22,5 @@ package org.elasticsearch.index.analysis;
import org.elasticsearch.AnalysisFactoryTestCase;

public class AnalysisFactoryTests extends AnalysisFactoryTestCase {
    // tests are inherited
    // tests are inherited and nothing needs to be defined here
}
@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.en.EnglishAnalyzer;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
@ -31,17 +32,20 @@ import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
|
||||
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
import org.elasticsearch.test.VersionUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.util.Collections.emptyList;
|
||||
import static java.util.Collections.emptyMap;
|
||||
import static java.util.Collections.singletonList;
|
||||
import static java.util.Collections.singletonMap;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.instanceOf;
|
||||
@ -112,51 +116,73 @@ public class AnalysisRegistryTests extends ESTestCase {
|
||||
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide.
|
||||
*/
|
||||
public void testConfigureCamelCaseTokenFilter() throws IOException {
|
||||
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
|
||||
Settings indexSettings = Settings.builder()
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
|
||||
.put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
|
||||
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
|
||||
.put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
|
||||
.putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build();
|
||||
.put("index.analysis.filter.testFilter.type", "mock")
|
||||
.put("index.analysis.filter.test_filter.type", "mock")
|
||||
.put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter")
|
||||
.put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard")
|
||||
.putArray("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter").build();
|
||||
|
||||
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
|
||||
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry()
|
||||
/* The snake_case version of the name should not filter out any stopwords while the
|
||||
* camelCase version will filter out English stopwords. */
|
||||
AnalysisPlugin plugin = new AnalysisPlugin() {
|
||||
class MockFactory extends AbstractTokenFilterFactory {
|
||||
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
if (name().equals("test_filter")) {
|
||||
return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
|
||||
}
|
||||
return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
|
||||
return singletonMap("mock", MockFactory::new);
|
||||
}
|
||||
};
|
||||
IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), singletonList(plugin)).getAnalysisRegistry()
|
||||
.build(idxSettings);
|
||||
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
|
||||
|
||||
// This shouldn't contain English stopwords
|
||||
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) {
|
||||
assertNotNull(custom_analyser);
|
||||
TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
|
||||
TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
|
||||
tokenStream.reset();
|
||||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
||||
List<String> token = new ArrayList<>();
|
||||
while(tokenStream.incrementToken()) {
|
||||
token.add(charTermAttribute.toString());
|
||||
}
|
||||
assertEquals(token.toString(), 2, token.size());
|
||||
assertEquals("j2se", token.get(0));
|
||||
assertEquals("j2ee", token.get(1));
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("has", charTermAttribute.toString());
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("foo", charTermAttribute.toString());
|
||||
assertFalse(tokenStream.incrementToken());
|
||||
}
|
||||
|
||||
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
|
||||
// This *should* contain English stopwords
|
||||
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) {
|
||||
assertNotNull(custom_analyser);
|
||||
TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
|
||||
TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
|
||||
tokenStream.reset();
|
||||
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
|
||||
List<String> token = new ArrayList<>();
|
||||
while(tokenStream.incrementToken()) {
|
||||
token.add(charTermAttribute.toString());
|
||||
}
|
||||
assertEquals(token.toString(), 6, token.size());
|
||||
assertEquals("j", token.get(0));
|
||||
assertEquals("2", token.get(1));
|
||||
assertEquals("se", token.get(2));
|
||||
assertEquals("j", token.get(3));
|
||||
assertEquals("2", token.get(4));
|
||||
assertEquals("ee", token.get(5));
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("has", charTermAttribute.toString());
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("a", charTermAttribute.toString());
|
||||
assertTrue(tokenStream.incrementToken());
|
||||
assertEquals("foo", charTermAttribute.toString());
|
||||
assertFalse(tokenStream.incrementToken());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,7 @@
|
||||
package org.elasticsearch.search.fetch.subphase.highlight;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
|
||||
|
||||
import org.apache.lucene.search.join.ScoreMode;
|
||||
import org.elasticsearch.action.index.IndexRequestBuilder;
|
||||
import org.elasticsearch.action.search.SearchRequestBuilder;
|
||||
@ -100,6 +101,7 @@ import static org.hamcrest.Matchers.not;
|
||||
import static org.hamcrest.Matchers.startsWith;
|
||||
|
||||
public class HighlighterSearchIT extends ESIntegTestCase {
|
||||
// TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
|
||||
private static final String[] ALL_TYPES = new String[] {"plain", "postings", "fvh", "unified"};
|
||||
private static final String[] UNIFIED_AND_NULL = new String[] {null, "unified"};
|
||||
|
||||
@ -113,12 +115,11 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||
mappings.startObject();
|
||||
mappings.startObject("type")
|
||||
.startObject("properties")
|
||||
.startObject("text")
|
||||
.field("type", "keyword")
|
||||
.field("store", true)
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject();
|
||||
.startObject("text")
|
||||
.field("type", "keyword")
|
||||
.field("store", true)
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
mappings.endObject();
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("type", mappings));
|
||||
@ -139,14 +140,13 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||
mappings.startObject();
|
||||
mappings.startObject("type")
|
||||
.startObject("properties")
|
||||
.startObject("text")
|
||||
.field("type", "text")
|
||||
.field("analyzer", "keyword")
|
||||
.field("index_options", "offsets")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject();
|
||||
.startObject("text")
|
||||
.field("type", "text")
|
||||
.field("analyzer", "keyword")
|
||||
.field("index_options", "offsets")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
mappings.endObject();
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("type", mappings));
|
||||
@ -166,23 +166,22 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||
mappings.startObject();
|
||||
mappings.startObject("type")
|
||||
.startObject("_source")
|
||||
.field("enabled", false)
|
||||
.field("enabled", false)
|
||||
.endObject()
|
||||
.startObject("properties")
|
||||
.startObject("unstored_field")
|
||||
.field("index_options", "offsets")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.field("type", "text")
|
||||
.field("store", false)
|
||||
.endObject()
|
||||
.startObject("text")
|
||||
.field("index_options", "offsets")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.field("type", "text")
|
||||
.field("store", true)
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject();
|
||||
.startObject("unstored_field")
|
||||
.field("index_options", "offsets")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.field("type", "text")
|
||||
.field("store", false)
|
||||
.endObject()
|
||||
.startObject("text")
|
||||
.field("index_options", "offsets")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.field("type", "text")
|
||||
.field("store", true)
|
||||
.endObject()
|
||||
.endObject().endObject();
|
||||
mappings.endObject();
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("type", mappings));
|
||||
@ -218,103 +217,6 @@ public class HighlighterSearchIT extends ESIntegTestCase {
|
||||
assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
|
||||
}
|
||||
|
||||
public void testNgramHighlightingWithBrokenPositions() throws IOException {
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("test", jsonBuilder()
|
||||
.startObject()
|
||||
.startObject("test")
|
||||
.startObject("properties")
|
||||
.startObject("name")
|
||||
.startObject("fields")
|
||||
.startObject("autocomplete")
|
||||
.field("type", "text")
|
||||
.field("analyzer", "autocomplete")
|
||||
.field("search_analyzer", "search_autocomplete")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.field("type", "text")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject())
|
||||
.setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("analysis.tokenizer.autocomplete.max_gram", 20)
|
||||
.put("analysis.tokenizer.autocomplete.min_gram", 1)
|
||||
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
|
||||
.put("analysis.tokenizer.autocomplete.type", "nGram")
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.putArray("analysis.filter.wordDelimiter.type_table",
|
||||
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
|
||||
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM",
|
||||
"+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM",
|
||||
"^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
|
||||
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM")
|
||||
|
||||
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
|
||||
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.generate_number_parts", false)
|
||||
.put("analysis.filter.wordDelimiter.catenate_words", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_all", false)
|
||||
|
||||
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
|
||||
.putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
|
||||
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
|
||||
.putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")));
|
||||
client().prepareIndex("test", "test", "1")
|
||||
.setSource("name", "ARCOTEL Hotels Deutschland").get();
|
||||
refresh();
|
||||
SearchResponse search = client().prepareSearch("test").setTypes("test")
|
||||
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
|
||||
.highlighter(new HighlightBuilder().field("name.autocomplete")).execute().actionGet();
|
||||
assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
|
||||
}
|
||||
|
||||
public void testMultiPhraseCutoff() throws IOException {
|
||||
/*
|
||||
* MultiPhraseQuery can literally kill an entire node if there are too many terms in the
|
||||
* query. We cut off and extract terms if there are more than 16 terms in the query
|
||||
*/
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("test",
|
||||
"body", "type=text,analyzer=custom_analyzer,search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
|
||||
.setSettings(
|
||||
Settings.builder().put(indexSettings())
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
|
||||
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_words", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_all", false)
|
||||
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter"))
|
||||
);
|
||||
|
||||
ensureGreen();
|
||||
client().prepareIndex("test", "test", "1")
|
||||
.setSource("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
|
||||
+ "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
|
||||
+ "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
|
||||
+ "http://twitter.com this is a test for highlighting feature")
|
||||
.get();
|
||||
refresh();
|
||||
SearchResponse search = client().prepareSearch().setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
|
||||
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
|
||||
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
|
||||
search = client()
|
||||
.prepareSearch()
|
||||
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
|
||||
+ "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
|
||||
+ "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
|
||||
+ "http://twitter.com this is a test for highlighting feature"))
|
||||
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
|
||||
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: <em>http://www.facebook.com</em> "
|
||||
+ "<em>http://elasticsearch.org</em> <em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
|
||||
}
|
||||
|
||||
public void testNgramHighlighting() throws IOException {
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("test",
|
||||
|
@ -1605,33 +1605,6 @@ public class SearchQueryIT extends ESIntegTestCase {
|
||||
assertHitCount(searchResponse, 2);
|
||||
}
|
||||
|
||||
// see #3898
|
||||
public void testCustomWordDelimiterQueryString() {
|
||||
assertAcked(client().admin().indices().prepareCreate("test")
|
||||
.setSettings("analysis.analyzer.my_analyzer.type", "custom",
|
||||
"analysis.analyzer.my_analyzer.tokenizer", "whitespace",
|
||||
"analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
|
||||
"analysis.filter.custom_word_delimiter.type", "word_delimiter",
|
||||
"analysis.filter.custom_word_delimiter.generate_word_parts", "true",
|
||||
"analysis.filter.custom_word_delimiter.generate_number_parts", "false",
|
||||
"analysis.filter.custom_word_delimiter.catenate_numbers", "true",
|
||||
"analysis.filter.custom_word_delimiter.catenate_words", "false",
|
||||
"analysis.filter.custom_word_delimiter.split_on_case_change", "false",
|
||||
"analysis.filter.custom_word_delimiter.split_on_numerics", "false",
|
||||
"analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
|
||||
.addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer"));
|
||||
|
||||
client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get();
|
||||
refresh();
|
||||
|
||||
SearchResponse response = client()
|
||||
.prepareSearch("test")
|
||||
.setQuery(
|
||||
queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
|
||||
.field("field1").field("field2")).get();
|
||||
assertHitCount(response, 1L);
|
||||
}
|
||||
|
||||
// see #3797
|
||||
public void testMultiMatchLenientIssue3797() {
|
||||
createIndex("test");
|
||||
|
modules/analysis-common/build.gradle (new file, 23 lines)
@@ -0,0 +1,23 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

esplugin {
  description 'Adds "built in" analyzers to Elasticsearch.'
  classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
}
@@ -17,7 +17,7 @@
 * under the License.
 */

package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@@ -25,20 +25,26 @@ import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;

/**
 * Factory for ASCIIFoldingFilter.
 */
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
    public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
    public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
        implements MultiTermAwareComponent {
    public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
    public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;

    private final boolean preserveOriginal;

    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
            String name, Settings settings) {
        super(indexSettings, name, settings);
        preserveOriginal = settings.getAsBooleanLenientForPreEs6Indices(
            indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
            indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(),
            DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
    }

    @Override
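As a usage sketch (mirroring the preserve_original test added later in this commit), the filter is configured through index settings; with preserve_original enabled, folding "Ansprüche" emits both "Anspruche" and the original "Ansprüche". The filter name "my_ascii_folding" below is taken from those tests, the rest is illustrative.

Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
        .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
        .build();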
@@ -0,0 +1,39 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;

public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        return filters;
    }
}
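To see how these registrations are exercised, the factory tests in this commit hand the plugin instance to the analysis test helper and then look the filter up by the name configured in the settings. A condensed sketch of that pattern (the settings path and filter name are illustrative, taken from the tests):

// Condensed from the test pattern in this commit; not a complete test.
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
        Settings.builder()
                .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
                .build(),
        new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");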
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -26,20 +26,22 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
|
||||
import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
|
||||
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
|
||||
import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.parseTypes;
|
||||
|
||||
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
@ -47,7 +49,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
|
||||
private final int flags;
|
||||
private final CharArraySet protoWords;
|
||||
|
||||
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
|
||||
String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
||||
// Sample Format for the type table:
|
||||
@ -82,7 +85,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
|
||||
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
|
||||
// If not null is the set of tokens to protect from being delimited
|
||||
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
|
||||
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
|
||||
settings, "protected_words");
|
||||
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
|
||||
this.flags = flags;
|
||||
}
|
@ -17,7 +17,7 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.Analysis;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
@ -52,7 +54,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
|
||||
private final int flags;
|
||||
private final CharArraySet protoWords;
|
||||
|
||||
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
|
||||
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
|
||||
String name, Settings settings) {
|
||||
super(indexSettings, name, settings);
|
||||
|
||||
// Sample Format for the type table:
|
||||
@ -87,7 +90,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
|
||||
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
|
||||
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
|
||||
// If not null is the set of tokens to protect from being delimited
|
||||
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
|
||||
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
|
||||
settings, "protected_words");
|
||||
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
|
||||
this.flags = flags;
|
||||
}
|
||||
@ -101,7 +105,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
|
||||
}
|
||||
|
||||
public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
|
||||
if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), key, defaultValue, deprecationLogger)) {
|
||||
if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(),
|
||||
key, defaultValue, deprecationLogger)) {
|
||||
return flag;
|
||||
}
|
||||
return 0;
|
||||
@ -122,14 +127,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
|
||||
String lhs = parseString(m.group(1).trim());
|
||||
Byte rhs = parseType(m.group(2).trim());
|
||||
if (lhs.length() != 1)
|
||||
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
|
||||
throw new RuntimeException("Invalid Mapping Rule : ["
|
||||
+ rule + "]. Only a single character is allowed.");
|
||||
if (rhs == null)
|
||||
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
|
||||
typeMap.put(lhs.charAt(0), rhs);
|
||||
}
|
||||
|
||||
// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
|
||||
byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
|
||||
byte types[] = new byte[Math.max(
|
||||
typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
|
||||
for (int i = 0; i < types.length; i++)
|
||||
types[i] = WordDelimiterIterator.getType(i);
|
||||
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
|
||||
@ -196,4 +203,9 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
|
||||
}
|
||||
return new String(out, 0, writePos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean breaksFastVectorHighlighter() {
|
||||
return true;
|
||||
}
|
||||
}
|
@ -17,12 +17,15 @@
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
|
||||
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||
|
||||
@ -31,10 +34,12 @@ import java.io.StringReader;
|
||||
|
||||
public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
||||
public void testDefault() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
|
||||
String source = "Ansprüche";
|
||||
String[] expected = new String[]{"Anspruche"};
|
||||
@ -44,11 +49,13 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
||||
}
|
||||
|
||||
public void testPreserveOriginal() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.put("index.analysis.filter.my_ascii_folding.preserve_original", true)
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
|
||||
.put("index.analysis.filter.my_ascii_folding.preserve_original", true)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
|
||||
String source = "Ansprüche";
|
||||
String[] expected = new String[]{"Anspruche", "Ansprüche"};
|
||||
@ -57,7 +64,8 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
|
||||
// but the multi-term aware component still emits a single token
|
||||
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
|
||||
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter)
|
||||
.getMultiTermComponent();
|
||||
tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
expected = new String[]{"Anspruche"};
|
@ -16,13 +16,15 @@
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.ESTokenStreamTestCase;
|
||||
|
||||
@ -30,7 +32,8 @@ import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
/**
|
||||
* Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory}
|
||||
* Base class to test {@link WordDelimiterTokenFilterFactory} and
|
||||
* {@link WordDelimiterGraphTokenFilterFactory}.
|
||||
*/
|
||||
public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
|
||||
final String type;
|
||||
@ -40,10 +43,12 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
|
||||
}
|
||||
|
||||
public void testDefault() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
|
||||
@ -54,44 +59,51 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
|
||||
}
|
||||
|
||||
public void testCatenateWords() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
|
||||
String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j",
|
||||
"2", "se", "ONeil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
|
||||
public void testCatenateNumbers() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
|
||||
"se", "O", "Neil"};
|
||||
String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000",
|
||||
"j", "2", "se", "O", "Neil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
|
||||
public void testCatenateAll() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
|
||||
@ -101,11 +113,13 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
|
||||
}
|
||||
|
||||
public void testSplitOnCaseChange() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot";
|
||||
String[] expected = new String[]{"PowerShot"};
|
||||
@ -115,30 +129,35 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
|
||||
}
|
||||
|
||||
public void testPreserveOriginal() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
|
||||
"wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
|
||||
String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42",
|
||||
"wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
|
||||
"O'Neil's", "O", "Neil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
||||
}
|
||||
|
||||
public void testStemEnglishPossessive() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
|
||||
"se", "O", "Neil", "s"};
|
||||
String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
|
||||
"4000", "j", "2", "se", "O", "Neil", "s" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
|
@ -0,0 +1,36 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.annotations.Name;
|
||||
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
|
||||
|
||||
import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
|
||||
import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
|
||||
|
||||
public class CommonAnalysisClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
|
||||
public CommonAnalysisClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
|
||||
super(testCandidate);
|
||||
}
|
||||
|
||||
@ParametersFactory
|
||||
public static Iterable<Object[]> parameters() throws Exception {
|
||||
return ESClientYamlSuiteTestCase.createParameters();
|
||||
}
|
||||
}
|
@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static java.util.Collections.emptyList;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenizers() {
|
||||
Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
|
||||
return tokenizers;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getTokenFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
|
||||
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
|
||||
filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class);
|
||||
filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class);
|
||||
return filters;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Map<String, Class<?>> getCharFilters() {
|
||||
Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
|
||||
return filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
|
||||
* hasn't been marked in this class with its proper factory.
|
||||
*/
|
||||
public void testAllTokenizersMarked() {
|
||||
markedTestCase("char filter", getTokenizers());
|
||||
}
|
||||
|
||||
/**
|
||||
* Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but
|
||||
* hasn't been marked in this class with its proper factory.
|
||||
*/
|
||||
public void testAllCharFiltersMarked() {
|
||||
markedTestCase("char filter", getCharFilters());
|
||||
}
|
||||
|
||||
/**
|
||||
* Fails if a token filter is marked in the superclass with {@link MovedToAnalysisCommon} but
|
||||
* hasn't been marked in this class with its proper factory.
|
||||
*/
|
||||
public void testAllTokenFiltersMarked() {
|
||||
markedTestCase("token filter", getTokenFilters());
|
||||
}
|
||||
|
||||
private void markedTestCase(String name, Map<String, Class<?>> map) {
|
||||
List<String> unmarked = map.entrySet().stream()
|
||||
.filter(e -> e.getValue() == MovedToAnalysisCommon.class)
|
||||
.map(Map.Entry::getKey)
|
||||
.sorted()
|
||||
.collect(toList());
|
||||
assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common "
|
||||
+ "but not mapped here", emptyList(), unmarked);
|
||||
}
|
||||
}
|
@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.elasticsearch.action.search.SearchResponse;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.query.Operator;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
|
||||
import org.elasticsearch.test.ESIntegTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||
import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
|
||||
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
|
||||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
|
||||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.startsWith;
|
||||
|
||||
public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
|
||||
@Override
|
||||
protected Collection<Class<? extends Plugin>> nodePlugins() {
|
||||
return Arrays.asList(CommonAnalysisPlugin.class);
|
||||
}
|
||||
|
||||
public void testNgramHighlightingWithBrokenPositions() throws IOException {
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("test", jsonBuilder()
|
||||
.startObject()
|
||||
.startObject("test")
|
||||
.startObject("properties")
|
||||
.startObject("name")
|
||||
.field("type", "text")
|
||||
.startObject("fields")
|
||||
.startObject("autocomplete")
|
||||
.field("type", "text")
|
||||
.field("analyzer", "autocomplete")
|
||||
.field("search_analyzer", "search_autocomplete")
|
||||
.field("term_vector", "with_positions_offsets")
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject())
|
||||
.setSettings(Settings.builder()
|
||||
.put(indexSettings())
|
||||
.put("analysis.tokenizer.autocomplete.max_gram", 20)
|
||||
.put("analysis.tokenizer.autocomplete.min_gram", 1)
|
||||
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
|
||||
.put("analysis.tokenizer.autocomplete.type", "nGram")
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.putArray("analysis.filter.wordDelimiter.type_table",
|
||||
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
|
||||
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM",
|
||||
"# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM",
|
||||
", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM",
|
||||
"/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM",
|
||||
"@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
|
||||
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM",
|
||||
"{ => ALPHANUM")
|
||||
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
|
||||
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.generate_number_parts", false)
|
||||
.put("analysis.filter.wordDelimiter.catenate_words", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_all", false)
|
||||
|
||||
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
|
||||
.putArray("analysis.analyzer.autocomplete.filter",
|
||||
"lowercase", "wordDelimiter")
|
||||
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
|
||||
.putArray("analysis.analyzer.search_autocomplete.filter",
|
||||
"lowercase", "wordDelimiter")));
|
||||
client().prepareIndex("test", "test", "1")
|
||||
.setSource("name", "ARCOTEL Hotels Deutschland").get();
|
||||
refresh();
|
||||
SearchResponse search = client().prepareSearch("test").setTypes("test")
|
||||
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
|
||||
.highlighter(new HighlightBuilder().field("name.autocomplete")).get();
|
||||
assertHighlight(search, 0, "name.autocomplete", 0,
|
||||
equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
|
||||
}
|
||||
|
||||
public void testMultiPhraseCutoff() throws IOException {
|
||||
/*
|
||||
* MultiPhraseQuery can literally kill an entire node if there are too many terms in the
|
||||
* query. We cut off and extract terms if there are more than 16 terms in the query
|
||||
*/
|
||||
assertAcked(prepareCreate("test")
|
||||
.addMapping("test", "body", "type=text,analyzer=custom_analyzer,"
|
||||
+ "search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
|
||||
.setSettings(
|
||||
Settings.builder().put(indexSettings())
|
||||
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
|
||||
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
|
||||
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_words", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
|
||||
.put("analysis.filter.wordDelimiter.catenate_all", false)
|
||||
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
|
||||
.putArray("analysis.analyzer.custom_analyzer.filter",
|
||||
"lowercase", "wordDelimiter"))
|
||||
);
|
||||
|
||||
ensureGreen();
|
||||
client().prepareIndex("test", "test", "1")
|
||||
.setSource("body", "Test: http://www.facebook.com http://elasticsearch.org "
|
||||
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this is "
|
||||
+ "a test for highlighting feature Test: http://www.facebook.com "
|
||||
+ "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
|
||||
+ "http://twitter.com this is a test for highlighting feature")
|
||||
.get();
|
||||
refresh();
|
||||
SearchResponse search = client().prepareSearch()
|
||||
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
|
||||
.highlighter(new HighlightBuilder().field("body")).get();
|
||||
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
|
||||
search = client()
|
||||
.prepareSearch()
|
||||
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "
|
||||
+ "http://elasticsearch.org http://xing.com http://cnn.com "
|
||||
+ "http://quora.com http://twitter.com this is a test for highlighting "
|
||||
+ "feature Test: http://www.facebook.com http://elasticsearch.org "
|
||||
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
|
||||
+ "is a test for highlighting feature"))
|
||||
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
|
||||
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
|
||||
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
|
||||
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
|
||||
}
|
||||
}
|
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.elasticsearch.action.search.SearchResponse;
|
||||
import org.elasticsearch.index.query.Operator;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.test.ESIntegTestCase;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
|
||||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
|
||||
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
|
||||
|
||||
public class QueryStringWithAnalyzersTests extends ESIntegTestCase {
|
||||
@Override
|
||||
protected Collection<Class<? extends Plugin>> nodePlugins() {
|
||||
return Arrays.asList(CommonAnalysisPlugin.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates that we properly split fields using the word delimiter filter in query_string.
|
||||
*/
|
||||
public void testCustomWordDelimiterQueryString() {
|
||||
assertAcked(client().admin().indices().prepareCreate("test")
|
||||
.setSettings("analysis.analyzer.my_analyzer.type", "custom",
|
||||
"analysis.analyzer.my_analyzer.tokenizer", "whitespace",
|
||||
"analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
|
||||
"analysis.filter.custom_word_delimiter.type", "word_delimiter",
|
||||
"analysis.filter.custom_word_delimiter.generate_word_parts", "true",
|
||||
"analysis.filter.custom_word_delimiter.generate_number_parts", "false",
|
||||
"analysis.filter.custom_word_delimiter.catenate_numbers", "true",
|
||||
"analysis.filter.custom_word_delimiter.catenate_words", "false",
|
||||
"analysis.filter.custom_word_delimiter.split_on_case_change", "false",
|
||||
"analysis.filter.custom_word_delimiter.split_on_numerics", "false",
|
||||
"analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
|
||||
.addMapping("type1",
|
||||
"field1", "type=text,analyzer=my_analyzer",
|
||||
"field2", "type=text,analyzer=my_analyzer"));
|
||||
|
||||
client().prepareIndex("test", "type1", "1").setSource(
|
||||
"field1", "foo bar baz",
|
||||
"field2", "not needed").get();
|
||||
refresh();
|
||||
|
||||
SearchResponse response = client()
|
||||
.prepareSearch("test")
|
||||
.setQuery(
|
||||
queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
|
||||
.field("field1").field("field2")).get();
|
||||
assertHitCount(response, 1L);
|
||||
}
|
||||
}
|
@ -16,52 +16,62 @@
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
|
||||
public class WordDelimiterGraphTokenFilterFactoryTests
|
||||
extends BaseWordDelimiterTokenFilterFactoryTestCase {
|
||||
public WordDelimiterGraphTokenFilterFactoryTests() {
|
||||
super("word_delimiter_graph");
|
||||
}
|
||||
|
||||
public void testMultiTerms() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
|
||||
String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
|
||||
"wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
|
||||
"ONeil", "O'Neil's", "O", "Neil" };
|
||||
String[] expected = new String[] { "PowerShot", "PowerShot", "Power", "Shot", "50042",
|
||||
"500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi",
|
||||
"fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil" };
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer();
|
||||
tokenizer.setReader(new StringReader(source));
|
||||
int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
|
||||
int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
|
||||
int[] expectedIncr = new int[] { 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
|
||||
1, 1, 1, 0, 0, 1 };
|
||||
int[] expectedPosLen = new int[] { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3,
|
||||
1, 1, 1, 2, 2, 1, 1 };
|
||||
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
|
||||
expectedIncr, expectedPosLen, null);
|
||||
}
|
||||
|
||||
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
|
||||
/**
|
||||
* Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
|
||||
*/
|
||||
public void testPartsAndCatenate() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot";
|
||||
int[] expectedIncr = new int[]{1, 0, 1};
|
@ -16,31 +16,38 @@
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
package org.elasticsearch.index.analysis;
|
||||
package org.elasticsearch.analysis.common;
|
||||
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
|
||||
import org.elasticsearch.index.analysis.TokenFilterFactory;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
|
||||
public class WordDelimiterTokenFilterFactoryTests
|
||||
extends BaseWordDelimiterTokenFilterFactoryTestCase {
|
||||
public WordDelimiterTokenFilterFactoryTests() {
|
||||
super("word_delimiter");
|
||||
}
|
||||
|
||||
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
|
||||
/**
|
||||
* Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
|
||||
*/
|
||||
public void testPartsAndCatenate() throws IOException {
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
|
||||
.build());
|
||||
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
|
||||
Settings.builder()
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
|
||||
.put("index.analysis.filter.my_word_delimiter.type", type)
|
||||
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
|
||||
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
|
||||
.build(),
|
||||
new CommonAnalysisPlugin());
|
||||
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
|
||||
String source = "PowerShot";
|
||||
String[] expected = new String[]{"Power", "PowerShot", "Shot" };
|
@ -0,0 +1,11 @@
|
||||
"Module loaded":
|
||||
- do:
|
||||
cluster.state: {}
|
||||
|
||||
# Get master node id
|
||||
- set: { master_node: master }
|
||||
|
||||
- do:
|
||||
nodes.info: {}
|
||||
|
||||
- match: { nodes.$master.modules.0.name: analysis-common }
|
@ -0,0 +1,11 @@
|
||||
## Smoke tests for analyzers included in the analysis-common module
|
||||
|
||||
"whitespace":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: Foo Bar!
|
||||
analyzer: whitespace
|
||||
- length: { tokens: 2 }
|
||||
- match: { tokens.0.token: Foo }
|
||||
- match: { tokens.1.token: Bar! }
|
@ -0,0 +1,27 @@
|
||||
## Smoke tests for tokenizers included in the analysis-common module
|
||||
|
||||
"keyword":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: Foo Bar!
|
||||
tokenizer: keyword
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: Foo Bar! }
|
||||
|
||||
---
|
||||
"nGram":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: good
|
||||
explain: true
|
||||
tokenizer:
|
||||
type: nGram
|
||||
min_gram: 2
|
||||
max_gram: 2
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: go }
|
||||
- match: { detail.tokenizer.tokens.1.token: oo }
|
||||
- match: { detail.tokenizer.tokens.2.token: od }
|
@ -0,0 +1,82 @@
|
||||
## Smoke tests for token filters included in the analysis-common module
|
||||
|
||||
"asciifolding":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: Musée d'Orsay
|
||||
tokenizer: keyword
|
||||
filter: [asciifolding]
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: Musee d'Orsay }
|
||||
|
||||
---
|
||||
"lowercase":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: Foo Bar!
|
||||
tokenizer: keyword
|
||||
filter: [lowercase]
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: foo bar! }
|
||||
|
||||
---
|
||||
"word_delimiter":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: the qu1ck brown fox
|
||||
tokenizer: standard
|
||||
filter: [word_delimiter]
|
||||
- length: { tokens: 6 }
|
||||
- match: { tokens.0.token: the }
|
||||
- match: { tokens.1.token: qu }
|
||||
- match: { tokens.2.token: "1" }
|
||||
- match: { tokens.3.token: ck }
|
||||
- match: { tokens.4.token: brown }
|
||||
- match: { tokens.5.token: fox }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: the qu1ck brown fox
|
||||
tokenizer: standard
|
||||
filter:
|
||||
- type: word_delimiter
|
||||
split_on_numerics: false
|
||||
- length: { tokens: 4 }
|
||||
- match: { tokens.0.token: the }
|
||||
- match: { tokens.1.token: qu1ck }
|
||||
- match: { tokens.2.token: brown }
|
||||
- match: { tokens.3.token: fox }
|
||||
|
||||
---
|
||||
"word_delimiter_graph":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: the qu1ck brown fox
|
||||
tokenizer: standard
|
||||
filter: [word_delimiter_graph]
|
||||
- length: { tokens: 6 }
|
||||
- match: { tokens.0.token: the }
|
||||
- match: { tokens.1.token: qu }
|
||||
- match: { tokens.2.token: "1" }
|
||||
- match: { tokens.3.token: ck }
|
||||
- match: { tokens.4.token: brown }
|
||||
- match: { tokens.5.token: fox }
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: the qu1ck brown fox
|
||||
tokenizer: standard
|
||||
filter:
|
||||
- type: word_delimiter_graph
|
||||
split_on_numerics: false
|
||||
- length: { tokens: 4 }
|
||||
- match: { tokens.0.token: the }
|
||||
- match: { tokens.1.token: qu1ck }
|
||||
- match: { tokens.2.token: brown }
|
||||
- match: { tokens.3.token: fox }
|
@ -0,0 +1,13 @@
|
||||
## Smoke tests for char filters included in the analysis-common module
|
||||
|
||||
"mapping":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: jeff quit phish
|
||||
tokenizer: keyword
|
||||
char_filter:
|
||||
- type: mapping
|
||||
mappings: ["ph => f", "qu => q"]
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: "jeff qit fish" }
|
@ -19,14 +19,9 @@
|
||||
|
||||
package org.elasticsearch.index.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.elasticsearch.AnalysisFactoryTestCase;
|
||||
import org.elasticsearch.Version;
|
||||
@ -37,6 +32,10 @@ import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase {
|
||||
|
||||
@Override
|
||||
|
@ -1,29 +1,11 @@
|
||||
# Will be performed before each test as a part of the test setup
|
||||
#
|
||||
setup:
|
||||
- do:
|
||||
ping: {}
|
||||
|
||||
---
|
||||
"Basic test":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
text: Foo Bar
|
||||
- length: { tokens: 2 }
|
||||
- match: { tokens.0.token: foo }
|
||||
- match: { tokens.1.token: bar }
|
||||
|
||||
---
|
||||
"Tokenizer and filter":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body:
|
||||
filter: [lowercase]
|
||||
text: Foo Bar
|
||||
tokenizer: keyword
|
||||
- length: { tokens: 1 }
|
||||
- match: { tokens.0.token: foo bar }
|
||||
- match: { tokens.0.token: foo }
|
||||
- match: { tokens.1.token: bar }
|
||||
|
||||
---
|
||||
"Index and field":
|
||||
@ -36,7 +18,7 @@ setup:
|
||||
properties:
|
||||
text:
|
||||
type: text
|
||||
analyzer: whitespace
|
||||
analyzer: standard
|
||||
|
||||
- do:
|
||||
indices.analyze:
|
||||
@ -45,84 +27,51 @@ setup:
|
||||
field: text
|
||||
text: Foo Bar!
|
||||
- length: { tokens: 2 }
|
||||
- match: { tokens.0.token: Foo }
|
||||
- match: { tokens.1.token: Bar! }
|
||||
---
|
||||
"JSON in Body":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: { "text": "Foo Bar", "filter": ["lowercase"], "tokenizer": keyword }
|
||||
- length: {tokens: 1 }
|
||||
- match: { tokens.0.token: foo bar }
|
||||
- match: { tokens.0.token: foo }
|
||||
- match: { tokens.1.token: bar }
|
||||
|
||||
---
|
||||
"Array text":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: { "text": ["Foo Bar", "Baz"], "filter": ["lowercase"], "tokenizer": keyword }
|
||||
- length: {tokens: 2 }
|
||||
- match: { tokens.0.token: foo bar }
|
||||
- match: { tokens.1.token: baz }
|
||||
body:
|
||||
text: ["Foo Bar", "Baz"]
|
||||
tokenizer: standard
|
||||
- length: { tokens: 3 }
|
||||
- match: { tokens.0.token: Foo }
|
||||
- match: { tokens.1.token: Bar }
|
||||
- match: { tokens.2.token: Baz }
|
||||
|
||||
---
|
||||
"Detail response with Analyzer":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: {"text": "This is troubled", "analyzer": standard, "explain": "true"}
|
||||
body:
|
||||
text: This is troubled
|
||||
analyzer: standard
|
||||
explain: true
|
||||
- length: { detail.analyzer.tokens: 3 }
|
||||
- match: { detail.analyzer.name: standard }
|
||||
- match: { detail.analyzer.tokens.0.token: this }
|
||||
- match: { detail.analyzer.tokens.1.token: is }
|
||||
- match: { detail.analyzer.tokens.2.token: troubled }
|
||||
---
|
||||
"Detail output spcified attribute":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: {"text": "<text>This is troubled</text>", "char_filter": ["html_strip"], "filter": ["snowball"], "tokenizer": standard, "explain": true, "attributes": ["keyword"]}
|
||||
- length: { detail.charfilters: 1 }
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- length: { detail.tokenfilters.0.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: standard }
|
||||
- match: { detail.tokenizer.tokens.0.token: This }
|
||||
- match: { detail.tokenizer.tokens.1.token: is }
|
||||
- match: { detail.tokenizer.tokens.2.token: troubled }
|
||||
- match: { detail.tokenfilters.0.name: snowball }
|
||||
- match: { detail.tokenfilters.0.tokens.0.token: This }
|
||||
- match: { detail.tokenfilters.0.tokens.1.token: is }
|
||||
- match: { detail.tokenfilters.0.tokens.2.token: troubl }
|
||||
- match: { detail.tokenfilters.0.tokens.2.keyword: false }
|
||||
- match: { detail.analyzer.name: standard }
|
||||
- match: { detail.analyzer.tokens.0.token: this }
|
||||
- match: { detail.analyzer.tokens.1.token: is }
|
||||
- match: { detail.analyzer.tokens.2.token: troubled }
|
||||
|
||||
---
|
||||
"Custom filter in request":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: { "text": "Foo Bar Buzz", "filter": ["lowercase", { "type": "stop", "stopwords": ["foo", "buzz"]}], "tokenizer": whitespace, "explain": true }
|
||||
- length: {detail.tokenizer.tokens: 3 }
|
||||
- length: {detail.tokenfilters.0.tokens: 3 }
|
||||
- length: {detail.tokenfilters.1.tokens: 1 }
|
||||
- match: { detail.tokenizer.name: whitespace }
|
||||
- match: { detail.tokenizer.tokens.0.token: Foo }
|
||||
- match: { detail.tokenizer.tokens.1.token: Bar }
|
||||
- match: { detail.tokenizer.tokens.2.token: Buzz }
|
||||
- match: { detail.tokenfilters.0.name: lowercase }
|
||||
- match: { detail.tokenfilters.0.tokens.0.token: foo }
|
||||
- match: { detail.tokenfilters.0.tokens.1.token: bar }
|
||||
- match: { detail.tokenfilters.0.tokens.2.token: buzz }
|
||||
- match: { detail.tokenfilters.1.name: "_anonymous_tokenfilter_[1]" }
|
||||
- match: { detail.tokenfilters.1.tokens.0.token: bar }
|
||||
---
|
||||
"Custom char_filter in request":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: { "text": "jeff quit phish", "char_filter": [{"type": "mapping", "mappings": ["ph => f", "qu => q"]}], "tokenizer": keyword }
|
||||
- length: {tokens: 1 }
|
||||
- match: { tokens.0.token: "jeff qit fish" }
|
||||
|
||||
---
|
||||
"Custom tokenizer in request":
|
||||
- do:
|
||||
indices.analyze:
|
||||
body: { "text": "good", "tokenizer": {"type": "nGram", "min_gram": 2, "max_gram": 2}, "explain": true }
|
||||
- length: {detail.tokenizer.tokens: 3 }
|
||||
- match: { detail.tokenizer.name: _anonymous_tokenizer }
|
||||
- match: { detail.tokenizer.tokens.0.token: go }
|
||||
- match: { detail.tokenizer.tokens.1.token: oo }
|
||||
- match: { detail.tokenizer.tokens.2.token: od }
|
||||
body:
|
||||
text: foo bar buzz
|
||||
tokenizer: standard
|
||||
explain: true
|
||||
filter:
|
||||
- type: stop
|
||||
stopwords: ["foo", "buzz"]
|
||||
- length: { detail.tokenizer.tokens: 3 }
|
||||
- length: { detail.tokenfilters.0.tokens: 1 }
|
||||
- match: { detail.tokenizer.name: standard }
|
||||
- match: { detail.tokenizer.tokens.0.token: foo }
|
||||
- match: { detail.tokenizer.tokens.1.token: bar }
|
||||
- match: { detail.tokenizer.tokens.2.token: buzz }
|
||||
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
|
||||
- match: { detail.tokenfilters.0.tokens.0.token: bar }
|
||||
|
@ -26,14 +26,15 @@ List projects = [
|
||||
'test:fixtures:hdfs-fixture',
|
||||
'test:logger-usage',
|
||||
'modules:aggs-matrix-stats',
|
||||
'modules:analysis-common',
|
||||
'modules:ingest-common',
|
||||
'modules:lang-expression',
|
||||
'modules:lang-mustache',
|
||||
'modules:lang-painless',
|
||||
'modules:transport-netty4',
|
||||
'modules:reindex',
|
||||
'modules:percolator',
|
||||
'modules:reindex',
|
||||
'modules:repository-url',
|
||||
'modules:transport-netty4',
|
||||
'plugins:analysis-icu',
|
||||
'plugins:analysis-kuromoji',
|
||||
'plugins:analysis-phonetic',
|
||||
|
@ -20,14 +20,12 @@
|
||||
package org.elasticsearch;
|
||||
|
||||
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
|
||||
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
|
||||
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
|
||||
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
|
||||
import org.apache.lucene.analysis.util.CharFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
import org.elasticsearch.common.collect.MapBuilder;
|
||||
import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
|
||||
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
|
||||
@ -92,7 +90,6 @@ import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
|
||||
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
|
||||
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
|
||||
import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
|
||||
@ -110,7 +107,7 @@ import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Alerts us if new analyzers are added to lucene, so we don't miss them.
|
||||
* Alerts us if new analysis components are added to Lucene, so we don't miss them.
|
||||
* <p>
|
||||
* If we don't want to expose one for a specific reason, just map it to Void.
|
||||
* The deprecated ones can be mapped to Deprecated.class.
|
||||
@ -178,7 +175,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
|
||||
.put("apostrophe", ApostropheFilterFactory.class)
|
||||
.put("arabicnormalization", ArabicNormalizationFilterFactory.class)
|
||||
.put("arabicstem", ArabicStemTokenFilterFactory.class)
|
||||
.put("asciifolding", ASCIIFoldingTokenFilterFactory.class)
|
||||
.put("asciifolding", MovedToAnalysisCommon.class)
|
||||
.put("brazilianstem", BrazilianStemTokenFilterFactory.class)
|
||||
.put("bulgarianstem", StemmerTokenFilterFactory.class)
|
||||
.put("cjkbigram", CJKBigramFilterFactory.class)
|
||||
@ -253,8 +250,8 @@ public class AnalysisFactoryTestCase extends ESTestCase {
|
||||
.put("turkishlowercase", LowerCaseTokenFilterFactory.class)
|
||||
.put("type", KeepTypesFilterFactory.class)
|
||||
.put("uppercase", UpperCaseTokenFilterFactory.class)
|
||||
.put("worddelimiter", WordDelimiterTokenFilterFactory.class)
|
||||
.put("worddelimitergraph", WordDelimiterGraphFilterFactory.class)
|
||||
.put("worddelimiter", MovedToAnalysisCommon.class)
|
||||
.put("worddelimitergraph", MovedToAnalysisCommon.class)
|
||||
.put("flattengraph", FlattenGraphTokenFilterFactory.class)
|
||||
|
||||
// TODO: these tokenfilters are not yet exposed: useful?
|
||||
@ -401,6 +398,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
|
||||
}
|
||||
}
|
||||
expected.remove(Void.class);
|
||||
expected.remove(MovedToAnalysisCommon.class);
|
||||
expected.remove(Deprecated.class);
|
||||
|
||||
Collection<Class<?>> actual = new HashSet<>();
|
||||
@ -489,4 +487,11 @@ public class AnalysisFactoryTestCase extends ESTestCase {
|
||||
classesThatShouldNotHaveMultiTermSupport.isEmpty());
|
||||
}
|
||||
|
||||
/**
|
||||
* Marker class for components that have moved to the analysis-common module. This will be
|
||||
* removed when the module is complete and these analysis components aren't available to core.
|
||||
*/
|
||||
protected static final class MovedToAnalysisCommon {
|
||||
private MovedToAnalysisCommon() {}
|
||||
}
|
||||
}
|
||||
|
@ -25,17 +25,18 @@ import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.env.Environment;
|
||||
import org.elasticsearch.index.IndexSettings;
|
||||
import org.elasticsearch.indices.analysis.AnalysisModule;
|
||||
import org.elasticsearch.plugins.AnalysisPlugin;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.elasticsearch.test.IndexSettingsModule;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static java.util.Collections.emptyList;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class AnalysisTestsHelper {
|
||||
|
||||
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, String resource) throws IOException {
|
||||
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir,
|
||||
String resource) throws IOException {
|
||||
Settings settings = Settings.builder()
|
||||
.loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
|
||||
.put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
|
||||
@ -45,12 +46,15 @@ public class AnalysisTestsHelper {
|
||||
}
|
||||
|
||||
public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(
|
||||
Settings settings) throws IOException {
|
||||
Settings settings, AnalysisPlugin... plugins) throws IOException {
|
||||
if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) {
|
||||
settings = Settings.builder().put(settings).put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
|
||||
settings = Settings.builder().put(settings)
|
||||
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
|
||||
}
|
||||
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
|
||||
AnalysisRegistry analysisRegistry = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry();
|
||||
AnalysisRegistry analysisRegistry =
|
||||
new AnalysisModule(new Environment(settings), Arrays.asList(plugins))
|
||||
.getAnalysisRegistry();
|
||||
return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings),
|
||||
analysisRegistry.buildTokenFilterFactories(indexSettings),
|
||||
analysisRegistry.buildTokenizerFactories(indexSettings),
|