Mirror of https://github.com/honeymoose/OpenSearch.git, synced 2025-03-25 17:38:44 +00:00.
Start building analysis-common module (#23614)
Start moving built-in analysis components into the new analysis-common module. The goals of this project are:
1. Remove core's dependency on lucene-analyzers-common.jar, which should shrink the dependencies of the transport client and the high level REST client.
2. Prove that analysis plugins can do all the "built in" things by moving all "built in" behavior to a plugin.
3. Force tests not to depend on any oddball analyzer behavior. If tests need anything more than the standard analyzer they can use the mock analyzer provided by Lucene's test infrastructure.
This commit is contained in:
parent 151a65ed17
commit caf376c8af
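The extension point this commit leans on is AnalysisPlugin: a plugin contributes named analysis components (here, token filters) and AnalysisModule merges them with whatever remains built in to core. Below is a minimal sketch of such a plugin, modeled on the CommonAnalysisPlugin and the test fixtures added later in this diff; the MyAnalysisPlugin and MyNoopTokenFilterFactory names and the "my_noop" filter key are illustrative, not part of this commit.

import java.util.Map;
import static java.util.Collections.singletonMap;

import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

// Illustrative plugin: registers one token filter under the name "my_noop".
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {

    public static class MyNoopTokenFilterFactory extends AbstractTokenFilterFactory {
        public MyNoopTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                        String name, Settings settings) {
            super(indexSettings, name, settings);
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            // For the sketch, pass the stream through unchanged; a real factory
            // would wrap it in a Lucene TokenFilter.
            return tokenStream;
        }
    }

    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        // AnalysisModule merges this map with the filters that stay built in to core.
        return singletonMap("my_noop", MyNoopTokenFilterFactory::new);
    }
}

A filter registered this way can be referenced from index analysis settings and from the _analyze machinery just like the "mock" filter used by the tests changed in this commit.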
checkstyle_suppressions.xml:

@@ -1096,7 +1096,6 @@
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]MergeSchedulerConfig.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SearchSlowLog.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionType.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractCharFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractIndexAnalyzerProvider.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractTokenFilterFactory.java" checks="LineLength" />
@@ -1225,8 +1224,6 @@
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]UpperCaseTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceAnalyzerProvider.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceTokenizerFactory.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactory.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]AbstractCompoundWordTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]DictionaryCompoundWordTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
@@ -2686,11 +2683,8 @@
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SettingsListenerIT.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionTypeTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]WaitUntilRefreshIT.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactoryTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisRegistryTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTests.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTestsHelper.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]BaseWordDelimiterTokenFilterFactoryTestCase.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CJKFilterFactoryTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CharFilterTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CompoundAnalysisTests.java" checks="LineLength" />
@@ -2709,8 +2703,6 @@
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerTokenFilterFactoryTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopAnalyzerTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopTokenFilterTests.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactoryTests.java" checks="LineLength" />
-  <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactoryTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]commongrams[/\\]CommonGramsTokenFilterFactoryTests.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]filter1[/\\]MyFilterTokenFilterFactory.java" checks="LineLength" />
   <suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]synonyms[/\\]SynonymsAnalysisTests.java" checks="LineLength" />
EdgeNGramTokenFilterFactory.java:

@@ -71,4 +71,9 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
 
         return result;
     }
+
+    @Override
+    public boolean breaksFastVectorHighlighter() {
+        return true;
+    }
 }
TokenFilterFactory.java:

@@ -20,10 +20,20 @@
 package org.elasticsearch.index.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
 
 public interface TokenFilterFactory {
 
     String name();
 
     TokenStream create(TokenStream tokenStream);
+
+    /**
+     * Does this analyzer mess up the {@link OffsetAttribute}s in such as way as to break the
+     * {@link FastVectorHighlighter}? If this is {@code true} then the
+     * {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets.
+     */
+    default boolean breaksFastVectorHighlighter() {
+        return false;
+    }
 }
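The new default method lets any factory, including one shipped in a plugin, flag that it corrupts offsets, instead of core keeping a hard-coded instanceof list. A sketch of a factory opting in — the class name, the wrapped Lucene EdgeNGramTokenFilter and the 1/3 gram sizes are illustrative, not part of this commit:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

// Illustrative factory for a filter that rewrites token offsets. Instead of core
// special-casing its concrete class, the factory declares the problem itself.
public class MyEdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {

    public MyEdgeNGramTokenFilterFactory(IndexSettings indexSettings, Environment env,
                                         String name, Settings settings) {
        super(indexSettings, name, settings);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Edge n-grams keep the offsets of the original token for every gram,
        // which is exactly the kind of output that trips up offset-based highlighting.
        return new EdgeNGramTokenFilter(tokenStream, 1, 3);
    }

    @Override
    public boolean breaksFastVectorHighlighter() {
        return true; // ask the FastVectorHighlighter to re-sort fragments by offset
    }
}

Core's FragmentBuilderHelper (changed later in this diff) only calls the interface method, so a filter shipped by any plugin gets the same workaround that the built-in word_delimiter and ngram filters used to get via instanceof checks.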
AnalysisModule.java:

@@ -25,7 +25,6 @@ import org.elasticsearch.common.NamedRegistry;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
-import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@@ -140,8 +139,6 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
 import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
 import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.plugins.AnalysisPlugin;
@@ -205,7 +202,6 @@ public final class AnalysisModule {
         NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
         tokenFilters.register("stop", StopTokenFilterFactory::new);
         tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
-        tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new);
         tokenFilters.register("length", LengthTokenFilterFactory::new);
         tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
         tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
@@ -225,8 +221,6 @@ public final class AnalysisModule {
         tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
         tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
         tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
-        tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
-        tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
         tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
         tokenFilters.register("elision", ElisionTokenFilterFactory::new);
         tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);
FragmentBuilderHelper.java:

@@ -26,15 +26,9 @@ import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
 import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
 import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
 import org.apache.lucene.util.CollectionUtil;
-import org.apache.lucene.util.Version;
 import org.elasticsearch.index.analysis.CustomAnalyzer;
-import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
-import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
-import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.mapper.FieldMapper;
 
 import java.util.Comparator;
@@ -56,7 +50,7 @@ public final class FragmentBuilderHelper {
     public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) {
         assert fragInfo != null : "FragInfo must not be null";
         assert mapper.fieldType().name().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name();
-        if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.fieldType().indexAnalyzer()))) {
+        if (!fragInfo.getSubInfos().isEmpty() && containsBrokenAnalysis(mapper.fieldType().indexAnalyzer())) {
             /* This is a special case where broken analysis like WDF is used for term-vector creation at index-time
              * which can potentially mess up the offsets. To prevent a SAIIOBException we need to resort
              * the fragments based on their offsets rather than using soley the positions as it is done in
@@ -91,8 +85,7 @@ public final class FragmentBuilderHelper {
             final CustomAnalyzer a = (CustomAnalyzer) analyzer;
             TokenFilterFactory[] tokenFilters = a.tokenFilters();
             for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
-                if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory
-                        || tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) {
+                if (tokenFilterFactory.breaksFastVectorHighlighter()) {
                     return true;
                 }
             }
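With the flag on the factory, the consuming side no longer needs to know any concrete filter classes. A standalone sketch of the same check performed in the hunk above — the BrokenAnalysisCheck class is hypothetical; tokenFilters(), analyzer() and breaksFastVectorHighlighter() are the methods used there:

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;

final class BrokenAnalysisCheck {

    // Returns true if any filter in a custom analysis chain declares that it
    // produces offsets the FastVectorHighlighter cannot trust.
    static boolean containsBrokenAnalysis(Analyzer analyzer) {
        if (analyzer instanceof NamedAnalyzer) {
            analyzer = ((NamedAnalyzer) analyzer).analyzer(); // unwrap the registered wrapper
        }
        if (analyzer instanceof CustomAnalyzer) {
            for (TokenFilterFactory factory : ((CustomAnalyzer) analyzer).tokenFilters()) {
                if (factory.breaksFastVectorHighlighter()) {
                    return true;
                }
            }
        }
        return false;
    }
}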
TransportAnalyzeActionTests.java:

@@ -18,6 +18,8 @@
  */
 package org.elasticsearch.action.admin.indices;
 
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
@@ -27,18 +29,28 @@ import org.elasticsearch.common.UUIDs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.mapper.AllFieldMapper;
 import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
+import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Map;
 
-import static java.util.Collections.emptyList;
+import static java.util.Collections.singletonList;
+import static java.util.Collections.singletonMap;
 
+/**
+ * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the
+ * {@code common-analysis} module.
+ */
 public class TransportAnalyzeActionTests extends ESTestCase {
 
     private IndexAnalyzers indexAnalyzers;
@@ -53,23 +65,28 @@ public class TransportAnalyzeActionTests extends ESTestCase {
         Settings indexSettings = Settings.builder()
             .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
             .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
-            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
-            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
-            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
-            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
-            .put("index.analysis.tokenizer.trigram.type", "ngram")
-            .put("index.analysis.tokenizer.trigram.min_gram", 3)
-            .put("index.analysis.tokenizer.trigram.max_gram", 3)
-            .put("index.analysis.filter.synonym.type", "synonym")
-            .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay")
-            .put("index.analysis.filter.synonym.tokenizer", "trigram")
-            .put("index.analysis.filter.synonym.min_gram", 3)
-            .put("index.analysis.filter.synonym.max_gram", 3).build();
+            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
+            .put("index.analysis.analyzer.custom_analyzer.filter", "mock").build();
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
         environment = new Environment(settings);
-        registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry();
+        AnalysisPlugin plugin = new AnalysisPlugin() {
+            class MockFactory extends AbstractTokenFilterFactory {
+                MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+                    super(indexSettings, name, settings);
+                }
+
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
+                }
+            }
+
+            @Override
+            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+                return singletonMap("mock", MockFactory::new);
+            }
+        };
+        registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
         indexAnalyzers = registry.build(idxSettings);
     }
 
@@ -143,51 +160,44 @@ public class TransportAnalyzeActionTests extends ESTestCase {
     }
 
     public void testWithIndexAnalyzers() throws IOException {
-
         AnalyzeRequest request = new AnalyzeRequest();
-        request.analyzer("standard");
         request.text("the quick brown fox");
         request.analyzer("custom_analyzer");
-        request.text("the qu1ck brown fox");
         AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
-        assertEquals(4, tokens.size());
+        assertEquals(3, tokens.size());
+        assertEquals("quick", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
 
-        request.analyzer("whitespace");
-        request.text("the qu1ck brown fox-dog");
+        request.analyzer("standard");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         tokens = analyze.getTokens();
         assertEquals(4, tokens.size());
-
-        request.analyzer("custom_analyzer");
-        request.text("the qu1ck brown fox-dog");
-        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
-        tokens = analyze.getTokens();
-        assertEquals(5, tokens.size());
-
-        request.analyzer(null);
-        request.tokenizer("whitespace");
-        request.addTokenFilter("lowercase");
-        request.addTokenFilter("wordDelimiter");
-        request.text("the qu1ck brown fox-dog");
-        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
-        tokens = analyze.getTokens();
-        assertEquals(5, tokens.size());
         assertEquals("the", tokens.get(0).getTerm());
-        assertEquals("qu1ck", tokens.get(1).getTerm());
+        assertEquals("quick", tokens.get(1).getTerm());
         assertEquals("brown", tokens.get(2).getTerm());
         assertEquals("fox", tokens.get(3).getTerm());
-        assertEquals("dog", tokens.get(4).getTerm());
 
+        // Switch the analyzer out for just a tokenizer
         request.analyzer(null);
-        request.tokenizer("trigram");
-        request.addTokenFilter("synonym");
-        request.text("kimchy");
+        request.tokenizer("standard");
         analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
         tokens = analyze.getTokens();
-        assertEquals(2, tokens.size());
-        assertEquals("sha", tokens.get(0).getTerm());
-        assertEquals("hay", tokens.get(1).getTerm());
+        assertEquals(4, tokens.size());
+        assertEquals("the", tokens.get(0).getTerm());
+        assertEquals("quick", tokens.get(1).getTerm());
+        assertEquals("brown", tokens.get(2).getTerm());
+        assertEquals("fox", tokens.get(3).getTerm());
+
+        // Now try applying our token filter
+        request.addTokenFilter("mock");
+        analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
+        tokens = analyze.getTokens();
+        assertEquals(3, tokens.size());
+        assertEquals("quick", tokens.get(0).getTerm());
+        assertEquals("brown", tokens.get(1).getTerm());
+        assertEquals("fox", tokens.get(2).getTerm());
     }
 
     public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException {
AnalysisFactoryTests.java:

@@ -22,5 +22,5 @@ package org.elasticsearch.index.analysis;
 import org.elasticsearch.AnalysisFactoryTestCase;
 
 public class AnalysisFactoryTests extends AnalysisFactoryTestCase {
-    // tests are inherited
+    // tests are inherited and nothing needs to be defined here
 }
AnalysisRegistryTests.java:

@@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
 
+import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
@@ -31,17 +32,20 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
 import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
+import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 import org.elasticsearch.test.VersionUtils;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
 
-import static java.util.Collections.emptyList;
 import static java.util.Collections.emptyMap;
+import static java.util.Collections.singletonList;
 import static java.util.Collections.singletonMap;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.instanceOf;
@@ -112,51 +116,73 @@ public class AnalysisRegistryTests extends ESTestCase {
         assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
     }
 
+    /**
+     * Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide.
+     */
    public void testConfigureCamelCaseTokenFilter() throws IOException {
         Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
         Settings indexSettings = Settings.builder()
             .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-            .put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
-            .put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
-            .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
-            .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
-            .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build();
+            .put("index.analysis.filter.testFilter.type", "mock")
+            .put("index.analysis.filter.test_filter.type", "mock")
+            .put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard")
+            .putArray("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter")
+            .put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard")
+            .putArray("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter").build();
 
         IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
 
-        IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry()
+        /* The snake_case version of the name should not filter out any stopwords while the
+         * camelCase version will filter out English stopwords. */
+        AnalysisPlugin plugin = new AnalysisPlugin() {
+            class MockFactory extends AbstractTokenFilterFactory {
+                MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+                    super(indexSettings, name, settings);
+                }
+
+                @Override
+                public TokenStream create(TokenStream tokenStream) {
+                    if (name().equals("test_filter")) {
+                        return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
+                    }
+                    return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
+                }
+            }
+
+            @Override
+            public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
+                return singletonMap("mock", MockFactory::new);
+            }
+        };
+        IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), singletonList(plugin)).getAnalysisRegistry()
                 .build(idxSettings);
-        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
+
+        // This shouldn't contain English stopwords
+        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) {
             assertNotNull(custom_analyser);
-            TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
+            TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
             tokenStream.reset();
             CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-            List<String> token = new ArrayList<>();
-            while(tokenStream.incrementToken()) {
-                token.add(charTermAttribute.toString());
-            }
-            assertEquals(token.toString(), 2, token.size());
-            assertEquals("j2se", token.get(0));
-            assertEquals("j2ee", token.get(1));
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("has", charTermAttribute.toString());
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("foo", charTermAttribute.toString());
+            assertFalse(tokenStream.incrementToken());
         }
 
-        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
+        // This *should* contain English stopwords
+        try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) {
             assertNotNull(custom_analyser);
-            TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
+            TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
             tokenStream.reset();
             CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
-            List<String> token = new ArrayList<>();
-            while(tokenStream.incrementToken()) {
-                token.add(charTermAttribute.toString());
-            }
-            assertEquals(token.toString(), 6, token.size());
-            assertEquals("j", token.get(0));
-            assertEquals("2", token.get(1));
-            assertEquals("se", token.get(2));
-            assertEquals("j", token.get(3));
-            assertEquals("2", token.get(4));
-            assertEquals("ee", token.get(5));
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("has", charTermAttribute.toString());
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("a", charTermAttribute.toString());
+            assertTrue(tokenStream.incrementToken());
+            assertEquals("foo", charTermAttribute.toString());
+            assertFalse(tokenStream.incrementToken());
         }
     }
 
HighlighterSearchIT.java:

@@ -19,6 +19,7 @@
 package org.elasticsearch.search.fetch.subphase.highlight;
 
 import com.carrotsearch.randomizedtesting.generators.RandomPicks;
+
 import org.apache.lucene.search.join.ScoreMode;
 import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.action.search.SearchRequestBuilder;
@@ -100,6 +101,7 @@ import static org.hamcrest.Matchers.not;
 import static org.hamcrest.Matchers.startsWith;
 
 public class HighlighterSearchIT extends ESIntegTestCase {
+    // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
     private static final String[] ALL_TYPES = new String[] {"plain", "postings", "fvh", "unified"};
     private static final String[] UNIFIED_AND_NULL = new String[] {null, "unified"};
 
@@ -113,12 +115,11 @@ public class HighlighterSearchIT extends ESIntegTestCase {
         mappings.startObject();
         mappings.startObject("type")
                 .startObject("properties")
                     .startObject("text")
                         .field("type", "keyword")
                         .field("store", true)
                     .endObject()
-                .endObject()
-            .endObject();
+                .endObject().endObject();
         mappings.endObject();
         assertAcked(prepareCreate("test")
                 .addMapping("type", mappings));
@@ -139,14 +140,13 @@ public class HighlighterSearchIT extends ESIntegTestCase {
         mappings.startObject();
         mappings.startObject("type")
                 .startObject("properties")
                     .startObject("text")
                         .field("type", "text")
                         .field("analyzer", "keyword")
                         .field("index_options", "offsets")
                         .field("term_vector", "with_positions_offsets")
                     .endObject()
-                .endObject()
-            .endObject();
+                .endObject().endObject();
         mappings.endObject();
         assertAcked(prepareCreate("test")
                 .addMapping("type", mappings));
@@ -166,23 +166,22 @@ public class HighlighterSearchIT extends ESIntegTestCase {
         mappings.startObject();
         mappings.startObject("type")
                 .startObject("_source")
                     .field("enabled", false)
                 .endObject()
                 .startObject("properties")
                     .startObject("unstored_field")
                         .field("index_options", "offsets")
                         .field("term_vector", "with_positions_offsets")
                         .field("type", "text")
                         .field("store", false)
                     .endObject()
                     .startObject("text")
                         .field("index_options", "offsets")
                         .field("term_vector", "with_positions_offsets")
                         .field("type", "text")
                         .field("store", true)
                     .endObject()
-                .endObject()
-            .endObject();
+                .endObject().endObject();
         mappings.endObject();
         assertAcked(prepareCreate("test")
                 .addMapping("type", mappings));
@@ -218,103 +217,6 @@ public class HighlighterSearchIT extends ESIntegTestCase {
         assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
     }
 
-    public void testNgramHighlightingWithBrokenPositions() throws IOException {
-        assertAcked(prepareCreate("test")
-                .addMapping("test", jsonBuilder()
-                        .startObject()
-                            .startObject("test")
-                                .startObject("properties")
-                                    .startObject("name")
-                                        .startObject("fields")
-                                            .startObject("autocomplete")
-                                                .field("type", "text")
-                                                .field("analyzer", "autocomplete")
-                                                .field("search_analyzer", "search_autocomplete")
-                                                .field("term_vector", "with_positions_offsets")
-                                            .endObject()
-                                        .endObject()
-                                        .field("type", "text")
-                                    .endObject()
-                                .endObject()
-                            .endObject()
-                        .endObject())
-                .setSettings(Settings.builder()
-                        .put(indexSettings())
-                        .put("analysis.tokenizer.autocomplete.max_gram", 20)
-                        .put("analysis.tokenizer.autocomplete.min_gram", 1)
-                        .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
-                        .put("analysis.tokenizer.autocomplete.type", "nGram")
-                        .put("analysis.filter.wordDelimiter.type", "word_delimiter")
-                        .putArray("analysis.filter.wordDelimiter.type_table",
-                                "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
-                                "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM",
-                                "+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM",
-                                "^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
-                                "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM")
-
-                        .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
-                        .put("analysis.filter.wordDelimiter.generate_word_parts", true)
-                        .put("analysis.filter.wordDelimiter.generate_number_parts", false)
-                        .put("analysis.filter.wordDelimiter.catenate_words", true)
-                        .put("analysis.filter.wordDelimiter.catenate_numbers", true)
-                        .put("analysis.filter.wordDelimiter.catenate_all", false)
-
-                        .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
-                        .putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
-                        .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
-                        .putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")));
-        client().prepareIndex("test", "test", "1")
-            .setSource("name", "ARCOTEL Hotels Deutschland").get();
-        refresh();
-        SearchResponse search = client().prepareSearch("test").setTypes("test")
-            .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
-            .highlighter(new HighlightBuilder().field("name.autocomplete")).execute().actionGet();
-        assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
-    }
-
-    public void testMultiPhraseCutoff() throws IOException {
-        /*
-         * MultiPhraseQuery can literally kill an entire node if there are too many terms in the
-         * query. We cut off and extract terms if there are more than 16 terms in the query
-         */
-        assertAcked(prepareCreate("test")
-                .addMapping("test",
-                        "body", "type=text,analyzer=custom_analyzer,search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
-                .setSettings(
-                        Settings.builder().put(indexSettings())
-                                .put("analysis.filter.wordDelimiter.type", "word_delimiter")
-                                .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
-                                .put("analysis.filter.wordDelimiter.generate_word_parts", true)
-                                .put("analysis.filter.wordDelimiter.generate_number_parts", true)
-                                .put("analysis.filter.wordDelimiter.catenate_words", true)
-                                .put("analysis.filter.wordDelimiter.catenate_numbers", true)
-                                .put("analysis.filter.wordDelimiter.catenate_all", false)
-                                .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
-                                .putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter"))
-        );
-
-        ensureGreen();
-        client().prepareIndex("test", "test", "1")
-            .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
-                + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
-                + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
-                + "http://twitter.com this is a test for highlighting feature")
-            .get();
-        refresh();
-        SearchResponse search = client().prepareSearch().setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
-                .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
-        assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
-        search = client()
-                .prepareSearch()
-                .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
-                    + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
-                    + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
-                    + "http://twitter.com this is a test for highlighting feature"))
-                .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
-        assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: <em>http://www.facebook.com</em> "
-            + "<em>http://elasticsearch.org</em> <em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
-    }
-
     public void testNgramHighlighting() throws IOException {
         assertAcked(prepareCreate("test")
                 .addMapping("test",
SearchQueryIT.java:

@@ -1605,33 +1605,6 @@ public class SearchQueryIT extends ESIntegTestCase {
         assertHitCount(searchResponse, 2);
     }
 
-    // see #3898
-    public void testCustomWordDelimiterQueryString() {
-        assertAcked(client().admin().indices().prepareCreate("test")
-                .setSettings("analysis.analyzer.my_analyzer.type", "custom",
-                        "analysis.analyzer.my_analyzer.tokenizer", "whitespace",
-                        "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
-                        "analysis.filter.custom_word_delimiter.type", "word_delimiter",
-                        "analysis.filter.custom_word_delimiter.generate_word_parts", "true",
-                        "analysis.filter.custom_word_delimiter.generate_number_parts", "false",
-                        "analysis.filter.custom_word_delimiter.catenate_numbers", "true",
-                        "analysis.filter.custom_word_delimiter.catenate_words", "false",
-                        "analysis.filter.custom_word_delimiter.split_on_case_change", "false",
-                        "analysis.filter.custom_word_delimiter.split_on_numerics", "false",
-                        "analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
-                .addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer"));
-
-        client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get();
-        refresh();
-
-        SearchResponse response = client()
-                .prepareSearch("test")
-                .setQuery(
-                        queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
-                                .field("field1").field("field2")).get();
-        assertHitCount(response, 1L);
-    }
-
     // see #3797
     public void testMultiMatchLenientIssue3797() {
         createIndex("test");
modules/analysis-common/build.gradle (new file, 23 lines):

@@ -0,0 +1,23 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+esplugin {
+  description 'Adds "built in" analyzers to Elasticsearch.'
+  classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
+}
ASCIIFoldingTokenFilterFactory.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@@ -25,20 +25,26 @@ import org.elasticsearch.common.ParseField;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;

 /**
  * Factory for ASCIIFoldingFilter.
  */
-public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
-    public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
-    public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
+public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
+        implements MultiTermAwareComponent {
+    public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
+    public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;

     private final boolean preserveOriginal;

-    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
+            String name, Settings settings) {
         super(indexSettings, name, settings);
         preserveOriginal = settings.getAsBooleanLenientForPreEs6Indices(
-            indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
+            indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(),
+            DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
     }

     @Override
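Note: the only option this factory exposes is preserve_original (default false). Below is a minimal sketch of index settings that would configure it, reusing the my_ascii_folding name from the unit tests further down in this diff; the class and setting names in the sketch are illustrative, not part of the change.

import org.elasticsearch.common.settings.Settings;

public class AsciiFoldingSettingsExample {
    // Illustrative only: builds the same kind of settings the tests below pass to
    // AnalysisTestsHelper. With preserve_original=true the filter keeps the unfolded token
    // (e.g. "Ansprüche") alongside the folded one ("Anspruche").
    static Settings exampleIndexSettings() {
        return Settings.builder()
                .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
                .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
                .build();
    }
}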
CommonAnalysisPlugin.java (new file, 39 lines)

@@ -0,0 +1,39 @@
/* Apache 2.0 license header (identical to the one in build.gradle above) */

package org.elasticsearch.analysis.common;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;

public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        return filters;
    }
}
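Note: getTokenFilters() is the AnalysisPlugin extension point this change is meant to prove out; each entry maps a filter name to an AnalysisProvider, satisfied here by the factories' four-argument constructors. A hypothetical third-party plugin could register a filter the same way; the sketch below is illustrative only, and the package, class, and filter names in it are made up.

package org.example.analysis; // hypothetical package, not part of this change

import java.util.HashMap;
import java.util.Map;

import org.elasticsearch.analysis.common.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
        // "my_asciifolding" is an illustrative name; the constructor reference satisfies
        // AnalysisProvider exactly as CommonAnalysisPlugin does above.
        filters.put("my_asciifolding", ASCIIFoldingTokenFilterFactory::new);
        return filters;
    }
}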
WordDelimiterGraphTokenFilterFactory.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
@@ -26,20 +26,22 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;

 import java.util.List;
 import java.util.Set;

-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
-import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
-import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
+import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
+import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.parseTypes;

 public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {

@@ -47,7 +49,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
     private final int flags;
     private final CharArraySet protoWords;

-    public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
+            String name, Settings settings) {
         super(indexSettings, name, settings);

         // Sample Format for the type table:
@@ -82,7 +85,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
-        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
+        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
+                settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
     }
WordDelimiterTokenFilterFactory.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
@@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.Analysis;

 import java.util.Collection;
 import java.util.List;
@@ -52,7 +54,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
     private final int flags;
     private final CharArraySet protoWords;

-    public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
+            String name, Settings settings) {
         super(indexSettings, name, settings);

         // Sample Format for the type table:
@@ -87,7 +90,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
         flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
-        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
+        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
+                settings, "protected_words");
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
         this.flags = flags;
     }
@@ -101,7 +105,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
     }

     public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
-        if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), key, defaultValue, deprecationLogger)) {
+        if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(),
+                key, defaultValue, deprecationLogger)) {
             return flag;
         }
         return 0;
@@ -122,14 +127,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
             String lhs = parseString(m.group(1).trim());
             Byte rhs = parseType(m.group(2).trim());
             if (lhs.length() != 1)
-                throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+                throw new RuntimeException("Invalid Mapping Rule : ["
+                        + rule + "]. Only a single character is allowed.");
             if (rhs == null)
                 throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
             typeMap.put(lhs.charAt(0), rhs);
         }

         // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
-        byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
+        byte types[] = new byte[Math.max(
+                typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
         for (int i = 0; i < types.length; i++)
             types[i] = WordDelimiterIterator.getType(i);
         for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
@@ -196,4 +203,9 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         }
         return new String(out, 0, writePos);
     }
+
+    @Override
+    public boolean breaksFastVectorHighlighter() {
+        return true;
+    }
 }
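Note: parseTypes() accepts rules of the form "<single character> => <type>" and rejects anything longer on the left-hand side. The sketch below shows settings for a word_delimiter filter with a custom type table, using the same rule syntax the integration tests later in this diff rely on; the filter and analyzer names are illustrative assumptions.

import org.elasticsearch.common.settings.Settings;

public class WordDelimiterTypeTableExample {
    // Illustrative only: each type_table entry maps exactly one character to a word-delimiter
    // type. A rule like "&& => ALPHANUM" would fail with "Only a single character is allowed."
    static Settings exampleIndexSettings() {
        return Settings.builder()
                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
                .putArray("index.analysis.filter.my_word_delimiter.type_table",
                        "& => ALPHANUM", "- => ALPHANUM")
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
                .putArray("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter")
                .build();
    }
}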
ASCIIFoldingTokenFilterFactoryTests.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -17,12 +17,15 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;

@@ -31,10 +34,12 @@ import java.io.StringReader;

 public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
     public void testDefault() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
         String source = "Ansprüche";
         String[] expected = new String[]{"Anspruche"};
@@ -44,11 +49,13 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
     }

     public void testPreserveOriginal() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
-            .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
+                        .put("index.analysis.filter.my_ascii_folding.preserve_original", true)
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
         String source = "Ansprüche";
         String[] expected = new String[]{"Anspruche", "Ansprüche"};
@@ -57,7 +64,8 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);

         // but the multi-term aware component still emits a single token
-        tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
+        tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter)
+                .getMultiTermComponent();
         tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
         expected = new String[]{"Anspruche"};
BaseWordDelimiterTokenFilterFactoryTestCase.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -16,13 +16,15 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;


 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;

@@ -30,7 +32,8 @@ import java.io.IOException;
 import java.io.StringReader;

 /**
- * Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory}
+ * Base class to test {@link WordDelimiterTokenFilterFactory} and
+ * {@link WordDelimiterGraphTokenFilterFactory}.
  */
 public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
     final String type;
@@ -40,10 +43,12 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
     }

     public void testDefault() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
         String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
@@ -54,44 +59,51 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
     }

     public void testCatenateWords() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
-            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
+        String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j",
+                "2", "se", "ONeil" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
     }

     public void testCatenateNumbers() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
-            .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                        .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
-                "se", "O", "Neil"};
+        String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000",
+                "j", "2", "se", "O", "Neil" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
     }

     public void testCatenateAll() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
-            .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
-            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                        .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
         String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
@@ -101,11 +113,13 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
     }

     public void testSplitOnCaseChange() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot";
         String[] expected = new String[]{"PowerShot"};
@@ -115,30 +129,35 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
     }

     public void testPreserveOriginal() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
-                "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
+        String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42",
+                "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
+                "O'Neil's", "O", "Neil" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
     }

     public void testStemEnglishPossessive() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
-                "se", "O", "Neil", "s"};
+        String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
+                "4000", "j", "2", "se", "O", "Neil", "s" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
CommonAnalysisClientYamlTestSuiteIT.java (new file, 36 lines)

@@ -0,0 +1,36 @@
/* Apache 2.0 license header (identical to the one in build.gradle above) */

package org.elasticsearch.analysis.common;

import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;

import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;

public class CommonAnalysisClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
    public CommonAnalysisClientYamlTestSuiteIT(@Name("yaml")ClientYamlTestCandidate testCandidate) {
        super(testCandidate);
    }

    @ParametersFactory
    public static Iterable<Object[]> parameters() throws Exception {
        return ESClientYamlSuiteTestCase.createParameters();
    }
}
CommonAnalysisFactoryTests.java (new file, 86 lines)

@@ -0,0 +1,86 @@
/* Apache 2.0 license header (identical to the one in build.gradle above) */

package org.elasticsearch.analysis.common;

import org.elasticsearch.AnalysisFactoryTestCase;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;

public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
    @Override
    protected Map<String, Class<?>> getTokenizers() {
        Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
        return tokenizers;
    }

    @Override
    protected Map<String, Class<?>> getTokenFilters() {
        Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
        filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class);
        filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class);
        return filters;
    }

    @Override
    protected Map<String, Class<?>> getCharFilters() {
        Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
        return filters;
    }

    /**
     * Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
     * hasn't been marked in this class with its proper factory.
     */
    public void testAllTokenizersMarked() {
        markedTestCase("char filter", getTokenizers());
    }

    /**
     * Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but
     * hasn't been marked in this class with its proper factory.
     */
    public void testAllCharFiltersMarked() {
        markedTestCase("char filter", getCharFilters());
    }

    /**
     * Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but
     * hasn't been marked in this class with its proper factory.
     */
    public void testAllTokenFiltersMarked() {
        markedTestCase("token filter", getTokenFilters());
    }

    private void markedTestCase(String name, Map<String, Class<?>> map) {
        List<String> unmarked = map.entrySet().stream()
                .filter(e -> e.getValue() == MovedToAnalysisCommon.class)
                .map(Map.Entry::getKey)
                .sorted()
                .collect(toList());
        assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common "
                + "but not mapped here", emptyList(), unmarked);
    }
}
HighlighterWithAnalyzersTests.java (new file, 154 lines)

@@ -0,0 +1,154 @@
/* Apache 2.0 license header (identical to the one in build.gradle above) */

package org.elasticsearch.analysis.common;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.test.ESIntegTestCase;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.startsWith;

public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return Arrays.asList(CommonAnalysisPlugin.class);
    }

    public void testNgramHighlightingWithBrokenPositions() throws IOException {
        assertAcked(prepareCreate("test")
                .addMapping("test", jsonBuilder()
                        .startObject()
                            .startObject("test")
                                .startObject("properties")
                                    .startObject("name")
                                        .field("type", "text")
                                        .startObject("fields")
                                            .startObject("autocomplete")
                                                .field("type", "text")
                                                .field("analyzer", "autocomplete")
                                                .field("search_analyzer", "search_autocomplete")
                                                .field("term_vector", "with_positions_offsets")
                                            .endObject()
                                        .endObject()
                                    .endObject()
                                .endObject()
                            .endObject()
                        .endObject())
                .setSettings(Settings.builder()
                        .put(indexSettings())
                        .put("analysis.tokenizer.autocomplete.max_gram", 20)
                        .put("analysis.tokenizer.autocomplete.min_gram", 1)
                        .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
                        .put("analysis.tokenizer.autocomplete.type", "nGram")
                        .put("analysis.filter.wordDelimiter.type", "word_delimiter")
                        .putArray("analysis.filter.wordDelimiter.type_table",
                                "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
                                "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM",
                                "# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM",
                                ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM",
                                "/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM",
                                "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
                                "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM",
                                "{ => ALPHANUM")
                        .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
                        .put("analysis.filter.wordDelimiter.generate_word_parts", true)
                        .put("analysis.filter.wordDelimiter.generate_number_parts", false)
                        .put("analysis.filter.wordDelimiter.catenate_words", true)
                        .put("analysis.filter.wordDelimiter.catenate_numbers", true)
                        .put("analysis.filter.wordDelimiter.catenate_all", false)

                        .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
                        .putArray("analysis.analyzer.autocomplete.filter",
                                "lowercase", "wordDelimiter")
                        .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
                        .putArray("analysis.analyzer.search_autocomplete.filter",
                                "lowercase", "wordDelimiter")));
        client().prepareIndex("test", "test", "1")
                .setSource("name", "ARCOTEL Hotels Deutschland").get();
        refresh();
        SearchResponse search = client().prepareSearch("test").setTypes("test")
                .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
                .highlighter(new HighlightBuilder().field("name.autocomplete")).get();
        assertHighlight(search, 0, "name.autocomplete", 0,
                equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
    }

    public void testMultiPhraseCutoff() throws IOException {
        /*
         * MultiPhraseQuery can literally kill an entire node if there are too many terms in the
         * query. We cut off and extract terms if there are more than 16 terms in the query
         */
        assertAcked(prepareCreate("test")
                .addMapping("test", "body", "type=text,analyzer=custom_analyzer,"
                        + "search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
                .setSettings(
                        Settings.builder().put(indexSettings())
                                .put("analysis.filter.wordDelimiter.type", "word_delimiter")
                                .put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
                                .put("analysis.filter.wordDelimiter.generate_word_parts", true)
                                .put("analysis.filter.wordDelimiter.generate_number_parts", true)
                                .put("analysis.filter.wordDelimiter.catenate_words", true)
                                .put("analysis.filter.wordDelimiter.catenate_numbers", true)
                                .put("analysis.filter.wordDelimiter.catenate_all", false)
                                .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
                                .putArray("analysis.analyzer.custom_analyzer.filter",
                                        "lowercase", "wordDelimiter"))
        );

        ensureGreen();
        client().prepareIndex("test", "test", "1")
                .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org "
                        + "http://xing.com http://cnn.com http://quora.com http://twitter.com this is "
                        + "a test for highlighting feature Test: http://www.facebook.com "
                        + "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
                        + "http://twitter.com this is a test for highlighting feature")
                .get();
        refresh();
        SearchResponse search = client().prepareSearch()
                .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
                .highlighter(new HighlightBuilder().field("body")).get();
        assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
        search = client()
                .prepareSearch()
                .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "
                        + "http://elasticsearch.org http://xing.com http://cnn.com "
                        + "http://quora.com http://twitter.com this is a test for highlighting "
                        + "feature Test: http://www.facebook.com http://elasticsearch.org "
                        + "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
                        + "is a test for highlighting feature"))
                .highlighter(new HighlightBuilder().field("body")).execute().actionGet();
        assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
                + "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
                + "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
    }
}
QueryStringWithAnalyzersTests.java (new file, 72 lines)

@@ -0,0 +1,72 @@
/* Apache 2.0 license header (identical to the one in build.gradle above) */

package org.elasticsearch.analysis.common;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;

import java.util.Arrays;
import java.util.Collection;

import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;

public class QueryStringWithAnalyzersTests extends ESIntegTestCase {
    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return Arrays.asList(CommonAnalysisPlugin.class);
    }

    /**
     * Validates that we properly split fields using the word delimiter filter in query_string.
     */
    public void testCustomWordDelimiterQueryString() {
        assertAcked(client().admin().indices().prepareCreate("test")
                .setSettings("analysis.analyzer.my_analyzer.type", "custom",
                        "analysis.analyzer.my_analyzer.tokenizer", "whitespace",
                        "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
                        "analysis.filter.custom_word_delimiter.type", "word_delimiter",
                        "analysis.filter.custom_word_delimiter.generate_word_parts", "true",
                        "analysis.filter.custom_word_delimiter.generate_number_parts", "false",
                        "analysis.filter.custom_word_delimiter.catenate_numbers", "true",
                        "analysis.filter.custom_word_delimiter.catenate_words", "false",
                        "analysis.filter.custom_word_delimiter.split_on_case_change", "false",
                        "analysis.filter.custom_word_delimiter.split_on_numerics", "false",
                        "analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
                .addMapping("type1",
                        "field1", "type=text,analyzer=my_analyzer",
                        "field2", "type=text,analyzer=my_analyzer"));

        client().prepareIndex("test", "type1", "1").setSource(
                "field1", "foo bar baz",
                "field2", "not needed").get();
        refresh();

        SearchResponse response = client()
                .prepareSearch("test")
                .setQuery(
                        queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
                                .field("field1").field("field2")).get();
        assertHitCount(response, 1L);
    }
}
WordDelimiterGraphTokenFilterFactoryTests.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -16,52 +16,62 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;


 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;

 import java.io.IOException;
 import java.io.StringReader;

-public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
+public class WordDelimiterGraphTokenFilterFactoryTests
+        extends BaseWordDelimiterTokenFilterFactoryTestCase {
     public WordDelimiterGraphTokenFilterFactoryTests() {
         super("word_delimiter_graph");
     }

     public void testMultiTerms() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
-            .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+                        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+                        .build(),
+                new CommonAnalysisPlugin());
+
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
-        String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
-            "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
-            "ONeil", "O'Neil's", "O", "Neil" };
+        String[] expected = new String[] { "PowerShot", "PowerShot", "Power", "Shot", "50042",
+                "500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi",
+                "fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil" };
         Tokenizer tokenizer = new WhitespaceTokenizer();
         tokenizer.setReader(new StringReader(source));
-        int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
-        int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
+        int[] expectedIncr = new int[] { 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
+                1, 1, 1, 0, 0, 1 };
+        int[] expectedPosLen = new int[] { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3,
+                1, 1, 1, 2, 2, 1, 1 };
         assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
                 expectedIncr, expectedPosLen, null);
     }

-    /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
+    /**
+     * Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
+     */
     public void testPartsAndCatenate() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
-            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot";
         int[] expectedIncr = new int[]{1, 0, 1};
WordDelimiterTokenFilterFactoryTests.java (moved from org.elasticsearch.index.analysis to org.elasticsearch.analysis.common)

@@ -16,31 +16,38 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;


 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;

 import java.io.IOException;
 import java.io.StringReader;

-public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
+public class WordDelimiterTokenFilterFactoryTests
+        extends BaseWordDelimiterTokenFilterFactoryTestCase {
     public WordDelimiterTokenFilterFactoryTests() {
         super("word_delimiter");
     }

-    /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
+    /**
+     * Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
+     */
     public void testPartsAndCatenate() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
-            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
-            .put("index.analysis.filter.my_word_delimiter.type", type)
-            .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
-            .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
-            .build());
+        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
+                Settings.builder()
+                        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+                        .put("index.analysis.filter.my_word_delimiter.type", type)
+                        .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                        .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
+                        .build(),
+                new CommonAnalysisPlugin());
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
         String source = "PowerShot";
         String[] expected = new String[]{"Power", "PowerShot", "Shot" };
@@ -0,0 +1,11 @@
+"Module loaded":
+    - do:
+        cluster.state: {}
+
+    # Get master node id
+    - set: { master_node: master }
+
+    - do:
+        nodes.info: {}
+
+    - match: { nodes.$master.modules.0.name: analysis-common }
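This new REST test reads the master node id from the cluster state and then asserts that the nodes-info response lists the new module for that node. The `nodes.$master.modules.0.name` path walks a response shaped roughly like the sketch below (abridged; only the module name is asserted here, the surrounding structure is an assumption about the nodes-info output):

{
  "nodes": {
    "<master-node-id>": {
      "modules": [
        { "name": "analysis-common" }
      ]
    }
  }
}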
@@ -0,0 +1,11 @@
+## Smoke tests for analyzers included in the analysis-common module
+
+"whitespace":
+    - do:
+        indices.analyze:
+            body:
+                text: Foo Bar!
+                analyzer: whitespace
+    - length: { tokens: 2 }
+    - match: { tokens.0.token: Foo }
+    - match: { tokens.1.token: Bar! }
@@ -0,0 +1,27 @@
+## Smoke tests for tokenizers included in the analysis-common module
+
+"keyword":
+    - do:
+        indices.analyze:
+            body:
+                text: Foo Bar!
+                tokenizer: keyword
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: Foo Bar! }
+
+---
+"nGram":
+    - do:
+        indices.analyze:
+            body:
+                text: good
+                explain: true
+                tokenizer:
+                    type: nGram
+                    min_gram: 2
+                    max_gram: 2
+    - length: { detail.tokenizer.tokens: 3 }
+    - match: { detail.tokenizer.name: _anonymous_tokenizer }
+    - match: { detail.tokenizer.tokens.0.token: go }
+    - match: { detail.tokenizer.tokens.1.token: oo }
+    - match: { detail.tokenizer.tokens.2.token: od }
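Each of these YAML cases is a thin wrapper around the analyze API. The nGram case above, for instance, reuses the same request body as the core REST test that this commit removes further down; expressed as a direct call it is roughly the following (sketch against the index-less _analyze endpoint):

POST /_analyze
{
  "text": "good",
  "explain": true,
  "tokenizer": { "type": "nGram", "min_gram": 2, "max_gram": 2 }
}

The `detail.tokenizer.*` assertions then walk the explain output of that call.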
@@ -0,0 +1,82 @@
+## Smoke tests for token filters included in the analysis-common module
+
+"asciifolding":
+    - do:
+        indices.analyze:
+            body:
+                text: Musée d'Orsay
+                tokenizer: keyword
+                filter: [asciifolding]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: Musee d'Orsay }
+
+---
+"lowercase":
+    - do:
+        indices.analyze:
+            body:
+                text: Foo Bar!
+                tokenizer: keyword
+                filter: [lowercase]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: foo bar! }
+
+---
+"word_delimiter":
+    - do:
+        indices.analyze:
+            body:
+                text: the qu1ck brown fox
+                tokenizer: standard
+                filter: [word_delimiter]
+    - length: { tokens: 6 }
+    - match: { tokens.0.token: the }
+    - match: { tokens.1.token: qu }
+    - match: { tokens.2.token: "1" }
+    - match: { tokens.3.token: ck }
+    - match: { tokens.4.token: brown }
+    - match: { tokens.5.token: fox }
+
+    - do:
+        indices.analyze:
+            body:
+                text: the qu1ck brown fox
+                tokenizer: standard
+                filter:
+                    - type: word_delimiter
+                      split_on_numerics: false
+    - length: { tokens: 4 }
+    - match: { tokens.0.token: the }
+    - match: { tokens.1.token: qu1ck }
+    - match: { tokens.2.token: brown }
+    - match: { tokens.3.token: fox }
+
+---
+"word_delimiter_graph":
+    - do:
+        indices.analyze:
+            body:
+                text: the qu1ck brown fox
+                tokenizer: standard
+                filter: [word_delimiter_graph]
+    - length: { tokens: 6 }
+    - match: { tokens.0.token: the }
+    - match: { tokens.1.token: qu }
+    - match: { tokens.2.token: "1" }
+    - match: { tokens.3.token: ck }
+    - match: { tokens.4.token: brown }
+    - match: { tokens.5.token: fox }
+
+    - do:
+        indices.analyze:
+            body:
+                text: the qu1ck brown fox
+                tokenizer: standard
+                filter:
+                    - type: word_delimiter_graph
+                      split_on_numerics: false
+    - length: { tokens: 4 }
+    - match: { tokens.0.token: the }
+    - match: { tokens.1.token: qu1ck }
+    - match: { tokens.2.token: brown }
+    - match: { tokens.3.token: fox }
@@ -0,0 +1,13 @@
+## Smoke tests for analyzers included in the analysis-common module
+
+"mapping":
+    - do:
+        indices.analyze:
+            body:
+                text: jeff quit phish
+                tokenizer: keyword
+                char_filter:
+                    - type: mapping
+                      mappings: ["ph => f", "qu => q"]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: "jeff qit fish" }
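The mapping char filter case mirrors the "Custom char_filter in request" test that this commit removes from the core analyze tests below. As a plain analyze API request it is roughly (sketch):

POST /_analyze
{
  "text": "jeff quit phish",
  "char_filter": [ { "type": "mapping", "mappings": ["ph => f", "qu => q"] } ],
  "tokenizer": "keyword"
}

which should yield the single keyword token "jeff qit fish".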
@@ -19,14 +19,9 @@

 package org.elasticsearch.index.analysis;

-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.AnalysisFactoryTestCase;
 import org.elasticsearch.Version;
@@ -37,6 +32,10 @@ import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;

+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
 public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase {

     @Override
@@ -1,29 +1,11 @@
-# Will be performed before each test as a part of the test setup
-#
-setup:
-  - do:
-      ping: {}
-
----
 "Basic test":
     - do:
         indices.analyze:
             body:
                 text: Foo Bar
     - length: { tokens: 2 }
     - match: { tokens.0.token: foo }
     - match: { tokens.1.token: bar }

----
-"Tokenizer and filter":
-    - do:
-        indices.analyze:
-            body:
-                filter: [lowercase]
-                text: Foo Bar
-                tokenizer: keyword
-    - length: { tokens: 1 }
-    - match: { tokens.0.token: foo bar }
-
 ---
 "Index and field":
@@ -36,7 +18,7 @@ setup:
           properties:
             text:
               type: text
-              analyzer: whitespace
+              analyzer: standard

     - do:
         indices.analyze:
@@ -45,84 +27,51 @@ setup:
             field: text
             text: Foo Bar!
     - length: { tokens: 2 }
-    - match: { tokens.0.token: Foo }
-    - match: { tokens.1.token: Bar! }
----
-"JSON in Body":
-    - do:
-        indices.analyze:
-            body: { "text": "Foo Bar", "filter": ["lowercase"], "tokenizer": keyword }
-    - length: {tokens: 1 }
-    - match: { tokens.0.token: foo bar }
+    - match: { tokens.0.token: foo }
+    - match: { tokens.1.token: bar }

 ---
 "Array text":
     - do:
         indices.analyze:
-            body: { "text": ["Foo Bar", "Baz"], "filter": ["lowercase"], "tokenizer": keyword }
-    - length: {tokens: 2 }
-    - match: { tokens.0.token: foo bar }
-    - match: { tokens.1.token: baz }
+            body:
+                text: ["Foo Bar", "Baz"]
+                tokenizer: standard
+    - length: { tokens: 3 }
+    - match: { tokens.0.token: Foo }
+    - match: { tokens.1.token: Bar }
+    - match: { tokens.2.token: Baz }

 ---
 "Detail response with Analyzer":
     - do:
         indices.analyze:
-            body: {"text": "This is troubled", "analyzer": standard, "explain": "true"}
+            body:
+                text: This is troubled
+                analyzer: standard
+                explain: true
     - length: { detail.analyzer.tokens: 3 }
     - match: { detail.analyzer.name: standard }
     - match: { detail.analyzer.tokens.0.token: this }
     - match: { detail.analyzer.tokens.1.token: is }
     - match: { detail.analyzer.tokens.2.token: troubled }
----
-"Detail output spcified attribute":
-    - do:
-        indices.analyze:
-            body: {"text": "<text>This is troubled</text>", "char_filter": ["html_strip"], "filter": ["snowball"], "tokenizer": standard, "explain": true, "attributes": ["keyword"]}
-    - length: { detail.charfilters: 1 }
-    - length: { detail.tokenizer.tokens: 3 }
-    - length: { detail.tokenfilters.0.tokens: 3 }
-    - match: { detail.tokenizer.name: standard }
-    - match: { detail.tokenizer.tokens.0.token: This }
-    - match: { detail.tokenizer.tokens.1.token: is }
-    - match: { detail.tokenizer.tokens.2.token: troubled }
-    - match: { detail.tokenfilters.0.name: snowball }
-    - match: { detail.tokenfilters.0.tokens.0.token: This }
-    - match: { detail.tokenfilters.0.tokens.1.token: is }
-    - match: { detail.tokenfilters.0.tokens.2.token: troubl }
-    - match: { detail.tokenfilters.0.tokens.2.keyword: false }

 ---
 "Custom filter in request":
     - do:
         indices.analyze:
-            body: { "text": "Foo Bar Buzz", "filter": ["lowercase", { "type": "stop", "stopwords": ["foo", "buzz"]}], "tokenizer": whitespace, "explain": true }
-    - length: {detail.tokenizer.tokens: 3 }
-    - length: {detail.tokenfilters.0.tokens: 3 }
-    - length: {detail.tokenfilters.1.tokens: 1 }
-    - match: { detail.tokenizer.name: whitespace }
-    - match: { detail.tokenizer.tokens.0.token: Foo }
-    - match: { detail.tokenizer.tokens.1.token: Bar }
-    - match: { detail.tokenizer.tokens.2.token: Buzz }
-    - match: { detail.tokenfilters.0.name: lowercase }
-    - match: { detail.tokenfilters.0.tokens.0.token: foo }
-    - match: { detail.tokenfilters.0.tokens.1.token: bar }
-    - match: { detail.tokenfilters.0.tokens.2.token: buzz }
-    - match: { detail.tokenfilters.1.name: "_anonymous_tokenfilter_[1]" }
-    - match: { detail.tokenfilters.1.tokens.0.token: bar }
----
-"Custom char_filter in request":
-    - do:
-        indices.analyze:
-            body: { "text": "jeff quit phish", "char_filter": [{"type": "mapping", "mappings": ["ph => f", "qu => q"]}], "tokenizer": keyword }
-    - length: {tokens: 1 }
-    - match: { tokens.0.token: "jeff qit fish" }
-
----
-"Custom tokenizer in request":
-    - do:
-        indices.analyze:
-            body: { "text": "good", "tokenizer": {"type": "nGram", "min_gram": 2, "max_gram": 2}, "explain": true }
-    - length: {detail.tokenizer.tokens: 3 }
-    - match: { detail.tokenizer.name: _anonymous_tokenizer }
-    - match: { detail.tokenizer.tokens.0.token: go }
-    - match: { detail.tokenizer.tokens.1.token: oo }
-    - match: { detail.tokenizer.tokens.2.token: od }
+            body:
+                text: foo bar buzz
+                tokenizer: standard
+                explain: true
+                filter:
+                    - type: stop
+                      stopwords: ["foo", "buzz"]
+    - length: { detail.tokenizer.tokens: 3 }
+    - length: { detail.tokenfilters.0.tokens: 1 }
+    - match: { detail.tokenizer.name: standard }
+    - match: { detail.tokenizer.tokens.0.token: foo }
+    - match: { detail.tokenizer.tokens.1.token: bar }
+    - match: { detail.tokenizer.tokens.2.token: buzz }
+    - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
+    - match: { detail.tokenfilters.0.tokens.0.token: bar }
@@ -26,14 +26,15 @@ List projects = [
   'test:fixtures:hdfs-fixture',
   'test:logger-usage',
   'modules:aggs-matrix-stats',
+  'modules:analysis-common',
   'modules:ingest-common',
   'modules:lang-expression',
   'modules:lang-mustache',
   'modules:lang-painless',
-  'modules:transport-netty4',
-  'modules:reindex',
   'modules:percolator',
+  'modules:reindex',
   'modules:repository-url',
+  'modules:transport-netty4',
   'plugins:analysis-icu',
   'plugins:analysis-kuromoji',
   'plugins:analysis-phonetic',
@@ -20,14 +20,12 @@
 package org.elasticsearch;

 import org.apache.lucene.analysis.en.PorterStemFilterFactory;
-import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.elasticsearch.common.collect.MapBuilder;
-import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
 import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
@@ -92,7 +90,6 @@ import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
 import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
 import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
 import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
-import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
 import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
 import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
@@ -110,7 +107,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
- * Alerts us if new analyzers are added to lucene, so we don't miss them.
+ * Alerts us if new analysis components are added to Lucene, so we don't miss them.
  * <p>
  * If we don't want to expose one for a specific reason, just map it to Void.
  * The deprecated ones can be mapped to Deprecated.class.
@@ -178,7 +175,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("apostrophe", ApostropheFilterFactory.class)
         .put("arabicnormalization", ArabicNormalizationFilterFactory.class)
         .put("arabicstem", ArabicStemTokenFilterFactory.class)
-        .put("asciifolding", ASCIIFoldingTokenFilterFactory.class)
+        .put("asciifolding", MovedToAnalysisCommon.class)
         .put("brazilianstem", BrazilianStemTokenFilterFactory.class)
         .put("bulgarianstem", StemmerTokenFilterFactory.class)
         .put("cjkbigram", CJKBigramFilterFactory.class)
@@ -253,8 +250,8 @@ public class AnalysisFactoryTestCase extends ESTestCase {
         .put("turkishlowercase", LowerCaseTokenFilterFactory.class)
         .put("type", KeepTypesFilterFactory.class)
         .put("uppercase", UpperCaseTokenFilterFactory.class)
-        .put("worddelimiter", WordDelimiterTokenFilterFactory.class)
-        .put("worddelimitergraph", WordDelimiterGraphFilterFactory.class)
+        .put("worddelimiter", MovedToAnalysisCommon.class)
+        .put("worddelimitergraph", MovedToAnalysisCommon.class)
         .put("flattengraph", FlattenGraphTokenFilterFactory.class)

         // TODO: these tokenfilters are not yet exposed: useful?
@@ -401,6 +398,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
             }
         }
         expected.remove(Void.class);
+        expected.remove(MovedToAnalysisCommon.class);
         expected.remove(Deprecated.class);

         Collection<Class<?>> actual = new HashSet<>();
@@ -489,4 +487,11 @@ public class AnalysisFactoryTestCase extends ESTestCase {
                 classesThatShouldNotHaveMultiTermSupport.isEmpty());
     }

+    /**
+     * Marker class for components that have moved to the analysis-common modules. This will be
+     * removed when the module is complete and these analysis components aren't available to core.
+     */
+    protected static final class MovedToAnalysisCommon {
+        private MovedToAnalysisCommon() {}
+    }
 }
@@ -25,17 +25,18 @@ import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.indices.analysis.AnalysisModule;
+import org.elasticsearch.plugins.AnalysisPlugin;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;

 import java.io.IOException;
 import java.nio.file.Path;
-import static java.util.Collections.emptyList;
+import java.util.Arrays;

 public class AnalysisTestsHelper {

-    public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, String resource) throws IOException {
+    public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir,
+            String resource) throws IOException {
         Settings settings = Settings.builder()
             .loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
             .put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
@@ -45,12 +46,15 @@ public class AnalysisTestsHelper {
     }

     public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(
-            Settings settings) throws IOException {
+            Settings settings, AnalysisPlugin... plugins) throws IOException {
         if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) {
-            settings = Settings.builder().put(settings).put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
+            settings = Settings.builder().put(settings)
+                    .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
         }
         IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
-        AnalysisRegistry analysisRegistry = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry();
+        AnalysisRegistry analysisRegistry =
+                new AnalysisModule(new Environment(settings), Arrays.asList(plugins))
+                        .getAnalysisRegistry();
         return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings),
             analysisRegistry.buildTokenFilterFactories(indexSettings),
             analysisRegistry.buildTokenizerFactories(indexSettings),