Start building analysis-common module (#23614)

Start moving built in analysis components into the new analysis-common
module. The goals of this project are:
1. Remove core's dependency on lucene-analyzers-common.jar, which should
shrink the dependencies for the transport client and the high level REST client.
2. Prove that analysis plugins can do all the "built in" things by moving all
"built in" behavior to a plugin (see the minimal registration sketch below).
3. Force tests not to depend on any oddball analyzer behavior. If tests
need anything more than the standard analyzer, they can use the mock
analyzer provided by Lucene's test infrastructure.
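For context on goal 2: the AnalysisPlugin extension point (implemented by the new CommonAnalysisPlugin further down in this diff) lets any plugin register token filter factories by name, the same way core registers its built in ones. A minimal sketch, assuming a hypothetical plugin, package, and filter name; only the Elasticsearch and Lucene types are taken from this commit:

package org.example.analysis; // hypothetical package, for illustration only

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import java.util.Map;
import static java.util.Collections.singletonMap;

public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {
    // Built like the factories this commit moves out of core; the four-argument
    // constructor matches the AnalysisProvider functional interface, so the
    // constructor reference below can be registered directly.
    static class MyFoldingTokenFilterFactory extends AbstractTokenFilterFactory {
        MyFoldingTokenFilterFactory(IndexSettings indexSettings, Environment env,
                String name, Settings settings) {
            super(indexSettings, name, settings);
        }
        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new ASCIIFoldingFilter(tokenStream);
        }
    }
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        // "my_folding" is the name users reference from their index analysis
        // settings, e.g. index.analysis.filter.f.type = my_folding
        return singletonMap("my_folding", MyFoldingTokenFilterFactory::new);
    }
}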
Nik Everett 2017-04-19 18:51:34 -04:00 committed by GitHub
parent 151a65ed17
commit caf376c8af
33 changed files with 959 additions and 476 deletions

View File

@ -1096,7 +1096,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]MergeSchedulerConfig.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SearchSlowLog.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionType.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractCharFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractIndexAnalyzerProvider.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AbstractTokenFilterFactory.java" checks="LineLength" />
@ -1225,8 +1224,6 @@
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]UpperCaseTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceAnalyzerProvider.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WhitespaceTokenizerFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]AbstractCompoundWordTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]DictionaryCompoundWordTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]compound[/\\]HyphenationCompoundWordTokenFilterFactory.java" checks="LineLength" />
@ -2686,11 +2683,8 @@
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]SettingsListenerIT.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]VersionTypeTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]WaitUntilRefreshIT.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]ASCIIFoldingTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisRegistryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]AnalysisTestsHelper.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]BaseWordDelimiterTokenFilterFactoryTestCase.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CJKFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CharFilterTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]CompoundAnalysisTests.java" checks="LineLength" />
@ -2709,8 +2703,6 @@
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StemmerTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopAnalyzerTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]StopTokenFilterTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterGraphTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]WordDelimiterTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]commongrams[/\\]CommonGramsTokenFilterFactoryTests.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]filter1[/\\]MyFilterTokenFilterFactory.java" checks="LineLength" />
<suppress files="core[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]analysis[/\\]synonyms[/\\]SynonymsAnalysisTests.java" checks="LineLength" />

View File

@ -71,4 +71,9 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
return result;
}
@Override
public boolean breaksFastVectorHighlighter() {
return true;
}
}

View File

@ -20,10 +20,20 @@
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter;
public interface TokenFilterFactory {
String name();
TokenStream create(TokenStream tokenStream);
/**
* Does this analyzer mess up the {@link OffsetAttribute}s in such a way as to break the
* {@link FastVectorHighlighter}? If this is {@code true} then the
* {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets.
*/
default boolean breaksFastVectorHighlighter() {
return false;
}
}
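The default keeps existing factories unchanged; a factory whose filter rewrites offsets simply overrides it, as the edge ngram and word delimiter factories do elsewhere in this commit. A minimal sketch with an illustrative class, package, and filter name (the gram sizes are arbitrary):

package org.example.analysis; // hypothetical package, for illustration only

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.elasticsearch.index.analysis.TokenFilterFactory;

public class EdgeNGramLikeTokenFilterFactory implements TokenFilterFactory {
    @Override
    public String name() {
        return "edge_ngram_like"; // illustrative name
    }
    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Edge ngram filters are one of the cases FragmentBuilderHelper used to
        // special-case by class; with this change the factory declares it itself.
        return new EdgeNGramTokenFilter(tokenStream, 1, 20);
    }
    @Override
    public boolean breaksFastVectorHighlighter() {
        // Tells the FastVectorHighlighter to fall back to its offset-resorting
        // workaround for fields analyzed with this filter.
        return true;
    }
}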

View File

@ -25,7 +25,6 @@ import org.elasticsearch.common.NamedRegistry;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
@ -140,8 +139,6 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.plugins.AnalysisPlugin;
@ -205,7 +202,6 @@ public final class AnalysisModule {
NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
tokenFilters.register("stop", StopTokenFilterFactory::new);
tokenFilters.register("reverse", ReverseTokenFilterFactory::new);
tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new);
tokenFilters.register("length", LengthTokenFilterFactory::new);
tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new);
tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new);
@ -225,8 +221,6 @@ public final class AnalysisModule {
tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
tokenFilters.register("snowball", SnowballTokenFilterFactory::new);
tokenFilters.register("stemmer", StemmerTokenFilterFactory::new);
tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new);
tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new);
tokenFilters.register("elision", ElisionTokenFilterFactory::new);
tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new);

View File

@ -26,15 +26,9 @@ import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo;
import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.Version;
import org.elasticsearch.index.analysis.CustomAnalyzer;
import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory;
import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
import org.elasticsearch.index.analysis.NGramTokenFilterFactory;
import org.elasticsearch.index.analysis.NGramTokenizerFactory;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.mapper.FieldMapper;
import java.util.Comparator;
@ -56,7 +50,7 @@ public final class FragmentBuilderHelper {
public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) {
assert fragInfo != null : "FragInfo must not be null";
assert mapper.fieldType().name().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name();
if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.fieldType().indexAnalyzer()))) {
if (!fragInfo.getSubInfos().isEmpty() && containsBrokenAnalysis(mapper.fieldType().indexAnalyzer())) {
/* This is a special case where broken analysis like WDF is used for term-vector creation at index-time
* which can potentially mess up the offsets. To prevent a SAIIOBException we need to re-sort
* the fragments based on their offsets rather than using solely the positions, as is done in
@ -91,8 +85,7 @@ public final class FragmentBuilderHelper {
final CustomAnalyzer a = (CustomAnalyzer) analyzer;
TokenFilterFactory[] tokenFilters = a.tokenFilters();
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory
|| tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) {
if (tokenFilterFactory.breaksFastVectorHighlighter()) {
return true;
}
}

View File

@ -18,6 +18,8 @@
*/
package org.elasticsearch.action.admin.indices;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
@ -27,18 +29,28 @@ import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.mapper.AllFieldMapper;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
/**
* Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the
* {@code analysis-common} module.
*/
public class TransportAnalyzeActionTests extends ESTestCase {
private IndexAnalyzers indexAnalyzers;
@ -53,23 +65,28 @@ public class TransportAnalyzeActionTests extends ESTestCase {
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID())
.put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
.put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
.put("index.analysis.tokenizer.trigram.type", "ngram")
.put("index.analysis.tokenizer.trigram.min_gram", 3)
.put("index.analysis.tokenizer.trigram.max_gram", 3)
.put("index.analysis.filter.synonym.type", "synonym")
.putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay")
.put("index.analysis.filter.synonym.tokenizer", "trigram")
.put("index.analysis.filter.synonym.min_gram", 3)
.put("index.analysis.filter.synonym.max_gram", 3).build();
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard")
.put("index.analysis.analyzer.custom_analyzer.filter", "mock").build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
environment = new Environment(settings);
registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry();
AnalysisPlugin plugin = new AnalysisPlugin() {
class MockFactory extends AbstractTokenFilterFactory {
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
}
}
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("mock", MockFactory::new);
}
};
registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry();
indexAnalyzers = registry.build(idxSettings);
}
@ -143,51 +160,44 @@ public class TransportAnalyzeActionTests extends ESTestCase {
}
public void testWithIndexAnalyzers() throws IOException {
AnalyzeRequest request = new AnalyzeRequest();
request.analyzer("standard");
request.text("the quick brown fox");
request.analyzer("custom_analyzer");
request.text("the qu1ck brown fox");
AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens();
assertEquals(4, tokens.size());
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
assertEquals("brown", tokens.get(1).getTerm());
assertEquals("fox", tokens.get(2).getTerm());
request.analyzer("whitespace");
request.text("the qu1ck brown fox-dog");
request.analyzer("standard");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
assertEquals(4, tokens.size());
request.analyzer("custom_analyzer");
request.text("the qu1ck brown fox-dog");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
assertEquals(5, tokens.size());
request.analyzer(null);
request.tokenizer("whitespace");
request.addTokenFilter("lowercase");
request.addTokenFilter("wordDelimiter");
request.text("the qu1ck brown fox-dog");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
assertEquals(5, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
assertEquals("qu1ck", tokens.get(1).getTerm());
assertEquals("quick", tokens.get(1).getTerm());
assertEquals("brown", tokens.get(2).getTerm());
assertEquals("fox", tokens.get(3).getTerm());
assertEquals("dog", tokens.get(4).getTerm());
// Switch the analyzer out for just a tokenizer
request.analyzer(null);
request.tokenizer("trigram");
request.addTokenFilter("synonym");
request.text("kimchy");
request.tokenizer("standard");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
assertEquals(2, tokens.size());
assertEquals("sha", tokens.get(0).getTerm());
assertEquals("hay", tokens.get(1).getTerm());
assertEquals(4, tokens.size());
assertEquals("the", tokens.get(0).getTerm());
assertEquals("quick", tokens.get(1).getTerm());
assertEquals("brown", tokens.get(2).getTerm());
assertEquals("fox", tokens.get(3).getTerm());
// Now try applying our token filter
request.addTokenFilter("mock");
analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment);
tokens = analyze.getTokens();
assertEquals(3, tokens.size());
assertEquals("quick", tokens.get(0).getTerm());
assertEquals("brown", tokens.get(1).getTerm());
assertEquals("fox", tokens.get(2).getTerm());
}
public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException {

View File

@ -22,5 +22,5 @@ package org.elasticsearch.index.analysis;
import org.elasticsearch.AnalysisFactoryTestCase;
public class AnalysisFactoryTests extends AnalysisFactoryTestCase {
// tests are inherited
// tests are inherited and nothing needs to be defined here
}

View File

@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.analysis.MockTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
@ -31,17 +32,20 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.indices.analysis.PreBuiltAnalyzers;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.VersionUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import static java.util.Collections.emptyList;
import static java.util.Collections.emptyMap;
import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
@ -112,51 +116,73 @@ public class AnalysisRegistryTests extends ESTestCase {
assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class));
}
/**
* Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide.
*/
public void testConfigureCamelCaseTokenFilter() throws IOException {
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
Settings indexSettings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("index.analysis.filter.wordDelimiter.type", "word_delimiter")
.put("index.analysis.filter.wordDelimiter.split_on_numerics", false)
.put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")
.put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build();
.put("index.analysis.filter.testFilter.type", "mock")
.put("index.analysis.filter.test_filter.type", "mock")
.put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard")
.putArray("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter")
.put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard")
.putArray("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter").build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry()
/* The snake_case version of the name should not filter out any stopwords while the
* camelCase version will filter out English stopwords. */
AnalysisPlugin plugin = new AnalysisPlugin() {
class MockFactory extends AbstractTokenFilterFactory {
MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
}
@Override
public TokenStream create(TokenStream tokenStream) {
if (name().equals("test_filter")) {
return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET);
}
return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET);
}
}
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
return singletonMap("mock", MockFactory::new);
}
};
IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), singletonList(plugin)).getAnalysisRegistry()
.build(idxSettings);
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) {
// This shouldn't contain English stopwords
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) {
assertNotNull(custom_analyser);
TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
List<String> token = new ArrayList<>();
while(tokenStream.incrementToken()) {
token.add(charTermAttribute.toString());
}
assertEquals(token.toString(), 2, token.size());
assertEquals("j2se", token.get(0));
assertEquals("j2ee", token.get(1));
assertTrue(tokenStream.incrementToken());
assertEquals("has", charTermAttribute.toString());
assertTrue(tokenStream.incrementToken());
assertEquals("foo", charTermAttribute.toString());
assertFalse(tokenStream.incrementToken());
}
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) {
// This *should* contain English stopwords
try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) {
assertNotNull(custom_analyser);
TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee");
TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo");
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
List<String> token = new ArrayList<>();
while(tokenStream.incrementToken()) {
token.add(charTermAttribute.toString());
}
assertEquals(token.toString(), 6, token.size());
assertEquals("j", token.get(0));
assertEquals("2", token.get(1));
assertEquals("se", token.get(2));
assertEquals("j", token.get(3));
assertEquals("2", token.get(4));
assertEquals("ee", token.get(5));
assertTrue(tokenStream.incrementToken());
assertEquals("has", charTermAttribute.toString());
assertTrue(tokenStream.incrementToken());
assertEquals("a", charTermAttribute.toString());
assertTrue(tokenStream.incrementToken());
assertEquals("foo", charTermAttribute.toString());
assertFalse(tokenStream.incrementToken());
}
}

View File

@ -19,6 +19,7 @@
package org.elasticsearch.search.fetch.subphase.highlight;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.search.join.ScoreMode;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchRequestBuilder;
@ -100,6 +101,7 @@ import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.startsWith;
public class HighlighterSearchIT extends ESIntegTestCase {
// TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests
private static final String[] ALL_TYPES = new String[] {"plain", "postings", "fvh", "unified"};
private static final String[] UNIFIED_AND_NULL = new String[] {null, "unified"};
@ -113,12 +115,11 @@ public class HighlighterSearchIT extends ESIntegTestCase {
mappings.startObject();
mappings.startObject("type")
.startObject("properties")
.startObject("text")
.field("type", "keyword")
.field("store", true)
.endObject()
.endObject()
.endObject();
.startObject("text")
.field("type", "keyword")
.field("store", true)
.endObject()
.endObject().endObject();
mappings.endObject();
assertAcked(prepareCreate("test")
.addMapping("type", mappings));
@ -139,14 +140,13 @@ public class HighlighterSearchIT extends ESIntegTestCase {
mappings.startObject();
mappings.startObject("type")
.startObject("properties")
.startObject("text")
.field("type", "text")
.field("analyzer", "keyword")
.field("index_options", "offsets")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject()
.endObject();
.startObject("text")
.field("type", "text")
.field("analyzer", "keyword")
.field("index_options", "offsets")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject().endObject();
mappings.endObject();
assertAcked(prepareCreate("test")
.addMapping("type", mappings));
@ -166,23 +166,22 @@ public class HighlighterSearchIT extends ESIntegTestCase {
mappings.startObject();
mappings.startObject("type")
.startObject("_source")
.field("enabled", false)
.field("enabled", false)
.endObject()
.startObject("properties")
.startObject("unstored_field")
.field("index_options", "offsets")
.field("term_vector", "with_positions_offsets")
.field("type", "text")
.field("store", false)
.endObject()
.startObject("text")
.field("index_options", "offsets")
.field("term_vector", "with_positions_offsets")
.field("type", "text")
.field("store", true)
.endObject()
.endObject()
.endObject();
.startObject("unstored_field")
.field("index_options", "offsets")
.field("term_vector", "with_positions_offsets")
.field("type", "text")
.field("store", false)
.endObject()
.startObject("text")
.field("index_options", "offsets")
.field("term_vector", "with_positions_offsets")
.field("type", "text")
.field("store", true)
.endObject()
.endObject().endObject();
mappings.endObject();
assertAcked(prepareCreate("test")
.addMapping("type", mappings));
@ -218,103 +217,6 @@ public class HighlighterSearchIT extends ESIntegTestCase {
assertHighlight(search, 0, "name", 0, startsWith("<em>abc</em> <em>abc</em> <em>abc</em> <em>abc</em>"));
}
public void testNgramHighlightingWithBrokenPositions() throws IOException {
assertAcked(prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.startObject("fields")
.startObject("autocomplete")
.field("type", "text")
.field("analyzer", "autocomplete")
.field("search_analyzer", "search_autocomplete")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject()
.field("type", "text")
.endObject()
.endObject()
.endObject()
.endObject())
.setSettings(Settings.builder()
.put(indexSettings())
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
.put("analysis.tokenizer.autocomplete.type", "nGram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putArray("analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM",
"+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM",
"^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", false)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
.putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter")
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
.putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter")));
client().prepareIndex("test", "test", "1")
.setSource("name", "ARCOTEL Hotels Deutschland").get();
refresh();
SearchResponse search = client().prepareSearch("test").setTypes("test")
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
.highlighter(new HighlightBuilder().field("name.autocomplete")).execute().actionGet();
assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
}
public void testMultiPhraseCutoff() throws IOException {
/*
* MultiPhraseQuery can literally kill an entire node if there are too many terms in the
* query. We cut off and extract terms if there are more than 16 terms in the query
*/
assertAcked(prepareCreate("test")
.addMapping("test",
"body", "type=text,analyzer=custom_analyzer,search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
.setSettings(
Settings.builder().put(indexSettings())
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
.putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter"))
);
ensureGreen();
client().prepareIndex("test", "test", "1")
.setSource("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
+ "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
+ "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
+ "http://twitter.com this is a test for highlighting feature")
.get();
refresh();
SearchResponse search = client().prepareSearch().setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
search = client()
.prepareSearch()
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com "
+ "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: "
+ "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
+ "http://twitter.com this is a test for highlighting feature"))
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: <em>http://www.facebook.com</em> "
+ "<em>http://elasticsearch.org</em> <em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
}
public void testNgramHighlighting() throws IOException {
assertAcked(prepareCreate("test")
.addMapping("test",

View File

@ -1605,33 +1605,6 @@ public class SearchQueryIT extends ESIntegTestCase {
assertHitCount(searchResponse, 2);
}
// see #3898
public void testCustomWordDelimiterQueryString() {
assertAcked(client().admin().indices().prepareCreate("test")
.setSettings("analysis.analyzer.my_analyzer.type", "custom",
"analysis.analyzer.my_analyzer.tokenizer", "whitespace",
"analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
"analysis.filter.custom_word_delimiter.type", "word_delimiter",
"analysis.filter.custom_word_delimiter.generate_word_parts", "true",
"analysis.filter.custom_word_delimiter.generate_number_parts", "false",
"analysis.filter.custom_word_delimiter.catenate_numbers", "true",
"analysis.filter.custom_word_delimiter.catenate_words", "false",
"analysis.filter.custom_word_delimiter.split_on_case_change", "false",
"analysis.filter.custom_word_delimiter.split_on_numerics", "false",
"analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
.addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer"));
client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get();
refresh();
SearchResponse response = client()
.prepareSearch("test")
.setQuery(
queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
.field("field1").field("field2")).get();
assertHitCount(response, 1L);
}
// see #3797
public void testMultiMatchLenientIssue3797() {
createIndex("test");

View File

@ -0,0 +1,23 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
esplugin {
description 'Adds "built in" analyzers to Elasticsearch.'
classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin'
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
@ -25,20 +25,26 @@ import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;
/**
* Factory for ASCIIFoldingFilter.
*/
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {
public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
public static boolean DEFAULT_PRESERVE_ORIGINAL = false;
public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
implements MultiTermAwareComponent {
public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
private final boolean preserveOriginal;
public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment,
String name, Settings settings) {
super(indexSettings, name, settings);
preserveOriginal = settings.getAsBooleanLenientForPreEs6Indices(
indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(),
DEFAULT_PRESERVE_ORIGINAL, deprecationLogger);
}
@Override

View File

@ -0,0 +1,39 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;
import java.util.HashMap;
import java.util.Map;
public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
return filters;
}
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@ -26,20 +26,22 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import java.util.List;
import java.util.Set;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE;
import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.parseTypes;
public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {
@ -47,7 +49,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
private final int flags;
private final CharArraySet protoWords;
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
super(indexSettings, name, settings);
// Sample Format for the type table:
@ -82,7 +85,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If not null is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;
}

View File

@ -17,7 +17,7 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import java.util.Collection;
import java.util.List;
@ -52,7 +54,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
private final int flags;
private final CharArraySet protoWords;
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env,
String name, Settings settings) {
super(indexSettings, name, settings);
// Sample Format for the type table:
@ -87,7 +90,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
// If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
// If not null is the set of tokens to protect from being delimited
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(),
settings, "protected_words");
this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
this.flags = flags;
}
@ -101,7 +105,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
}
public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), key, defaultValue, deprecationLogger)) {
if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(),
key, defaultValue, deprecationLogger)) {
return flag;
}
return 0;
@ -122,14 +127,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
String lhs = parseString(m.group(1).trim());
Byte rhs = parseType(m.group(2).trim());
if (lhs.length() != 1)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
throw new RuntimeException("Invalid Mapping Rule : ["
+ rule + "]. Only a single character is allowed.");
if (rhs == null)
throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
typeMap.put(lhs.charAt(0), rhs);
}
// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
byte types[] = new byte[Math.max(
typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)];
for (int i = 0; i < types.length; i++)
types[i] = WordDelimiterIterator.getType(i);
for (Map.Entry<Character, Byte> mapping : typeMap.entrySet())
@ -196,4 +203,9 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
}
return new String(out, 0, writePos);
}
@Override
public boolean breaksFastVectorHighlighter() {
return true;
}
}

View File

@ -17,12 +17,15 @@
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.MultiTermAwareComponent;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
@ -31,10 +34,12 @@ import java.io.StringReader;
public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
String source = "Ansprüche";
String[] expected = new String[]{"Anspruche"};
@ -44,11 +49,13 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
}
public void testPreserveOriginal() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
.put("index.analysis.filter.my_ascii_folding.preserve_original", true)
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_ascii_folding.type", "asciifolding")
.put("index.analysis.filter.my_ascii_folding.preserve_original", true)
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding");
String source = "Ansprüche";
String[] expected = new String[]{"Anspruche", "Ansprüche"};
@ -57,7 +64,8 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase {
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
// but the multi-term aware component still emits a single token
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent();
tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter)
.getMultiTermComponent();
tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
expected = new String[]{"Anspruche"};

View File

@ -16,13 +16,15 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
@ -30,7 +32,8 @@ import java.io.IOException;
import java.io.StringReader;
/**
* Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory}
* Base class to test {@link WordDelimiterTokenFilterFactory} and
* {@link WordDelimiterGraphTokenFilterFactory}.
*/
public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase {
final String type;
@ -40,10 +43,12 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testDefault() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi",
@ -54,44 +59,51 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testCatenateWords() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j",
"2", "se", "ONeil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateNumbers() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2",
"se", "O", "Neil"};
String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000",
"j", "2", "se", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testCatenateAll() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
.put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
@ -101,11 +113,13 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testSplitOnCaseChange() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
String[] expected = new String[]{"PowerShot"};
@ -115,30 +129,35 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke
}
public void testPreserveOriginal() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi",
"wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42",
"wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se",
"O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
public void testStemEnglishPossessive() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2",
"se", "O", "Neil", "s"};
String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi",
"4000", "j", "2", "se", "O", "Neil", "s" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
assertTokenStreamContents(tokenFilter.create(tokenizer), expected);

View File

@ -0,0 +1,36 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
public class CommonAnalysisClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
public CommonAnalysisClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
super(testCandidate);
}
@ParametersFactory
public static Iterable<Object[]> parameters() throws Exception {
return ESClientYamlSuiteTestCase.createParameters();
}
}

View File

@ -0,0 +1,86 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.AnalysisFactoryTestCase;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;
public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
@Override
protected Map<String, Class<?>> getTokenizers() {
Map<String, Class<?>> tokenizers = new HashMap<>(super.getTokenizers());
return tokenizers;
}
@Override
protected Map<String, Class<?>> getTokenFilters() {
Map<String, Class<?>> filters = new HashMap<>(super.getTokenFilters());
filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class);
filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class);
filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class);
return filters;
}
@Override
protected Map<String, Class<?>> getCharFilters() {
Map<String, Class<?>> filters = new HashMap<>(super.getCharFilters());
return filters;
}
/**
* Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but
* hasn't been marked in this class with its proper factory.
*/
public void testAllTokenizersMarked() {
markedTestCase("char filter", getTokenizers());
}
/**
* Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but
* hasn't been marked in this class with its proper factory.
*/
public void testAllCharFiltersMarked() {
markedTestCase("char filter", getCharFilters());
}
/**
* Fails if a token filter is marked in the superclass with {@link MovedToAnalysisCommon} but
* hasn't been marked in this class with its proper factory.
*/
public void testAllTokenFiltersMarked() {
markedTestCase("token filter", getTokenFilters());
}
private void markedTestCase(String name, Map<String, Class<?>> map) {
List<String> unmarked = map.entrySet().stream()
.filter(e -> e.getValue() == MovedToAnalysisCommon.class)
.map(Map.Entry::getKey)
.sorted()
.collect(toList());
assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common "
+ "but not mapped here", emptyList(), unmarked);
}
}
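For context, the factory names mapped above only resolve at runtime because the module registers them with the analysis framework. The sketch below illustrates the shape of such a registration; SketchAnalysisPlugin is a hypothetical name, not the module's real plugin class, and it assumes the factory constructors match AnalysisModule.AnalysisProvider's (IndexSettings, Environment, String, Settings) signature so the constructor references compile. It also assumes the class sits in org.elasticsearch.analysis.common so the moved factories resolve without extra imports.

package org.elasticsearch.analysis.common;

import java.util.Map;
import java.util.TreeMap;

import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

/** Sketch only: shows how a token filter registration looks, not the module's actual code. */
public class SketchAnalysisPlugin extends Plugin implements AnalysisPlugin {
    @Override
    public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        Map<String, AnalysisProvider<TokenFilterFactory>> filters = new TreeMap<>();
        // Each entry exposes a filter name to index settings, backed by its factory constructor.
        filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new);
        filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new);
        filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new);
        return filters;
    }
}

AnalysisModule merges providers like these with the built-in ones, which is why the tests above can look the filters up by name after passing new CommonAnalysisPlugin() into AnalysisTestsHelper.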

View File

@ -0,0 +1,154 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.test.ESIntegTestCase;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.startsWith;
public class HighlighterWithAnalyzersTests extends ESIntegTestCase {
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Arrays.asList(CommonAnalysisPlugin.class);
}
public void testNgramHighlightingWithBrokenPositions() throws IOException {
assertAcked(prepareCreate("test")
.addMapping("test", jsonBuilder()
.startObject()
.startObject("test")
.startObject("properties")
.startObject("name")
.field("type", "text")
.startObject("fields")
.startObject("autocomplete")
.field("type", "text")
.field("analyzer", "autocomplete")
.field("search_analyzer", "search_autocomplete")
.field("term_vector", "with_positions_offsets")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject()
.endObject())
.setSettings(Settings.builder()
.put(indexSettings())
.put("analysis.tokenizer.autocomplete.max_gram", 20)
.put("analysis.tokenizer.autocomplete.min_gram", 1)
.put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
.put("analysis.tokenizer.autocomplete.type", "nGram")
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.putArray("analysis.filter.wordDelimiter.type_table",
"& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
"? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM",
"# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM",
", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM",
"/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM",
"@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM",
"] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM",
"{ => ALPHANUM")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", false)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.autocomplete.tokenizer", "autocomplete")
.putArray("analysis.analyzer.autocomplete.filter",
"lowercase", "wordDelimiter")
.put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace")
.putArray("analysis.analyzer.search_autocomplete.filter",
"lowercase", "wordDelimiter")));
client().prepareIndex("test", "test", "1")
.setSource("name", "ARCOTEL Hotels Deutschland").get();
refresh();
SearchResponse search = client().prepareSearch("test").setTypes("test")
.setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR))
.highlighter(new HighlightBuilder().field("name.autocomplete")).get();
assertHighlight(search, 0, "name.autocomplete", 0,
equalTo("ARCO<em>TEL</em> Ho<em>tel</em>s <em>Deut</em>schland"));
}
public void testMultiPhraseCutoff() throws IOException {
/*
* MultiPhraseQuery can literally kill an entire node if there are too many terms in the
* query. We cut off and extract terms if there are more than 16 terms in the query
*/
assertAcked(prepareCreate("test")
.addMapping("test", "body", "type=text,analyzer=custom_analyzer,"
+ "search_analyzer=custom_analyzer,term_vector=with_positions_offsets")
.setSettings(
Settings.builder().put(indexSettings())
.put("analysis.filter.wordDelimiter.type", "word_delimiter")
.put("analysis.filter.wordDelimiter.type.split_on_numerics", false)
.put("analysis.filter.wordDelimiter.generate_word_parts", true)
.put("analysis.filter.wordDelimiter.generate_number_parts", true)
.put("analysis.filter.wordDelimiter.catenate_words", true)
.put("analysis.filter.wordDelimiter.catenate_numbers", true)
.put("analysis.filter.wordDelimiter.catenate_all", false)
.put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace")
.putArray("analysis.analyzer.custom_analyzer.filter",
"lowercase", "wordDelimiter"))
);
ensureGreen();
client().prepareIndex("test", "test", "1")
.setSource("body", "Test: http://www.facebook.com http://elasticsearch.org "
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this is "
+ "a test for highlighting feature Test: http://www.facebook.com "
+ "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com "
+ "http://twitter.com this is a test for highlighting feature")
.get();
refresh();
SearchResponse search = client().prepareSearch()
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "))
.highlighter(new HighlightBuilder().field("body")).get();
assertHighlight(search, 0, "body", 0, startsWith("<em>Test: http://www.facebook.com</em>"));
search = client()
.prepareSearch()
.setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com "
+ "http://elasticsearch.org http://xing.com http://cnn.com "
+ "http://quora.com http://twitter.com this is a test for highlighting "
+ "feature Test: http://www.facebook.com http://elasticsearch.org "
+ "http://xing.com http://cnn.com http://quora.com http://twitter.com this "
+ "is a test for highlighting feature"))
.highlighter(new HighlightBuilder().field("body")).execute().actionGet();
assertHighlight(search, 0, "body", 0, equalTo("<em>Test</em>: "
+ "<em>http://www.facebook.com</em> <em>http://elasticsearch.org</em> "
+ "<em>http://xing.com</em> <em>http://cnn.com</em> http://quora.com"));
}
}
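The first test above builds its autocomplete chain out of an nGram tokenizer, a lowercase filter, and a word delimiter, which is what produces the broken positions the highlighter has to survive. A minimal Lucene-level sketch of a similar chain follows; it is an illustration only, assuming NGramTokenizer, LowerCaseFilter, and WordDelimiterFilter from lucene-analyzers-common, and it omits the token_chars restriction the index settings above apply.

// Assumed imports: java.io.StringReader, org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.Tokenizer, org.apache.lucene.analysis.core.LowerCaseFilter,
// org.apache.lucene.analysis.ngram.NGramTokenizer,
// org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
Tokenizer ngrams = new NGramTokenizer(1, 20);                      // min_gram=1, max_gram=20
ngrams.setReader(new StringReader("ARCOTEL Hotels Deutschland"));
TokenStream chain = new LowerCaseFilter(ngrams);
chain = new WordDelimiterFilter(chain,
        WordDelimiterFilter.GENERATE_WORD_PARTS
        | WordDelimiterFilter.CATENATE_WORDS
        | WordDelimiterFilter.CATENATE_NUMBERS, null);
CharTermAttribute term = chain.addAttribute(CharTermAttribute.class);
chain.reset();
while (chain.incrementToken()) {
    System.out.println(term);                                      // overlapping, lowercased grams
}
chain.end();
chain.close();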

View File

@ -0,0 +1,72 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.analysis.common;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.Operator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import java.util.Arrays;
import java.util.Collection;
import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
public class QueryStringWithAnalyzersTests extends ESIntegTestCase {
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return Arrays.asList(CommonAnalysisPlugin.class);
}
/**
* Validates that we properly split fields using the word delimiter filter in query_string.
*/
public void testCustomWordDelimiterQueryString() {
assertAcked(client().admin().indices().prepareCreate("test")
.setSettings("analysis.analyzer.my_analyzer.type", "custom",
"analysis.analyzer.my_analyzer.tokenizer", "whitespace",
"analysis.analyzer.my_analyzer.filter", "custom_word_delimiter",
"analysis.filter.custom_word_delimiter.type", "word_delimiter",
"analysis.filter.custom_word_delimiter.generate_word_parts", "true",
"analysis.filter.custom_word_delimiter.generate_number_parts", "false",
"analysis.filter.custom_word_delimiter.catenate_numbers", "true",
"analysis.filter.custom_word_delimiter.catenate_words", "false",
"analysis.filter.custom_word_delimiter.split_on_case_change", "false",
"analysis.filter.custom_word_delimiter.split_on_numerics", "false",
"analysis.filter.custom_word_delimiter.stem_english_possessive", "false")
.addMapping("type1",
"field1", "type=text,analyzer=my_analyzer",
"field2", "type=text,analyzer=my_analyzer"));
client().prepareIndex("test", "type1", "1").setSource(
"field1", "foo bar baz",
"field2", "not needed").get();
refresh();
SearchResponse response = client()
.prepareSearch("test")
.setQuery(
queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND)
.field("field1").field("field2")).get();
assertHitCount(response, 1L);
}
}
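If the split behaviour ever needs debugging, a hypothetical follow-up inside an ESIntegTestCase method (not part of the test above) could run the same analyzer over the query text and log the emitted terms; the word delimiter should turn "foo.baz" into "foo" and "baz", which is why the AND query still matches the document.

// Assumed: runs inside an ESIntegTestCase method after the "test" index above exists;
// uses org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse.
AnalyzeResponse analyzed = client().admin().indices()
        .prepareAnalyze("test", "foo.baz")
        .setAnalyzer("my_analyzer")
        .get();
for (AnalyzeResponse.AnalyzeToken token : analyzed.getTokens()) {
    logger.info("term: {}", token.getTerm());   // expected: "foo", then "baz"
}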

View File

@ -16,52 +16,62 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.io.StringReader;
public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
public class WordDelimiterGraphTokenFilterFactoryTests
extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterGraphTokenFilterFactoryTests() {
super("word_delimiter_graph");
}
public void testMultiTerms() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
.put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42",
"wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se",
"ONeil", "O'Neil's", "O", "Neil" };
String[] expected = new String[] { "PowerShot", "PowerShot", "Power", "Shot", "50042",
"500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi",
"fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil" };
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1};
int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1};
int[] expectedIncr = new int[] { 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
1, 1, 1, 0, 0, 1 };
int[] expectedPosLen = new int[] { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3,
1, 1, 1, 2, 2, 1, 1 };
assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null,
expectedIncr, expectedPosLen, null);
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
/**
* Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
*/
public void testPartsAndCatenate() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
int[] expectedIncr = new int[]{1, 0, 1};

View File

@ -16,31 +16,38 @@
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
package org.elasticsearch.analysis.common;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisTestsHelper;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import java.io.IOException;
import java.io.StringReader;
public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase {
public class WordDelimiterTokenFilterFactoryTests
extends BaseWordDelimiterTokenFilterFactoryTestCase {
public WordDelimiterTokenFilterFactoryTests() {
super("word_delimiter");
}
/** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */
/**
* Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power
*/
public void testPartsAndCatenate() throws IOException {
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build());
ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
Settings.builder()
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put("index.analysis.filter.my_word_delimiter.type", type)
.put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
.put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true")
.build(),
new CommonAnalysisPlugin());
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
String source = "PowerShot";
String[] expected = new String[]{"Power", "PowerShot", "Shot" };

View File

@ -0,0 +1,11 @@
"Module loaded":
- do:
cluster.state: {}
# Get master node id
- set: { master_node: master }
- do:
nodes.info: {}
- match: { nodes.$master.modules.0.name: analysis-common }

View File

@ -0,0 +1,11 @@
## Smoke tests for analyzers included in the analysis-common module
"whitespace":
- do:
indices.analyze:
body:
text: Foo Bar!
analyzer: whitespace
- length: { tokens: 2 }
- match: { tokens.0.token: Foo }
- match: { tokens.1.token: Bar! }

View File

@ -0,0 +1,27 @@
## Smoke tests for tokenizers included in the analysis-common module
"keyword":
- do:
indices.analyze:
body:
text: Foo Bar!
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: Foo Bar! }
---
"nGram":
- do:
indices.analyze:
body:
text: good
explain: true
tokenizer:
type: nGram
min_gram: 2
max_gram: 2
- length: { detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: go }
- match: { detail.tokenizer.tokens.1.token: oo }
- match: { detail.tokenizer.tokens.2.token: od }

View File

@ -0,0 +1,82 @@
## Smoke tests for token filters included in the analysis-common module
"asciifolding":
- do:
indices.analyze:
body:
text: Musée d'Orsay
tokenizer: keyword
filter: [asciifolding]
- length: { tokens: 1 }
- match: { tokens.0.token: Musee d'Orsay }
---
"lowercase":
- do:
indices.analyze:
body:
text: Foo Bar!
tokenizer: keyword
filter: [lowercase]
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar! }
---
"word_delimiter":
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter: [word_delimiter]
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu }
- match: { tokens.2.token: "1" }
- match: { tokens.3.token: ck }
- match: { tokens.4.token: brown }
- match: { tokens.5.token: fox }
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter:
- type: word_delimiter
split_on_numerics: false
- length: { tokens: 4 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu1ck }
- match: { tokens.2.token: brown }
- match: { tokens.3.token: fox }
---
"word_delimiter_graph":
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter: [word_delimiter_graph]
- length: { tokens: 6 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu }
- match: { tokens.2.token: "1" }
- match: { tokens.3.token: ck }
- match: { tokens.4.token: brown }
- match: { tokens.5.token: fox }
- do:
indices.analyze:
body:
text: the qu1ck brown fox
tokenizer: standard
filter:
- type: word_delimiter_graph
split_on_numerics: false
- length: { tokens: 4 }
- match: { tokens.0.token: the }
- match: { tokens.1.token: qu1ck }
- match: { tokens.2.token: brown }
- match: { tokens.3.token: fox }
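For this input the two filters emit the same terms; where they differ is the token graph, which the expectedPosLen assertions in WordDelimiterGraphTokenFilterFactoryTests exercise on the Java side. A rough Lucene-level sketch of that difference, assuming WordDelimiterGraphFilter and WhitespaceTokenizer from lucene-analyzers-common:

// Assumed imports: java.io.StringReader, org.apache.lucene.analysis.TokenStream,
// org.apache.lucene.analysis.Tokenizer, org.apache.lucene.analysis.core.WhitespaceTokenizer,
// org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter,
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute,
// org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute.
Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("wi-fi-4000"));
TokenStream graph = new WordDelimiterGraphFilter(tok,
        WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
        | WordDelimiterGraphFilter.CATENATE_ALL, null);
CharTermAttribute term = graph.addAttribute(CharTermAttribute.class);
PositionLengthAttribute posLen = graph.addAttribute(PositionLengthAttribute.class);
graph.reset();
while (graph.incrementToken()) {
    // The catenated "wifi4000" carries positionLength 3, spanning "wi", "fi" and "4000".
    System.out.println(term + " posLen=" + posLen.getPositionLength());
}
graph.end();
graph.close();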

View File

@ -0,0 +1,13 @@
## Smoke tests for char filters included in the analysis-common module
"mapping":
- do:
indices.analyze:
body:
text: jeff quit phish
tokenizer: keyword
char_filter:
- type: mapping
mappings: ["ph => f", "qu => q"]
- length: { tokens: 1 }
- match: { tokens.0.token: "jeff qit fish" }

View File

@ -19,14 +19,9 @@
package org.elasticsearch.index.analysis;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.AnalysisFactoryTestCase;
import org.elasticsearch.Version;
@ -37,6 +32,10 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase {
@Override

View File

@ -1,29 +1,11 @@
# Will be performed before each test as a part of the test setup
#
setup:
- do:
ping: {}
---
"Basic test":
- do:
indices.analyze:
body:
text: Foo Bar
- length: { tokens: 2 }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
---
"Tokenizer and filter":
- do:
indices.analyze:
body:
filter: [lowercase]
text: Foo Bar
tokenizer: keyword
- length: { tokens: 1 }
- match: { tokens.0.token: foo bar }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
---
"Index and field":
@ -36,7 +18,7 @@ setup:
properties:
text:
type: text
analyzer: whitespace
analyzer: standard
- do:
indices.analyze:
@ -45,84 +27,51 @@ setup:
field: text
text: Foo Bar!
- length: { tokens: 2 }
- match: { tokens.0.token: Foo }
- match: { tokens.1.token: Bar! }
---
"JSON in Body":
- do:
indices.analyze:
body: { "text": "Foo Bar", "filter": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: foo bar }
- match: { tokens.0.token: foo }
- match: { tokens.1.token: bar }
---
"Array text":
- do:
indices.analyze:
body: { "text": ["Foo Bar", "Baz"], "filter": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 2 }
- match: { tokens.0.token: foo bar }
- match: { tokens.1.token: baz }
body:
text: ["Foo Bar", "Baz"]
tokenizer: standard
- length: { tokens: 3 }
- match: { tokens.0.token: Foo }
- match: { tokens.1.token: Bar }
- match: { tokens.2.token: Baz }
---
"Detail response with Analyzer":
- do:
indices.analyze:
body: {"text": "This is troubled", "analyzer": standard, "explain": "true"}
body:
text: This is troubled
analyzer: standard
explain: true
- length: { detail.analyzer.tokens: 3 }
- match: { detail.analyzer.name: standard }
- match: { detail.analyzer.tokens.0.token: this }
- match: { detail.analyzer.tokens.1.token: is }
- match: { detail.analyzer.tokens.2.token: troubled }
---
"Detail output spcified attribute":
- do:
indices.analyze:
body: {"text": "<text>This is troubled</text>", "char_filter": ["html_strip"], "filter": ["snowball"], "tokenizer": standard, "explain": true, "attributes": ["keyword"]}
- length: { detail.charfilters: 1 }
- length: { detail.tokenizer.tokens: 3 }
- length: { detail.tokenfilters.0.tokens: 3 }
- match: { detail.tokenizer.name: standard }
- match: { detail.tokenizer.tokens.0.token: This }
- match: { detail.tokenizer.tokens.1.token: is }
- match: { detail.tokenizer.tokens.2.token: troubled }
- match: { detail.tokenfilters.0.name: snowball }
- match: { detail.tokenfilters.0.tokens.0.token: This }
- match: { detail.tokenfilters.0.tokens.1.token: is }
- match: { detail.tokenfilters.0.tokens.2.token: troubl }
- match: { detail.tokenfilters.0.tokens.2.keyword: false }
- match: { detail.analyzer.name: standard }
- match: { detail.analyzer.tokens.0.token: this }
- match: { detail.analyzer.tokens.1.token: is }
- match: { detail.analyzer.tokens.2.token: troubled }
---
"Custom filter in request":
- do:
indices.analyze:
body: { "text": "Foo Bar Buzz", "filter": ["lowercase", { "type": "stop", "stopwords": ["foo", "buzz"]}], "tokenizer": whitespace, "explain": true }
- length: {detail.tokenizer.tokens: 3 }
- length: {detail.tokenfilters.0.tokens: 3 }
- length: {detail.tokenfilters.1.tokens: 1 }
- match: { detail.tokenizer.name: whitespace }
- match: { detail.tokenizer.tokens.0.token: Foo }
- match: { detail.tokenizer.tokens.1.token: Bar }
- match: { detail.tokenizer.tokens.2.token: Buzz }
- match: { detail.tokenfilters.0.name: lowercase }
- match: { detail.tokenfilters.0.tokens.0.token: foo }
- match: { detail.tokenfilters.0.tokens.1.token: bar }
- match: { detail.tokenfilters.0.tokens.2.token: buzz }
- match: { detail.tokenfilters.1.name: "_anonymous_tokenfilter_[1]" }
- match: { detail.tokenfilters.1.tokens.0.token: bar }
---
"Custom char_filter in request":
- do:
indices.analyze:
body: { "text": "jeff quit phish", "char_filter": [{"type": "mapping", "mappings": ["ph => f", "qu => q"]}], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: "jeff qit fish" }
---
"Custom tokenizer in request":
- do:
indices.analyze:
body: { "text": "good", "tokenizer": {"type": "nGram", "min_gram": 2, "max_gram": 2}, "explain": true }
- length: {detail.tokenizer.tokens: 3 }
- match: { detail.tokenizer.name: _anonymous_tokenizer }
- match: { detail.tokenizer.tokens.0.token: go }
- match: { detail.tokenizer.tokens.1.token: oo }
- match: { detail.tokenizer.tokens.2.token: od }
body:
text: foo bar buzz
tokenizer: standard
explain: true
filter:
- type: stop
stopwords: ["foo", "buzz"]
- length: { detail.tokenizer.tokens: 3 }
- length: { detail.tokenfilters.0.tokens: 1 }
- match: { detail.tokenizer.name: standard }
- match: { detail.tokenizer.tokens.0.token: foo }
- match: { detail.tokenizer.tokens.1.token: bar }
- match: { detail.tokenizer.tokens.2.token: buzz }
- match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" }
- match: { detail.tokenfilters.0.tokens.0.token: bar }

View File

@ -26,14 +26,15 @@ List projects = [
'test:fixtures:hdfs-fixture',
'test:logger-usage',
'modules:aggs-matrix-stats',
'modules:analysis-common',
'modules:ingest-common',
'modules:lang-expression',
'modules:lang-mustache',
'modules:lang-painless',
'modules:transport-netty4',
'modules:reindex',
'modules:percolator',
'modules:reindex',
'modules:repository-url',
'modules:transport-netty4',
'plugins:analysis-icu',
'plugins:analysis-kuromoji',
'plugins:analysis-phonetic',

View File

@ -20,14 +20,12 @@
package org.elasticsearch;
import org.apache.lucene.analysis.en.PorterStemFilterFactory;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory;
import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.elasticsearch.common.collect.MapBuilder;
import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory;
import org.elasticsearch.index.analysis.ApostropheFilterFactory;
import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
@ -92,7 +90,6 @@ import org.elasticsearch.index.analysis.TruncateTokenFilterFactory;
import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory;
import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory;
import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory;
import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory;
import org.elasticsearch.indices.analysis.PreBuiltCharFilters;
@ -110,7 +107,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Alerts us if new analyzers are added to lucene, so we don't miss them.
* Alerts us if new analysis components are added to Lucene, so we don't miss them.
* <p>
* If we don't want to expose one for a specific reason, just map it to Void.
* The deprecated ones can be mapped to Deprecated.class.
@ -178,7 +175,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
.put("apostrophe", ApostropheFilterFactory.class)
.put("arabicnormalization", ArabicNormalizationFilterFactory.class)
.put("arabicstem", ArabicStemTokenFilterFactory.class)
.put("asciifolding", ASCIIFoldingTokenFilterFactory.class)
.put("asciifolding", MovedToAnalysisCommon.class)
.put("brazilianstem", BrazilianStemTokenFilterFactory.class)
.put("bulgarianstem", StemmerTokenFilterFactory.class)
.put("cjkbigram", CJKBigramFilterFactory.class)
@ -253,8 +250,8 @@ public class AnalysisFactoryTestCase extends ESTestCase {
.put("turkishlowercase", LowerCaseTokenFilterFactory.class)
.put("type", KeepTypesFilterFactory.class)
.put("uppercase", UpperCaseTokenFilterFactory.class)
.put("worddelimiter", WordDelimiterTokenFilterFactory.class)
.put("worddelimitergraph", WordDelimiterGraphFilterFactory.class)
.put("worddelimiter", MovedToAnalysisCommon.class)
.put("worddelimitergraph", MovedToAnalysisCommon.class)
.put("flattengraph", FlattenGraphTokenFilterFactory.class)
// TODO: these tokenfilters are not yet exposed: useful?
@ -401,6 +398,7 @@ public class AnalysisFactoryTestCase extends ESTestCase {
}
}
expected.remove(Void.class);
expected.remove(MovedToAnalysisCommon.class);
expected.remove(Deprecated.class);
Collection<Class<?>> actual = new HashSet<>();
@ -489,4 +487,11 @@ public class AnalysisFactoryTestCase extends ESTestCase {
classesThatShouldNotHaveMultiTermSupport.isEmpty());
}
/**
* Marker class for components that have moved to the analysis-common module. This will be
* removed when the module is complete and these analysis components aren't available to core.
*/
protected static final class MovedToAnalysisCommon {
private MovedToAnalysisCommon() {}
}
}

View File

@ -25,17 +25,18 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import java.io.IOException;
import java.nio.file.Path;
import static java.util.Collections.emptyList;
import java.util.Arrays;
public class AnalysisTestsHelper {
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, String resource) throws IOException {
public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir,
String resource) throws IOException {
Settings settings = Settings.builder()
.loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
.put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
@ -45,12 +46,15 @@ public class AnalysisTestsHelper {
}
public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(
Settings settings) throws IOException {
Settings settings, AnalysisPlugin... plugins) throws IOException {
if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) {
settings = Settings.builder().put(settings).put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
settings = Settings.builder().put(settings)
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build();
}
IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisRegistry analysisRegistry = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry();
AnalysisRegistry analysisRegistry =
new AnalysisModule(new Environment(settings), Arrays.asList(plugins))
.getAnalysisRegistry();
return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings),
analysisRegistry.buildTokenFilterFactories(indexSettings),
analysisRegistry.buildTokenizerFactories(indexSettings),