diff --git a/buildSrc/src/main/resources/checkstyle_suppressions.xml b/buildSrc/src/main/resources/checkstyle_suppressions.xml index 8c5aa12739e..6e62b8ec346 100644 --- a/buildSrc/src/main/resources/checkstyle_suppressions.xml +++ b/buildSrc/src/main/resources/checkstyle_suppressions.xml @@ -1096,7 +1096,6 @@ - @@ -1225,8 +1224,6 @@ - - @@ -2686,11 +2683,8 @@ - - - @@ -2709,8 +2703,6 @@ - - diff --git a/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java index 9d287d90c83..1d3b8e296ec 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/EdgeNGramTokenFilterFactory.java @@ -71,4 +71,9 @@ public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory { return result; } + + @Override + public boolean breaksFastVectorHighlighter() { + return true; + } } \ No newline at end of file diff --git a/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java index 8c976646b85..c90138d7a23 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java +++ b/core/src/main/java/org/elasticsearch/index/analysis/TokenFilterFactory.java @@ -20,10 +20,20 @@ package org.elasticsearch.index.analysis; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; +import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter; public interface TokenFilterFactory { - String name(); TokenStream create(TokenStream tokenStream); + + /** + * Does this analyzer mess up the {@link OffsetAttribute}s in such as way as to break the + * {@link FastVectorHighlighter}? If this is {@code true} then the + * {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets. 
+ */ + default boolean breaksFastVectorHighlighter() { + return false; + } } diff --git a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java index 61950942e60..c494c4cae9c 100644 --- a/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java +++ b/core/src/main/java/org/elasticsearch/indices/analysis/AnalysisModule.java @@ -25,7 +25,6 @@ import org.elasticsearch.common.NamedRegistry; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.AnalyzerProvider; import org.elasticsearch.index.analysis.ApostropheFilterFactory; @@ -140,8 +139,6 @@ import org.elasticsearch.index.analysis.UniqueTokenFilterFactory; import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.WhitespaceAnalyzerProvider; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; -import org.elasticsearch.index.analysis.WordDelimiterGraphTokenFilterFactory; -import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; import org.elasticsearch.plugins.AnalysisPlugin; @@ -205,7 +202,6 @@ public final class AnalysisModule { NamedRegistry> tokenFilters = new NamedRegistry<>("token_filter"); tokenFilters.register("stop", StopTokenFilterFactory::new); tokenFilters.register("reverse", ReverseTokenFilterFactory::new); - tokenFilters.register("asciifolding", ASCIIFoldingTokenFilterFactory::new); tokenFilters.register("length", LengthTokenFilterFactory::new); tokenFilters.register("lowercase", LowerCaseTokenFilterFactory::new); tokenFilters.register("uppercase", UpperCaseTokenFilterFactory::new); @@ -225,8 +221,6 @@ public final class AnalysisModule { tokenFilters.register("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new)); tokenFilters.register("snowball", SnowballTokenFilterFactory::new); tokenFilters.register("stemmer", StemmerTokenFilterFactory::new); - tokenFilters.register("word_delimiter", WordDelimiterTokenFilterFactory::new); - tokenFilters.register("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); tokenFilters.register("delimited_payload_filter", DelimitedPayloadTokenFilterFactory::new); tokenFilters.register("elision", ElisionTokenFilterFactory::new); tokenFilters.register("flatten_graph", FlattenGraphTokenFilterFactory::new); diff --git a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java index ac0dab3a638..37971e6b480 100644 --- a/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java +++ b/core/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/FragmentBuilderHelper.java @@ -26,15 +26,9 @@ import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo; import org.apache.lucene.search.vectorhighlight.FieldFragList.WeightedFragInfo.SubInfo; import org.apache.lucene.search.vectorhighlight.FragmentsBuilder; import org.apache.lucene.util.CollectionUtil; 
-import org.apache.lucene.util.Version; import org.elasticsearch.index.analysis.CustomAnalyzer; -import org.elasticsearch.index.analysis.EdgeNGramTokenFilterFactory; -import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory; -import org.elasticsearch.index.analysis.NGramTokenFilterFactory; -import org.elasticsearch.index.analysis.NGramTokenizerFactory; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.analysis.TokenFilterFactory; -import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; import org.elasticsearch.index.mapper.FieldMapper; import java.util.Comparator; @@ -56,7 +50,7 @@ public final class FragmentBuilderHelper { public static WeightedFragInfo fixWeightedFragInfo(FieldMapper mapper, Field[] values, WeightedFragInfo fragInfo) { assert fragInfo != null : "FragInfo must not be null"; assert mapper.fieldType().name().equals(values[0].name()) : "Expected FieldMapper for field " + values[0].name(); - if (!fragInfo.getSubInfos().isEmpty() && (containsBrokenAnalysis(mapper.fieldType().indexAnalyzer()))) { + if (!fragInfo.getSubInfos().isEmpty() && containsBrokenAnalysis(mapper.fieldType().indexAnalyzer())) { /* This is a special case where broken analysis like WDF is used for term-vector creation at index-time * which can potentially mess up the offsets. To prevent a SAIIOBException we need to resort * the fragments based on their offsets rather than using soley the positions as it is done in @@ -91,8 +85,7 @@ public final class FragmentBuilderHelper { final CustomAnalyzer a = (CustomAnalyzer) analyzer; TokenFilterFactory[] tokenFilters = a.tokenFilters(); for (TokenFilterFactory tokenFilterFactory : tokenFilters) { - if (tokenFilterFactory instanceof WordDelimiterTokenFilterFactory - || tokenFilterFactory instanceof EdgeNGramTokenFilterFactory) { + if (tokenFilterFactory.breaksFastVectorHighlighter()) { return true; } } diff --git a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java index bcd7bba8d38..57a83b2c680 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/indices/TransportAnalyzeActionTests.java @@ -18,6 +18,8 @@ */ package org.elasticsearch.action.admin.indices; +import org.apache.lucene.analysis.MockTokenFilter; +import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.Version; import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest; import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; @@ -27,18 +29,28 @@ import org.elasticsearch.common.UUIDs; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; import org.elasticsearch.index.analysis.AnalysisRegistry; import org.elasticsearch.index.analysis.IndexAnalyzers; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.index.mapper.AllFieldMapper; import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import java.io.IOException; import java.util.List; +import java.util.Map; -import static 
java.util.Collections.emptyList; +import static java.util.Collections.singletonList; +import static java.util.Collections.singletonMap; +/** + * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the + * {@code common-analysis} module. + */ public class TransportAnalyzeActionTests extends ESTestCase { private IndexAnalyzers indexAnalyzers; @@ -53,23 +65,28 @@ public class TransportAnalyzeActionTests extends ESTestCase { Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()) - .put("index.analysis.filter.wordDelimiter.type", "word_delimiter") - .put("index.analysis.filter.wordDelimiter.split_on_numerics", false) - .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter") - .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter") - .put("index.analysis.tokenizer.trigram.type", "ngram") - .put("index.analysis.tokenizer.trigram.min_gram", 3) - .put("index.analysis.tokenizer.trigram.max_gram", 3) - .put("index.analysis.filter.synonym.type", "synonym") - .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay") - .put("index.analysis.filter.synonym.tokenizer", "trigram") - .put("index.analysis.filter.synonym.min_gram", 3) - .put("index.analysis.filter.synonym.max_gram", 3).build(); + .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard") + .put("index.analysis.analyzer.custom_analyzer.filter", "mock").build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); environment = new Environment(settings); - registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry(); + AnalysisPlugin plugin = new AnalysisPlugin() { + class MockFactory extends AbstractTokenFilterFactory { + MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET); + } + } + + @Override + public Map> getTokenFilters() { + return singletonMap("mock", MockFactory::new); + } + }; + registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); indexAnalyzers = registry.build(idxSettings); } @@ -143,51 +160,44 @@ public class TransportAnalyzeActionTests extends ESTestCase { } public void testWithIndexAnalyzers() throws IOException { - AnalyzeRequest request = new AnalyzeRequest(); - request.analyzer("standard"); request.text("the quick brown fox"); request.analyzer("custom_analyzer"); - request.text("the qu1ck brown fox"); AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); List tokens = analyze.getTokens(); - assertEquals(4, tokens.size()); + assertEquals(3, tokens.size()); + assertEquals("quick", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); - request.analyzer("whitespace"); - request.text("the qu1ck brown fox-dog"); + request.analyzer("standard"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); tokens = analyze.getTokens(); 
assertEquals(4, tokens.size()); - - request.analyzer("custom_analyzer"); - request.text("the qu1ck brown fox-dog"); - analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); - tokens = analyze.getTokens(); - assertEquals(5, tokens.size()); - - request.analyzer(null); - request.tokenizer("whitespace"); - request.addTokenFilter("lowercase"); - request.addTokenFilter("wordDelimiter"); - request.text("the qu1ck brown fox-dog"); - analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); - tokens = analyze.getTokens(); - assertEquals(5, tokens.size()); assertEquals("the", tokens.get(0).getTerm()); - assertEquals("qu1ck", tokens.get(1).getTerm()); + assertEquals("quick", tokens.get(1).getTerm()); assertEquals("brown", tokens.get(2).getTerm()); assertEquals("fox", tokens.get(3).getTerm()); - assertEquals("dog", tokens.get(4).getTerm()); + // Switch the analyzer out for just a tokenizer request.analyzer(null); - request.tokenizer("trigram"); - request.addTokenFilter("synonym"); - request.text("kimchy"); + request.tokenizer("standard"); analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); tokens = analyze.getTokens(); - assertEquals(2, tokens.size()); - assertEquals("sha", tokens.get(0).getTerm()); - assertEquals("hay", tokens.get(1).getTerm()); + assertEquals(4, tokens.size()); + assertEquals("the", tokens.get(0).getTerm()); + assertEquals("quick", tokens.get(1).getTerm()); + assertEquals("brown", tokens.get(2).getTerm()); + assertEquals("fox", tokens.get(3).getTerm()); + + // Now try applying our token filter + request.addTokenFilter("mock"); + analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); + tokens = analyze.getTokens(); + assertEquals(3, tokens.size()); + assertEquals("quick", tokens.get(0).getTerm()); + assertEquals("brown", tokens.get(1).getTerm()); + assertEquals("fox", tokens.get(2).getTerm()); } public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException { diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java index 6893fda75b8..0a62e8c4915 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisFactoryTests.java @@ -22,5 +22,5 @@ package org.elasticsearch.index.analysis; import org.elasticsearch.AnalysisFactoryTestCase; public class AnalysisFactoryTests extends AnalysisFactoryTestCase { - // tests are inherited + // tests are inherited and nothing needs to be defined here } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java index 12071f0eac7..0edd2fbe2c0 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java +++ b/core/src/test/java/org/elasticsearch/index/analysis/AnalysisRegistryTests.java @@ -21,6 +21,7 @@ package org.elasticsearch.index.analysis; import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import org.apache.lucene.analysis.MockTokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; @@ -31,17 +32,20 @@ import 
org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; import org.elasticsearch.indices.analysis.PreBuiltAnalyzers; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import org.elasticsearch.test.VersionUtils; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; -import static java.util.Collections.emptyList; import static java.util.Collections.emptyMap; +import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.instanceOf; @@ -112,51 +116,73 @@ public class AnalysisRegistryTests extends ESTestCase { assertThat(indexAnalyzers.getDefaultSearchQuoteAnalyzer().analyzer(), instanceOf(EnglishAnalyzer.class)); } + /** + * Tests that {@code camelCase} filter names and {@code snake_case} filter names don't collide. + */ public void testConfigureCamelCaseTokenFilter() throws IOException { Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build(); Settings indexSettings = Settings.builder() .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("index.analysis.filter.wordDelimiter.type", "word_delimiter") - .put("index.analysis.filter.wordDelimiter.split_on_numerics", false) - .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter") - .put("index.analysis.analyzer.custom_analyzer_1.tokenizer", "whitespace") - .putArray("index.analysis.analyzer.custom_analyzer_1.filter", "lowercase", "word_delimiter").build(); + .put("index.analysis.filter.testFilter.type", "mock") + .put("index.analysis.filter.test_filter.type", "mock") + .put("index.analysis.analyzer.custom_analyzer_with_camel_case.tokenizer", "standard") + .putArray("index.analysis.analyzer.custom_analyzer_with_camel_case.filter", "lowercase", "testFilter") + .put("index.analysis.analyzer.custom_analyzer_with_snake_case.tokenizer", "standard") + .putArray("index.analysis.analyzer.custom_analyzer_with_snake_case.filter", "lowercase", "test_filter").build(); IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); - IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry() + /* The snake_case version of the name should not filter out any stopwords while the + * camelCase version will filter out English stopwords. 
*/ + AnalysisPlugin plugin = new AnalysisPlugin() { + class MockFactory extends AbstractTokenFilterFactory { + MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + if (name().equals("test_filter")) { + return new MockTokenFilter(tokenStream, MockTokenFilter.EMPTY_STOPSET); + } + return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET); + } + } + + @Override + public Map> getTokenFilters() { + return singletonMap("mock", MockFactory::new); + } + }; + IndexAnalyzers indexAnalyzers = new AnalysisModule(new Environment(settings), singletonList(plugin)).getAnalysisRegistry() .build(idxSettings); - try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer")) { + + // This shouldn't contain English stopwords + try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_camel_case")) { assertNotNull(custom_analyser); - TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee"); + TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo"); tokenStream.reset(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); - List token = new ArrayList<>(); - while(tokenStream.incrementToken()) { - token.add(charTermAttribute.toString()); - } - assertEquals(token.toString(), 2, token.size()); - assertEquals("j2se", token.get(0)); - assertEquals("j2ee", token.get(1)); + assertTrue(tokenStream.incrementToken()); + assertEquals("has", charTermAttribute.toString()); + assertTrue(tokenStream.incrementToken()); + assertEquals("foo", charTermAttribute.toString()); + assertFalse(tokenStream.incrementToken()); } - try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_1")) { + // This *should* contain English stopwords + try (NamedAnalyzer custom_analyser = indexAnalyzers.get("custom_analyzer_with_snake_case")) { assertNotNull(custom_analyser); - TokenStream tokenStream = custom_analyser.tokenStream("foo", "J2SE j2ee"); + TokenStream tokenStream = custom_analyser.tokenStream("foo", "has a foo"); tokenStream.reset(); CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); - List token = new ArrayList<>(); - while(tokenStream.incrementToken()) { - token.add(charTermAttribute.toString()); - } - assertEquals(token.toString(), 6, token.size()); - assertEquals("j", token.get(0)); - assertEquals("2", token.get(1)); - assertEquals("se", token.get(2)); - assertEquals("j", token.get(3)); - assertEquals("2", token.get(4)); - assertEquals("ee", token.get(5)); + assertTrue(tokenStream.incrementToken()); + assertEquals("has", charTermAttribute.toString()); + assertTrue(tokenStream.incrementToken()); + assertEquals("a", charTermAttribute.toString()); + assertTrue(tokenStream.incrementToken()); + assertEquals("foo", charTermAttribute.toString()); + assertFalse(tokenStream.incrementToken()); } } diff --git a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java index c0c52928d20..819b2c7d644 100644 --- a/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java +++ b/core/src/test/java/org/elasticsearch/search/fetch/subphase/highlight/HighlighterSearchIT.java @@ -19,6 +19,7 @@ package org.elasticsearch.search.fetch.subphase.highlight; import 
com.carrotsearch.randomizedtesting.generators.RandomPicks; + import org.apache.lucene.search.join.ScoreMode; import org.elasticsearch.action.index.IndexRequestBuilder; import org.elasticsearch.action.search.SearchRequestBuilder; @@ -100,6 +101,7 @@ import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.startsWith; public class HighlighterSearchIT extends ESIntegTestCase { + // TODO as we move analyzers out of the core we need to move some of these into HighlighterWithAnalyzersTests private static final String[] ALL_TYPES = new String[] {"plain", "postings", "fvh", "unified"}; private static final String[] UNIFIED_AND_NULL = new String[] {null, "unified"}; @@ -113,12 +115,11 @@ public class HighlighterSearchIT extends ESIntegTestCase { mappings.startObject(); mappings.startObject("type") .startObject("properties") - .startObject("text") - .field("type", "keyword") - .field("store", true) - .endObject() - .endObject() - .endObject(); + .startObject("text") + .field("type", "keyword") + .field("store", true) + .endObject() + .endObject().endObject(); mappings.endObject(); assertAcked(prepareCreate("test") .addMapping("type", mappings)); @@ -139,14 +140,13 @@ public class HighlighterSearchIT extends ESIntegTestCase { mappings.startObject(); mappings.startObject("type") .startObject("properties") - .startObject("text") - .field("type", "text") - .field("analyzer", "keyword") - .field("index_options", "offsets") - .field("term_vector", "with_positions_offsets") - .endObject() - .endObject() - .endObject(); + .startObject("text") + .field("type", "text") + .field("analyzer", "keyword") + .field("index_options", "offsets") + .field("term_vector", "with_positions_offsets") + .endObject() + .endObject().endObject(); mappings.endObject(); assertAcked(prepareCreate("test") .addMapping("type", mappings)); @@ -166,23 +166,22 @@ public class HighlighterSearchIT extends ESIntegTestCase { mappings.startObject(); mappings.startObject("type") .startObject("_source") - .field("enabled", false) + .field("enabled", false) .endObject() .startObject("properties") - .startObject("unstored_field") - .field("index_options", "offsets") - .field("term_vector", "with_positions_offsets") - .field("type", "text") - .field("store", false) - .endObject() - .startObject("text") - .field("index_options", "offsets") - .field("term_vector", "with_positions_offsets") - .field("type", "text") - .field("store", true) - .endObject() - .endObject() - .endObject(); + .startObject("unstored_field") + .field("index_options", "offsets") + .field("term_vector", "with_positions_offsets") + .field("type", "text") + .field("store", false) + .endObject() + .startObject("text") + .field("index_options", "offsets") + .field("term_vector", "with_positions_offsets") + .field("type", "text") + .field("store", true) + .endObject() + .endObject().endObject(); mappings.endObject(); assertAcked(prepareCreate("test") .addMapping("type", mappings)); @@ -218,103 +217,6 @@ public class HighlighterSearchIT extends ESIntegTestCase { assertHighlight(search, 0, "name", 0, startsWith("abc abc abc abc")); } - public void testNgramHighlightingWithBrokenPositions() throws IOException { - assertAcked(prepareCreate("test") - .addMapping("test", jsonBuilder() - .startObject() - .startObject("test") - .startObject("properties") - .startObject("name") - .startObject("fields") - .startObject("autocomplete") - .field("type", "text") - .field("analyzer", "autocomplete") - .field("search_analyzer", "search_autocomplete") - .field("term_vector", 
"with_positions_offsets") - .endObject() - .endObject() - .field("type", "text") - .endObject() - .endObject() - .endObject() - .endObject()) - .setSettings(Settings.builder() - .put(indexSettings()) - .put("analysis.tokenizer.autocomplete.max_gram", 20) - .put("analysis.tokenizer.autocomplete.min_gram", 1) - .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit") - .put("analysis.tokenizer.autocomplete.type", "nGram") - .put("analysis.filter.wordDelimiter.type", "word_delimiter") - .putArray("analysis.filter.wordDelimiter.type_table", - "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM", - "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", "# => ALPHANUM", "% => ALPHANUM", - "+ => ALPHANUM", ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", "/ => ALPHANUM", - "^ => ALPHANUM", "$ => ALPHANUM", "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM", - "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", "{ => ALPHANUM") - - .put("analysis.filter.wordDelimiter.type.split_on_numerics", false) - .put("analysis.filter.wordDelimiter.generate_word_parts", true) - .put("analysis.filter.wordDelimiter.generate_number_parts", false) - .put("analysis.filter.wordDelimiter.catenate_words", true) - .put("analysis.filter.wordDelimiter.catenate_numbers", true) - .put("analysis.filter.wordDelimiter.catenate_all", false) - - .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete") - .putArray("analysis.analyzer.autocomplete.filter", "lowercase", "wordDelimiter") - .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace") - .putArray("analysis.analyzer.search_autocomplete.filter", "lowercase", "wordDelimiter"))); - client().prepareIndex("test", "test", "1") - .setSource("name", "ARCOTEL Hotels Deutschland").get(); - refresh(); - SearchResponse search = client().prepareSearch("test").setTypes("test") - .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR)) - .highlighter(new HighlightBuilder().field("name.autocomplete")).execute().actionGet(); - assertHighlight(search, 0, "name.autocomplete", 0, equalTo("ARCOTEL Hotels Deutschland")); - } - - public void testMultiPhraseCutoff() throws IOException { - /* - * MultiPhraseQuery can literally kill an entire node if there are too many terms in the - * query. 
We cut off and extract terms if there are more than 16 terms in the query - */ - assertAcked(prepareCreate("test") - .addMapping("test", - "body", "type=text,analyzer=custom_analyzer,search_analyzer=custom_analyzer,term_vector=with_positions_offsets") - .setSettings( - Settings.builder().put(indexSettings()) - .put("analysis.filter.wordDelimiter.type", "word_delimiter") - .put("analysis.filter.wordDelimiter.type.split_on_numerics", false) - .put("analysis.filter.wordDelimiter.generate_word_parts", true) - .put("analysis.filter.wordDelimiter.generate_number_parts", true) - .put("analysis.filter.wordDelimiter.catenate_words", true) - .put("analysis.filter.wordDelimiter.catenate_numbers", true) - .put("analysis.filter.wordDelimiter.catenate_all", false) - .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace") - .putArray("analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter")) - ); - - ensureGreen(); - client().prepareIndex("test", "test", "1") - .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com " - + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: " - + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com " - + "http://twitter.com this is a test for highlighting feature") - .get(); - refresh(); - SearchResponse search = client().prepareSearch().setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com ")) - .highlighter(new HighlightBuilder().field("body")).execute().actionGet(); - assertHighlight(search, 0, "body", 0, startsWith("Test: http://www.facebook.com")); - search = client() - .prepareSearch() - .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com http://elasticsearch.org http://xing.com " - + "http://cnn.com http://quora.com http://twitter.com this is a test for highlighting feature Test: " - + "http://www.facebook.com http://elasticsearch.org http://xing.com http://cnn.com http://quora.com " - + "http://twitter.com this is a test for highlighting feature")) - .highlighter(new HighlightBuilder().field("body")).execute().actionGet(); - assertHighlight(search, 0, "body", 0, equalTo("Test: http://www.facebook.com " - + "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com")); - } - public void testNgramHighlighting() throws IOException { assertAcked(prepareCreate("test") .addMapping("test", diff --git a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java index 05dc973f9e2..a30049c70d5 100644 --- a/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java +++ b/core/src/test/java/org/elasticsearch/search/query/SearchQueryIT.java @@ -1605,33 +1605,6 @@ public class SearchQueryIT extends ESIntegTestCase { assertHitCount(searchResponse, 2); } - // see #3898 - public void testCustomWordDelimiterQueryString() { - assertAcked(client().admin().indices().prepareCreate("test") - .setSettings("analysis.analyzer.my_analyzer.type", "custom", - "analysis.analyzer.my_analyzer.tokenizer", "whitespace", - "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter", - "analysis.filter.custom_word_delimiter.type", "word_delimiter", - "analysis.filter.custom_word_delimiter.generate_word_parts", "true", - "analysis.filter.custom_word_delimiter.generate_number_parts", "false", - "analysis.filter.custom_word_delimiter.catenate_numbers", "true", - "analysis.filter.custom_word_delimiter.catenate_words", 
"false", - "analysis.filter.custom_word_delimiter.split_on_case_change", "false", - "analysis.filter.custom_word_delimiter.split_on_numerics", "false", - "analysis.filter.custom_word_delimiter.stem_english_possessive", "false") - .addMapping("type1", "field1", "type=text,analyzer=my_analyzer", "field2", "type=text,analyzer=my_analyzer")); - - client().prepareIndex("test", "type1", "1").setSource("field1", "foo bar baz", "field2", "not needed").get(); - refresh(); - - SearchResponse response = client() - .prepareSearch("test") - .setQuery( - queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND) - .field("field1").field("field2")).get(); - assertHitCount(response, 1L); - } - // see #3797 public void testMultiMatchLenientIssue3797() { createIndex("test"); diff --git a/modules/analysis-common/build.gradle b/modules/analysis-common/build.gradle new file mode 100644 index 00000000000..391b74934c9 --- /dev/null +++ b/modules/analysis-common/build.gradle @@ -0,0 +1,23 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +esplugin { + description 'Adds "built in" analyzers to Elasticsearch.' + classname 'org.elasticsearch.analysis.common.CommonAnalysisPlugin' +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java similarity index 76% rename from core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java index 5e53a86129a..f8e0c7383a0 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; @@ -25,20 +25,26 @@ import org.elasticsearch.common.ParseField; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; +import org.elasticsearch.index.analysis.TokenFilterFactory; /** * Factory for ASCIIFoldingFilter. 
*/ -public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { - public static ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original"); - public static boolean DEFAULT_PRESERVE_ORIGINAL = false; +public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory + implements MultiTermAwareComponent { + public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original"); + public static final boolean DEFAULT_PRESERVE_ORIGINAL = false; private final boolean preserveOriginal; - public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + public ASCIIFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, + String name, Settings settings) { super(indexSettings, name, settings); preserveOriginal = settings.getAsBooleanLenientForPreEs6Indices( - indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), DEFAULT_PRESERVE_ORIGINAL, deprecationLogger); + indexSettings.getIndexVersionCreated(), PRESERVE_ORIGINAL.getPreferredName(), + DEFAULT_PRESERVE_ORIGINAL, deprecationLogger); } @Override diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java new file mode 100644 index 00000000000..bfd1bbdcc97 --- /dev/null +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java @@ -0,0 +1,39 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; + +import java.util.HashMap; +import java.util.Map; + +public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin { + @Override + public Map> getTokenFilters() { + Map> filters = new HashMap<>(); + filters.put("asciifolding", ASCIIFoldingTokenFilterFactory::new); + filters.put("word_delimiter", WordDelimiterTokenFilterFactory::new); + filters.put("word_delimiter_graph", WordDelimiterGraphTokenFilterFactory::new); + return filters; + } +} diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java similarity index 86% rename from core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java index 7cdc215f1b3..16133398531 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; @@ -26,20 +26,22 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.Analysis; import java.util.List; import java.util.Set; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS; -import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE; -import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_NUMBERS; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_WORDS; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS; +import 
static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS; +import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE; +import static org.elasticsearch.analysis.common.WordDelimiterTokenFilterFactory.parseTypes; public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory { @@ -47,7 +49,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac private final int flags; private final CharArraySet protoWords; - public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, + String name, Settings settings) { super(indexSettings, name, settings); // Sample Format for the type table: @@ -82,7 +85,8 @@ public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFac // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited - Set protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); + Set protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), + settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; } diff --git a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java similarity index 92% rename from core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java rename to modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java index 09882072ee6..8c38beb8f8b 100644 --- a/core/src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactory.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; @@ -26,6 +26,8 @@ import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; +import org.elasticsearch.index.analysis.Analysis; import java.util.Collection; import java.util.List; @@ -52,7 +54,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory private final int flags; private final CharArraySet protoWords; - public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + public WordDelimiterTokenFilterFactory(IndexSettings indexSettings, Environment env, + String name, Settings settings) { super(indexSettings, name, settings); // Sample Format for the type table: @@ -87,7 +90,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true); // If not null is the set of tokens to protect from being delimited - Set protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words"); + Set protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), + settings, "protected_words"); this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords); this.flags = flags; } @@ -101,7 +105,8 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory } public int getFlag(int flag, Settings settings, String key, boolean defaultValue) { - if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), key, defaultValue, deprecationLogger)) { + if (settings.getAsBooleanLenientForPreEs6Indices(indexSettings.getIndexVersionCreated(), + key, defaultValue, deprecationLogger)) { return flag; } return 0; @@ -122,14 +127,16 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory String lhs = parseString(m.group(1).trim()); Byte rhs = parseType(m.group(2).trim()); if (lhs.length() != 1) - throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed."); + throw new RuntimeException("Invalid Mapping Rule : [" + + rule + "]. Only a single character is allowed."); if (rhs == null) throw new RuntimeException("Invalid Mapping Rule : [" + rule + "]. 
Illegal type."); typeMap.put(lhs.charAt(0), rhs); } // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance - byte types[] = new byte[Math.max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; + byte types[] = new byte[Math.max( + typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length)]; for (int i = 0; i < types.length; i++) types[i] = WordDelimiterIterator.getType(i); for (Map.Entry mapping : typeMap.entrySet()) @@ -196,4 +203,9 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory } return new String(out, 0, writePos); } + + @Override + public boolean breaksFastVectorHighlighter() { + return true; + } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactoryTests.java similarity index 71% rename from core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactoryTests.java index 973225df180..22ac081011f 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/ASCIIFoldingTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactoryTests.java @@ -17,12 +17,15 @@ * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.MultiTermAwareComponent; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; @@ -31,10 +34,12 @@ import java.io.StringReader; public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase { public void testDefault() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_ascii_folding.type", "asciifolding") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_ascii_folding.type", "asciifolding") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding"); String source = "Ansprüche"; String[] expected = new String[]{"Anspruche"}; @@ -44,11 +49,13 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase { } public void testPreserveOriginal() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_ascii_folding.type", "asciifolding") - .put("index.analysis.filter.my_ascii_folding.preserve_original", true) - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + 
Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_ascii_folding.type", "asciifolding") + .put("index.analysis.filter.my_ascii_folding.preserve_original", true) + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_ascii_folding"); String source = "Ansprüche"; String[] expected = new String[]{"Anspruche", "Ansprüche"}; @@ -57,7 +64,8 @@ public class ASCIIFoldingTokenFilterFactoryTests extends ESTokenStreamTestCase { assertTokenStreamContents(tokenFilter.create(tokenizer), expected); // but the multi-term aware component still emits a single token - tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter).getMultiTermComponent(); + tokenFilter = (TokenFilterFactory) ((MultiTermAwareComponent) tokenFilter) + .getMultiTermComponent(); tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); expected = new String[]{"Anspruche"}; diff --git a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java similarity index 54% rename from core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java index 713e9424759..ce6d0403c07 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/BaseWordDelimiterTokenFilterFactoryTestCase.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/BaseWordDelimiterTokenFilterFactoryTestCase.java @@ -16,13 +16,15 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTokenStreamTestCase; @@ -30,7 +32,8 @@ import java.io.IOException; import java.io.StringReader; /** - * Base class to test {@link WordDelimiterTokenFilterFactory} and {@link WordDelimiterGraphTokenFilterFactory} + * Base class to test {@link WordDelimiterTokenFilterFactory} and + * {@link WordDelimiterGraphTokenFilterFactory}. 
*/ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESTokenStreamTestCase { final String type; @@ -40,10 +43,12 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke } public void testDefault() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", @@ -54,44 +59,51 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke } public void testCatenateWords() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"}; + String[] expected = new String[] { "PowerShot", "500", "42", "wifi", "wifi", "4000", "j", + "2", "se", "ONeil" }; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); } public void testCatenateNumbers() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = 
analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", - "se", "O", "Neil"}; + String[] expected = new String[] { "Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", + "j", "2", "se", "O", "Neil" }; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); } public void testCatenateAll() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") - .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") - .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false") + .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false") + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"}; @@ -101,11 +113,13 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke } public void testSplitOnCaseChange() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot"; String[] expected = new String[]{"PowerShot"}; @@ -115,30 +129,35 @@ public abstract class BaseWordDelimiterTokenFilterFactoryTestCase extends ESToke } public void testPreserveOriginal() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + 
.put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", - "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"}; + String[] expected = new String[] { "PowerShot", "Power", "Shot", "500-42", "500", "42", + "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", + "O'Neil's", "O", "Neil" }; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); } public void testStemEnglishPossessive() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", - "se", "O", "Neil", "s"}; + String[] expected = new String[] { "Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", + "4000", "j", "2", "se", "O", "Neil", "s" }; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); assertTokenStreamContents(tokenFilter.create(tokenizer), expected); diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java new file mode 100644 index 00000000000..b5d6bf23b3e --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisClientYamlTestSuiteIT.java @@ -0,0 +1,36 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.analysis.common; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate; +import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase; + +public class CommonAnalysisClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase { + public CommonAnalysisClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) { + super(testCandidate); + } + + @ParametersFactory + public static Iterable parameters() throws Exception { + return ESClientYamlSuiteTestCase.createParameters(); + } +} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java new file mode 100644 index 00000000000..886dad37b56 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/CommonAnalysisFactoryTests.java @@ -0,0 +1,86 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.AnalysisFactoryTestCase; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; +import static java.util.stream.Collectors.toList; + +public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase { + @Override + protected Map> getTokenizers() { + Map> tokenizers = new HashMap<>(super.getTokenizers()); + return tokenizers; + } + + @Override + protected Map> getTokenFilters() { + Map> filters = new HashMap<>(super.getTokenFilters()); + filters.put("asciifolding", ASCIIFoldingTokenFilterFactory.class); + filters.put("worddelimiter", WordDelimiterTokenFilterFactory.class); + filters.put("worddelimitergraph", WordDelimiterGraphTokenFilterFactory.class); + return filters; + } + + @Override + protected Map> getCharFilters() { + Map> filters = new HashMap<>(super.getCharFilters()); + return filters; + } + + /** + * Fails if a tokenizer is marked in the superclass with {@link MovedToAnalysisCommon} but + * hasn't been marked in this class with its proper factory. + */ + public void testAllTokenizersMarked() { + markedTestCase("tokenizer", getTokenizers()); + } + + /** + * Fails if a char filter is marked in the superclass with {@link MovedToAnalysisCommon} but + * hasn't been marked in this class with its proper factory. + */ + public void testAllCharFiltersMarked() { + markedTestCase("char filter", getCharFilters()); + } + + /** + * Fails if a token filter is marked in the superclass with {@link MovedToAnalysisCommon} but + * hasn't been marked in this class with its proper factory. 
+ */ + public void testAllTokenFiltersMarked() { + markedTestCase("token filter", getTokenFilters()); + } + + private void markedTestCase(String name, Map> map) { + List unmarked = map.entrySet().stream() + .filter(e -> e.getValue() == MovedToAnalysisCommon.class) + .map(Map.Entry::getKey) + .sorted() + .collect(toList()); + assertEquals(name + " marked in AnalysisFactoryTestCase as moved to analysis-common " + + "but not mapped here", emptyList(), unmarked); + } +} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java new file mode 100644 index 00000000000..c022d5c85ac --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/HighlighterWithAnalyzersTests.java @@ -0,0 +1,154 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.query.Operator; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.elasticsearch.test.ESIntegTestCase; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.index.query.QueryBuilders.matchPhraseQuery; +import static org.elasticsearch.index.query.QueryBuilders.matchQuery; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHighlight; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.startsWith; + +public class HighlighterWithAnalyzersTests extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + return Arrays.asList(CommonAnalysisPlugin.class); + } + + public void testNgramHighlightingWithBrokenPositions() throws IOException { + assertAcked(prepareCreate("test") + .addMapping("test", jsonBuilder() + .startObject() + .startObject("test") + .startObject("properties") + .startObject("name") + .field("type", "text") + .startObject("fields") + .startObject("autocomplete") + .field("type", "text") + .field("analyzer", "autocomplete") + .field("search_analyzer", "search_autocomplete") + .field("term_vector", "with_positions_offsets") + .endObject() + .endObject() + .endObject() + .endObject() + .endObject() + .endObject()) + .setSettings(Settings.builder() + .put(indexSettings()) + .put("analysis.tokenizer.autocomplete.max_gram", 20) + 
.put("analysis.tokenizer.autocomplete.min_gram", 1) + .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit") + .put("analysis.tokenizer.autocomplete.type", "nGram") + .put("analysis.filter.wordDelimiter.type", "word_delimiter") + .putArray("analysis.filter.wordDelimiter.type_table", + "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM", + "? => ALPHANUM", ". => ALPHANUM", "- => ALPHANUM", + "# => ALPHANUM", "% => ALPHANUM", "+ => ALPHANUM", + ", => ALPHANUM", "~ => ALPHANUM", ": => ALPHANUM", + "/ => ALPHANUM", "^ => ALPHANUM", "$ => ALPHANUM", + "@ => ALPHANUM", ") => ALPHANUM", "( => ALPHANUM", + "] => ALPHANUM", "[ => ALPHANUM", "} => ALPHANUM", + "{ => ALPHANUM") + .put("analysis.filter.wordDelimiter.type.split_on_numerics", false) + .put("analysis.filter.wordDelimiter.generate_word_parts", true) + .put("analysis.filter.wordDelimiter.generate_number_parts", false) + .put("analysis.filter.wordDelimiter.catenate_words", true) + .put("analysis.filter.wordDelimiter.catenate_numbers", true) + .put("analysis.filter.wordDelimiter.catenate_all", false) + + .put("analysis.analyzer.autocomplete.tokenizer", "autocomplete") + .putArray("analysis.analyzer.autocomplete.filter", + "lowercase", "wordDelimiter") + .put("analysis.analyzer.search_autocomplete.tokenizer", "whitespace") + .putArray("analysis.analyzer.search_autocomplete.filter", + "lowercase", "wordDelimiter"))); + client().prepareIndex("test", "test", "1") + .setSource("name", "ARCOTEL Hotels Deutschland").get(); + refresh(); + SearchResponse search = client().prepareSearch("test").setTypes("test") + .setQuery(matchQuery("name.autocomplete", "deut tel").operator(Operator.OR)) + .highlighter(new HighlightBuilder().field("name.autocomplete")).get(); + assertHighlight(search, 0, "name.autocomplete", 0, + equalTo("ARCOTEL Hotels Deutschland")); + } + + public void testMultiPhraseCutoff() throws IOException { + /* + * MultiPhraseQuery can literally kill an entire node if there are too many terms in the + * query. 
We cut off and extract terms if there are more than 16 terms in the query + */ + assertAcked(prepareCreate("test") + .addMapping("test", "body", "type=text,analyzer=custom_analyzer," + + "search_analyzer=custom_analyzer,term_vector=with_positions_offsets") + .setSettings( + Settings.builder().put(indexSettings()) + .put("analysis.filter.wordDelimiter.type", "word_delimiter") + .put("analysis.filter.wordDelimiter.type.split_on_numerics", false) + .put("analysis.filter.wordDelimiter.generate_word_parts", true) + .put("analysis.filter.wordDelimiter.generate_number_parts", true) + .put("analysis.filter.wordDelimiter.catenate_words", true) + .put("analysis.filter.wordDelimiter.catenate_numbers", true) + .put("analysis.filter.wordDelimiter.catenate_all", false) + .put("analysis.analyzer.custom_analyzer.tokenizer", "whitespace") + .putArray("analysis.analyzer.custom_analyzer.filter", + "lowercase", "wordDelimiter")) + ); + + ensureGreen(); + client().prepareIndex("test", "test", "1") + .setSource("body", "Test: http://www.facebook.com http://elasticsearch.org " + + "http://xing.com http://cnn.com http://quora.com http://twitter.com this is " + + "a test for highlighting feature Test: http://www.facebook.com " + + "http://elasticsearch.org http://xing.com http://cnn.com http://quora.com " + + "http://twitter.com this is a test for highlighting feature") + .get(); + refresh(); + SearchResponse search = client().prepareSearch() + .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com ")) + .highlighter(new HighlightBuilder().field("body")).get(); + assertHighlight(search, 0, "body", 0, startsWith("Test: http://www.facebook.com")); + search = client() + .prepareSearch() + .setQuery(matchPhraseQuery("body", "Test: http://www.facebook.com " + + "http://elasticsearch.org http://xing.com http://cnn.com " + + "http://quora.com http://twitter.com this is a test for highlighting " + + "feature Test: http://www.facebook.com http://elasticsearch.org " + + "http://xing.com http://cnn.com http://quora.com http://twitter.com this " + + "is a test for highlighting feature")) + .highlighter(new HighlightBuilder().field("body")).execute().actionGet(); + assertHighlight(search, 0, "body", 0, equalTo("Test: " + + "http://www.facebook.com http://elasticsearch.org " + + "http://xing.com http://cnn.com http://quora.com")); + } +} diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java new file mode 100644 index 00000000000..7dd53a04494 --- /dev/null +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/QueryStringWithAnalyzersTests.java @@ -0,0 +1,72 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.analysis.common; + +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.index.query.Operator; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESIntegTestCase; + +import java.util.Arrays; +import java.util.Collection; + +import static org.elasticsearch.index.query.QueryBuilders.queryStringQuery; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; + +public class QueryStringWithAnalyzersTests extends ESIntegTestCase { + @Override + protected Collection> nodePlugins() { + return Arrays.asList(CommonAnalysisPlugin.class); + } + + /** + * Validates that we properly split fields using the word delimiter filter in query_string. + */ + public void testCustomWordDelimiterQueryString() { + assertAcked(client().admin().indices().prepareCreate("test") + .setSettings("analysis.analyzer.my_analyzer.type", "custom", + "analysis.analyzer.my_analyzer.tokenizer", "whitespace", + "analysis.analyzer.my_analyzer.filter", "custom_word_delimiter", + "analysis.filter.custom_word_delimiter.type", "word_delimiter", + "analysis.filter.custom_word_delimiter.generate_word_parts", "true", + "analysis.filter.custom_word_delimiter.generate_number_parts", "false", + "analysis.filter.custom_word_delimiter.catenate_numbers", "true", + "analysis.filter.custom_word_delimiter.catenate_words", "false", + "analysis.filter.custom_word_delimiter.split_on_case_change", "false", + "analysis.filter.custom_word_delimiter.split_on_numerics", "false", + "analysis.filter.custom_word_delimiter.stem_english_possessive", "false") + .addMapping("type1", + "field1", "type=text,analyzer=my_analyzer", + "field2", "type=text,analyzer=my_analyzer")); + + client().prepareIndex("test", "type1", "1").setSource( + "field1", "foo bar baz", + "field2", "not needed").get(); + refresh(); + + SearchResponse response = client() + .prepareSearch("test") + .setQuery( + queryStringQuery("foo.baz").useDisMax(false).defaultOperator(Operator.AND) + .field("field1").field("field2")).get(); + assertHitCount(response, 1L); + } +} diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java similarity index 55% rename from core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java index 2ae4267104a..bd7ff2f0c01 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterGraphTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterGraphTokenFilterFactoryTests.java @@ -16,52 +16,62 @@ * specific language governing permissions and limitations * under the License. 
*/ -package org.elasticsearch.index.analysis; - +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import java.io.IOException; import java.io.StringReader; -public class WordDelimiterGraphTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { +public class WordDelimiterGraphTokenFilterFactoryTests + extends BaseWordDelimiterTokenFilterFactoryTestCase { public WordDelimiterGraphTokenFilterFactoryTests() { super("word_delimiter_graph"); } public void testMultiTerms() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") - .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_all", "true") + .put("index.analysis.filter.my_word_delimiter.preserve_original", "true") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's"; - String[] expected = new String[]{"PowerShot", "PowerShot", "Power", "Shot", "50042", "500-42", "500", "42", - "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j2se", "j", "2", "se", - "ONeil", "O'Neil's", "O", "Neil" }; + String[] expected = new String[] { "PowerShot", "PowerShot", "Power", "Shot", "50042", + "500-42", "500", "42", "wifi", "wi-fi", "wi", "fi", "wifi4000", "wi-fi-4000", "wi", + "fi", "4000", "j2se", "j2se", "j", "2", "se", "ONeil", "O'Neil's", "O", "Neil" }; Tokenizer tokenizer = new WhitespaceTokenizer(); tokenizer.setReader(new StringReader(source)); - int[] expectedIncr = new int[]{1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}; - int[] expectedPosLen = new int[]{2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1}; + int[] expectedIncr = new int[] { 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, + 1, 1, 1, 0, 0, 1 }; + int[] expectedPosLen = new int[] { 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 3, 3, 1, 1, 1, 3, 3, + 1, 1, 1, 2, 2, 1, 1 }; assertTokenStreamContents(tokenFilter.create(tokenizer), expected, null, null, null, expectedIncr, expectedPosLen, null); } - /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + /** + * Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power + */ public void testPartsAndCatenate() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - 
.put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build(), + new CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot"; int[] expectedIncr = new int[]{1, 0, 1}; diff --git a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactoryTests.java similarity index 65% rename from core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java rename to modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactoryTests.java index 1e919e00bbb..78c4f1485aa 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/WordDelimiterTokenFilterFactoryTests.java @@ -16,31 +16,38 @@ * specific language governing permissions and limitations * under the License. */ -package org.elasticsearch.index.analysis; +package org.elasticsearch.analysis.common; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.WhitespaceTokenizer; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; +import org.elasticsearch.index.analysis.AnalysisTestsHelper; +import org.elasticsearch.index.analysis.TokenFilterFactory; import org.elasticsearch.test.ESTestCase; import java.io.IOException; import java.io.StringReader; -public class WordDelimiterTokenFilterFactoryTests extends BaseWordDelimiterTokenFilterFactoryTestCase { +public class WordDelimiterTokenFilterFactoryTests + extends BaseWordDelimiterTokenFilterFactoryTestCase { public WordDelimiterTokenFilterFactoryTests() { super("word_delimiter"); } - /** Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power */ + /** + * Correct offset order when doing both parts and concatenation: PowerShot is a synonym of Power + */ public void testPartsAndCatenate() throws IOException { - ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(Settings.builder() - .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) - .put("index.analysis.filter.my_word_delimiter.type", type) - .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") - .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") - .build()); + ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings( + Settings.builder() + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .put("index.analysis.filter.my_word_delimiter.type", type) + .put("index.analysis.filter.my_word_delimiter.catenate_words", "true") + .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "true") + .build(), + new 
CommonAnalysisPlugin()); TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter"); String source = "PowerShot"; String[] expected = new String[]{"Power", "PowerShot", "Shot" }; diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml new file mode 100644 index 00000000000..d27a0861b2e --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/10_basic.yaml @@ -0,0 +1,11 @@ +"Module loaded": + - do: + cluster.state: {} + + # Get master node id + - set: { master_node: master } + + - do: + nodes.info: {} + + - match: { nodes.$master.modules.0.name: analysis-common } diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml new file mode 100644 index 00000000000..9fb34e7a821 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/20_analyzers.yaml @@ -0,0 +1,11 @@ +## Smoke tests for analyzers included in the analysis-common module + +"whitespace": + - do: + indices.analyze: + body: + text: Foo Bar! + analyzer: whitespace + - length: { tokens: 2 } + - match: { tokens.0.token: Foo } + - match: { tokens.1.token: Bar! } diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml new file mode 100644 index 00000000000..174a15f772b --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yaml @@ -0,0 +1,27 @@ +## Smoke tests for tokenizers included in the analysis-common module + +"keyword": + - do: + indices.analyze: + body: + text: Foo Bar! + tokenizer: keyword + - length: { tokens: 1 } + - match: { tokens.0.token: Foo Bar! } + +--- +"nGram": + - do: + indices.analyze: + body: + text: good + explain: true + tokenizer: + type: nGram + min_gram: 2 + max_gram: 2 + - length: { detail.tokenizer.tokens: 3 } + - match: { detail.tokenizer.name: _anonymous_tokenizer } + - match: { detail.tokenizer.tokens.0.token: go } + - match: { detail.tokenizer.tokens.1.token: oo } + - match: { detail.tokenizer.tokens.2.token: od } diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml new file mode 100644 index 00000000000..ac5bcb82e57 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/40_token_filters.yaml @@ -0,0 +1,82 @@ +## Smoke tests for token filters included in the analysis-common module + +"asciifolding": + - do: + indices.analyze: + body: + text: Musée d'Orsay + tokenizer: keyword + filter: [asciifolding] + - length: { tokens: 1 } + - match: { tokens.0.token: Musee d'Orsay } + +--- +"lowercase": + - do: + indices.analyze: + body: + text: Foo Bar! + tokenizer: keyword + filter: [lowercase] + - length: { tokens: 1 } + - match: { tokens.0.token: foo bar! 
} + +--- +"word_delimiter": + - do: + indices.analyze: + body: + text: the qu1ck brown fox + tokenizer: standard + filter: [word_delimiter] + - length: { tokens: 6 } + - match: { tokens.0.token: the } + - match: { tokens.1.token: qu } + - match: { tokens.2.token: "1" } + - match: { tokens.3.token: ck } + - match: { tokens.4.token: brown } + - match: { tokens.5.token: fox } + + - do: + indices.analyze: + body: + text: the qu1ck brown fox + tokenizer: standard + filter: + - type: word_delimiter + split_on_numerics: false + - length: { tokens: 4 } + - match: { tokens.0.token: the } + - match: { tokens.1.token: qu1ck } + - match: { tokens.2.token: brown } + - match: { tokens.3.token: fox } + +--- +"word_delimiter_graph": + - do: + indices.analyze: + body: + text: the qu1ck brown fox + tokenizer: standard + filter: [word_delimiter_graph] + - length: { tokens: 6 } + - match: { tokens.0.token: the } + - match: { tokens.1.token: qu } + - match: { tokens.2.token: "1" } + - match: { tokens.3.token: ck } + - match: { tokens.4.token: brown } + - match: { tokens.5.token: fox } + + - do: + indices.analyze: + body: + text: the qu1ck brown fox + tokenizer: standard + filter: + - type: word_delimiter_graph + split_on_numerics: false + - length: { tokens: 4 } + - match: { tokens.0.token: the } + - match: { tokens.1.token: qu1ck } + - match: { tokens.2.token: brown } + - match: { tokens.3.token: fox } diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml new file mode 100644 index 00000000000..06775a2a722 --- /dev/null +++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/50_char_filters.yaml @@ -0,0 +1,13 @@ +## Smoke tests for analyzers included in the analysis-common module + +"mapping": + - do: + indices.analyze: + body: + text: jeff quit phish + tokenizer: keyword + char_filter: + - type: mapping + mappings: ["ph => f", "qu => q"] + - length: { tokens: 1 } + - match: { tokens.0.token: "jeff qit fish" } diff --git a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java index e68cb260b0b..83015296276 100644 --- a/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java +++ b/plugins/analysis-stempel/src/test/java/org/elasticsearch/index/analysis/AnalysisPolishFactoryTests.java @@ -19,14 +19,9 @@ package org.elasticsearch.index.analysis; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.Tokenizer; import org.elasticsearch.AnalysisFactoryTestCase; import org.elasticsearch.Version; @@ -37,6 +32,10 @@ import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.pl.PolishStemTokenFilterFactory; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + public class AnalysisPolishFactoryTests extends AnalysisFactoryTestCase { @Override diff --git a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml 
b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml index 268cd781289..93ce5c8c807 100644 --- a/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml +++ b/rest-api-spec/src/main/resources/rest-api-spec/test/indices.analyze/10_analyze.yaml @@ -1,29 +1,11 @@ -# Will be performed before each test as a part of the test setup -# -setup: - - do: - ping: {} - ---- "Basic test": - do: indices.analyze: body: text: Foo Bar - length: { tokens: 2 } - - match: { tokens.0.token: foo } - - match: { tokens.1.token: bar } - ---- -"Tokenizer and filter": - - do: - indices.analyze: - body: - filter: [lowercase] - text: Foo Bar - tokenizer: keyword - - length: { tokens: 1 } - - match: { tokens.0.token: foo bar } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } --- "Index and field": @@ -36,7 +18,7 @@ setup: properties: text: type: text - analyzer: whitespace + analyzer: standard - do: indices.analyze: @@ -45,84 +27,51 @@ setup: field: text text: Foo Bar! - length: { tokens: 2 } - - match: { tokens.0.token: Foo } - - match: { tokens.1.token: Bar! } ---- -"JSON in Body": - - do: - indices.analyze: - body: { "text": "Foo Bar", "filter": ["lowercase"], "tokenizer": keyword } - - length: {tokens: 1 } - - match: { tokens.0.token: foo bar } + - match: { tokens.0.token: foo } + - match: { tokens.1.token: bar } + --- "Array text": - do: indices.analyze: - body: { "text": ["Foo Bar", "Baz"], "filter": ["lowercase"], "tokenizer": keyword } - - length: {tokens: 2 } - - match: { tokens.0.token: foo bar } - - match: { tokens.1.token: baz } + body: + text: ["Foo Bar", "Baz"] + tokenizer: standard + - length: { tokens: 3 } + - match: { tokens.0.token: Foo } + - match: { tokens.1.token: Bar } + - match: { tokens.2.token: Baz } + --- "Detail response with Analyzer": - do: indices.analyze: - body: {"text": "This is troubled", "analyzer": standard, "explain": "true"} + body: + text: This is troubled + analyzer: standard + explain: true - length: { detail.analyzer.tokens: 3 } - - match: { detail.analyzer.name: standard } - - match: { detail.analyzer.tokens.0.token: this } - - match: { detail.analyzer.tokens.1.token: is } - - match: { detail.analyzer.tokens.2.token: troubled } ---- -"Detail output spcified attribute": - - do: - indices.analyze: - body: {"text": "This is troubled", "char_filter": ["html_strip"], "filter": ["snowball"], "tokenizer": standard, "explain": true, "attributes": ["keyword"]} - - length: { detail.charfilters: 1 } - - length: { detail.tokenizer.tokens: 3 } - - length: { detail.tokenfilters.0.tokens: 3 } - - match: { detail.tokenizer.name: standard } - - match: { detail.tokenizer.tokens.0.token: This } - - match: { detail.tokenizer.tokens.1.token: is } - - match: { detail.tokenizer.tokens.2.token: troubled } - - match: { detail.tokenfilters.0.name: snowball } - - match: { detail.tokenfilters.0.tokens.0.token: This } - - match: { detail.tokenfilters.0.tokens.1.token: is } - - match: { detail.tokenfilters.0.tokens.2.token: troubl } - - match: { detail.tokenfilters.0.tokens.2.keyword: false } + - match: { detail.analyzer.name: standard } + - match: { detail.analyzer.tokens.0.token: this } + - match: { detail.analyzer.tokens.1.token: is } + - match: { detail.analyzer.tokens.2.token: troubled } --- "Custom filter in request": - do: indices.analyze: - body: { "text": "Foo Bar Buzz", "filter": ["lowercase", { "type": "stop", "stopwords": ["foo", "buzz"]}], "tokenizer": whitespace, "explain": true } - - length: 
{detail.tokenizer.tokens: 3 } - - length: {detail.tokenfilters.0.tokens: 3 } - - length: {detail.tokenfilters.1.tokens: 1 } - - match: { detail.tokenizer.name: whitespace } - - match: { detail.tokenizer.tokens.0.token: Foo } - - match: { detail.tokenizer.tokens.1.token: Bar } - - match: { detail.tokenizer.tokens.2.token: Buzz } - - match: { detail.tokenfilters.0.name: lowercase } - - match: { detail.tokenfilters.0.tokens.0.token: foo } - - match: { detail.tokenfilters.0.tokens.1.token: bar } - - match: { detail.tokenfilters.0.tokens.2.token: buzz } - - match: { detail.tokenfilters.1.name: "_anonymous_tokenfilter_[1]" } - - match: { detail.tokenfilters.1.tokens.0.token: bar } ---- -"Custom char_filter in request": - - do: - indices.analyze: - body: { "text": "jeff quit phish", "char_filter": [{"type": "mapping", "mappings": ["ph => f", "qu => q"]}], "tokenizer": keyword } - - length: {tokens: 1 } - - match: { tokens.0.token: "jeff qit fish" } - ---- -"Custom tokenizer in request": - - do: - indices.analyze: - body: { "text": "good", "tokenizer": {"type": "nGram", "min_gram": 2, "max_gram": 2}, "explain": true } - - length: {detail.tokenizer.tokens: 3 } - - match: { detail.tokenizer.name: _anonymous_tokenizer } - - match: { detail.tokenizer.tokens.0.token: go } - - match: { detail.tokenizer.tokens.1.token: oo } - - match: { detail.tokenizer.tokens.2.token: od } + body: + text: foo bar buzz + tokenizer: standard + explain: true + filter: + - type: stop + stopwords: ["foo", "buzz"] + - length: { detail.tokenizer.tokens: 3 } + - length: { detail.tokenfilters.0.tokens: 1 } + - match: { detail.tokenizer.name: standard } + - match: { detail.tokenizer.tokens.0.token: foo } + - match: { detail.tokenizer.tokens.1.token: bar } + - match: { detail.tokenizer.tokens.2.token: buzz } + - match: { detail.tokenfilters.0.name: "_anonymous_tokenfilter_[0]" } + - match: { detail.tokenfilters.0.tokens.0.token: bar } diff --git a/settings.gradle b/settings.gradle index 8e6d3d80a0e..36f9c23e7c5 100644 --- a/settings.gradle +++ b/settings.gradle @@ -26,14 +26,15 @@ List projects = [ 'test:fixtures:hdfs-fixture', 'test:logger-usage', 'modules:aggs-matrix-stats', + 'modules:analysis-common', 'modules:ingest-common', 'modules:lang-expression', 'modules:lang-mustache', 'modules:lang-painless', - 'modules:transport-netty4', - 'modules:reindex', 'modules:percolator', + 'modules:reindex', 'modules:repository-url', + 'modules:transport-netty4', 'plugins:analysis-icu', 'plugins:analysis-kuromoji', 'plugins:analysis-phonetic', diff --git a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java b/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java index 83f955296b7..7f60058788a 100644 --- a/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/AnalysisFactoryTestCase.java @@ -20,14 +20,12 @@ package org.elasticsearch; import org.apache.lucene.analysis.en.PorterStemFilterFactory; -import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilterFactory; import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory; import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory; import org.apache.lucene.analysis.util.CharFilterFactory; import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.analysis.util.TokenizerFactory; import org.elasticsearch.common.collect.MapBuilder; -import org.elasticsearch.index.analysis.ASCIIFoldingTokenFilterFactory; import 
org.elasticsearch.index.analysis.ApostropheFilterFactory; import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory; import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory; @@ -92,7 +90,6 @@ import org.elasticsearch.index.analysis.TruncateTokenFilterFactory; import org.elasticsearch.index.analysis.UAX29URLEmailTokenizerFactory; import org.elasticsearch.index.analysis.UpperCaseTokenFilterFactory; import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; -import org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory; import org.elasticsearch.index.analysis.compound.DictionaryCompoundWordTokenFilterFactory; import org.elasticsearch.index.analysis.compound.HyphenationCompoundWordTokenFilterFactory; import org.elasticsearch.indices.analysis.PreBuiltCharFilters; @@ -110,7 +107,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * Alerts us if new analyzers are added to lucene, so we don't miss them. + * Alerts us if new analysis components are added to Lucene, so we don't miss them. *
<p>
* If we don't want to expose one for a specific reason, just map it to Void. * The deprecated ones can be mapped to Deprecated.class. @@ -178,7 +175,7 @@ public class AnalysisFactoryTestCase extends ESTestCase { .put("apostrophe", ApostropheFilterFactory.class) .put("arabicnormalization", ArabicNormalizationFilterFactory.class) .put("arabicstem", ArabicStemTokenFilterFactory.class) - .put("asciifolding", ASCIIFoldingTokenFilterFactory.class) + .put("asciifolding", MovedToAnalysisCommon.class) .put("brazilianstem", BrazilianStemTokenFilterFactory.class) .put("bulgarianstem", StemmerTokenFilterFactory.class) .put("cjkbigram", CJKBigramFilterFactory.class) @@ -253,8 +250,8 @@ public class AnalysisFactoryTestCase extends ESTestCase { .put("turkishlowercase", LowerCaseTokenFilterFactory.class) .put("type", KeepTypesFilterFactory.class) .put("uppercase", UpperCaseTokenFilterFactory.class) - .put("worddelimiter", WordDelimiterTokenFilterFactory.class) - .put("worddelimitergraph", WordDelimiterGraphFilterFactory.class) + .put("worddelimiter", MovedToAnalysisCommon.class) + .put("worddelimitergraph", MovedToAnalysisCommon.class) .put("flattengraph", FlattenGraphTokenFilterFactory.class) // TODO: these tokenfilters are not yet exposed: useful? @@ -401,6 +398,7 @@ public class AnalysisFactoryTestCase extends ESTestCase { } } expected.remove(Void.class); + expected.remove(MovedToAnalysisCommon.class); expected.remove(Deprecated.class); Collection> actual = new HashSet<>(); @@ -489,4 +487,11 @@ public class AnalysisFactoryTestCase extends ESTestCase { classesThatShouldNotHaveMultiTermSupport.isEmpty()); } + /** + * Marker class for components that have moved to the analysis-common modules. This will be + * removed when the module is complete and these analysis components aren't available to core. 
+ */ + protected static final class MovedToAnalysisCommon { + private MovedToAnalysisCommon() {} + } } diff --git a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java b/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java similarity index 80% rename from core/src/test/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java rename to test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java index a60c21c1a7e..d75a894d073 100644 --- a/core/src/test/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java +++ b/test/framework/src/main/java/org/elasticsearch/index/analysis/AnalysisTestsHelper.java @@ -25,17 +25,18 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.indices.analysis.AnalysisModule; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.IndexSettingsModule; import java.io.IOException; import java.nio.file.Path; - -import static java.util.Collections.emptyList; +import java.util.Arrays; public class AnalysisTestsHelper { - public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, String resource) throws IOException { + public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(Path baseDir, + String resource) throws IOException { Settings settings = Settings.builder() .loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource)) .put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString()) @@ -45,12 +46,15 @@ public class AnalysisTestsHelper { } public static ESTestCase.TestAnalysis createTestAnalysisFromSettings( - Settings settings) throws IOException { + Settings settings, AnalysisPlugin... plugins) throws IOException { if (settings.get(IndexMetaData.SETTING_VERSION_CREATED) == null) { - settings = Settings.builder().put(settings).put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); + settings = Settings.builder().put(settings) + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT).build(); } IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings); - AnalysisRegistry analysisRegistry = new AnalysisModule(new Environment(settings), emptyList()).getAnalysisRegistry(); + AnalysisRegistry analysisRegistry = + new AnalysisModule(new Environment(settings), Arrays.asList(plugins)) + .getAnalysisRegistry(); return new ESTestCase.TestAnalysis(analysisRegistry.build(indexSettings), analysisRegistry.buildTokenFilterFactories(indexSettings), analysisRegistry.buildTokenizerFactories(indexSettings),