diff --git a/docs/reference/analysis/tokenizers.asciidoc b/docs/reference/analysis/tokenizers.asciidoc
index f1e0899d7ab..add0abdec01 100644
--- a/docs/reference/analysis/tokenizers.asciidoc
+++ b/docs/reference/analysis/tokenizers.asciidoc
@@ -99,14 +99,14 @@ terms.
 
 <>::
 
-The `simplepattern` tokenizer uses a regular expression to capture matching
+The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. It uses a restricted subset of regular expression features
 and is generally faster than the `pattern` tokenizer.
 
 <>::
 
-The `simplepatternsplit` tokenizer uses the same restricted regular expression
-subset as the `simplepattern` tokenizer, but splits the input at matches rather
+The `simple_pattern_split` tokenizer uses the same restricted regular expression
+subset as the `simple_pattern` tokenizer, but splits the input at matches rather
 than returning the matches as terms.
 
 <>::
diff --git a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
index bee92c75d26..3f235fa6358 100644
--- a/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepattern-tokenizer.asciidoc
@@ -3,7 +3,7 @@
 
 experimental[]
 
-The `simplepattern` tokenizer uses a regular expression to capture matching
+The `simple_pattern` tokenizer uses a regular expression to capture matching
 text as terms. The set of regular expression features it supports is more
 limited than the <> tokenizer, but the
 tokenization is generally faster.
@@ -11,7 +11,7 @@ tokenization is generally faster.
 This tokenizer does not support splitting the input on a pattern match, unlike
 the <> tokenizer. To split on pattern matches using the same restricted
 regular expression subset, see the
-<> tokenizer.
+<> tokenizer.
 
 This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
 For an explanation of the supported features and syntax, see <>.
@@ -22,7 +22,7 @@ tokenizer should always be configured with a non-default pattern.
 [float]
 === Configuration
 
-The `simplepattern` tokenizer accepts the following parameters:
+The `simple_pattern` tokenizer accepts the following parameters:
 
 [horizontal]
 `pattern`::
@@ -31,7 +31,7 @@ The `simplepattern` tokenizer accepts the following parameters:
 [float]
 === Example configuration
 
-This example configures the `simplepattern` tokenizer to produce terms that are
+This example configures the `simple_pattern` tokenizer to produce terms that are
 three-digit numbers
 
 [source,js]
@@ -47,7 +47,7 @@ PUT my_index
       },
       "tokenizer": {
         "my_tokenizer": {
-          "type": "simplepattern",
+          "type": "simple_pattern",
           "pattern": "[0123456789]{3}"
         }
       }
diff --git a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
index c009f8cb7a4..59b77936cb9 100644
--- a/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
+++ b/docs/reference/analysis/tokenizers/simplepatternsplit-tokenizer.asciidoc
@@ -3,14 +3,14 @@
 
 experimental[]
 
-The `simplepatternsplit` tokenizer uses a regular expression to split the
+The `simple_pattern_split` tokenizer uses a regular expression to split the
 input into terms at pattern matches. The set of regular expression features it
 supports is more limited than the <> tokenizer, but the
 tokenization is generally faster.
 
 This tokenizer does not produce terms from the matches themselves. To produce
 terms from matches using patterns in the same restricted regular expression
-subset, see the <>
+subset, see the <>
 tokenizer.
 
 This tokenizer uses {lucene-core-javadoc}/org/apache/lucene/util/automaton/RegExp.html[Lucene regular expressions].
@@ -23,7 +23,7 @@ pattern.
 [float]
 === Configuration
 
-The `simplepatternsplit` tokenizer accepts the following parameters:
+The `simple_pattern_split` tokenizer accepts the following parameters:
 
 [horizontal]
 `pattern`::
@@ -32,7 +32,7 @@ The `simplepatternsplit` tokenizer accepts the following parameters:
 [float]
 === Example configuration
 
-This example configures the `simplepatternsplit` tokenizer to split the input
+This example configures the `simple_pattern_split` tokenizer to split the input
 text on underscores.
 
 [source,js]
@@ -48,7 +48,7 @@ PUT my_index
       },
       "tokenizer": {
         "my_tokenizer": {
-          "type": "simplepatternsplit",
+          "type": "simple_pattern_split",
           "pattern": "_"
         }
       }
diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
index 39fdf54bebe..0299e37affc 100644
--- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
+++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java
@@ -122,8 +122,8 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
     @Override
     public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
         Map<String, AnalysisProvider<TokenizerFactory>> tokenizers = new TreeMap<>();
-        tokenizers.put("simplepattern", SimplePatternTokenizerFactory::new);
-        tokenizers.put("simplepatternsplit", SimplePatternSplitTokenizerFactory::new);
+        tokenizers.put("simple_pattern", SimplePatternTokenizerFactory::new);
+        tokenizers.put("simple_pattern_split", SimplePatternSplitTokenizerFactory::new);
         return tokenizers;
     }
 
diff --git a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
index 7063437ad46..c0945e047c5 100644
--- a/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
+++ b/modules/analysis-common/src/test/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml
@@ -27,14 +27,14 @@
     - match: { detail.tokenizer.tokens.2.token: od }
 
 ---
-"simplepattern":
+"simple_pattern":
     - do:
         indices.analyze:
           body:
             text: "a6bf fooo ff61"
             explain: true
             tokenizer:
-              type: simplepattern
+              type: simple_pattern
               pattern: "[abcdef0123456789]{4}"
     - length: { detail.tokenizer.tokens: 2 }
     - match: { detail.tokenizer.name: _anonymous_tokenizer }
@@ -42,14 +42,14 @@
     - match: { detail.tokenizer.tokens.1.token: ff61 }
 
 ---
-"simplepatternsplit":
+"simple_pattern_split":
     - do:
         indices.analyze:
           body:
             text: "foo==bar"
             explain: true
             tokenizer:
-              type: simplepatternsplit
+              type: simple_pattern_split
               pattern: ==
     - length: { detail.tokenizer.tokens: 2 }
     - match: { detail.tokenizer.name: _anonymous_tokenizer }
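
Reviewer note: a quick way to smoke-test the rename is the `_analyze` API, which accepts an inline tokenizer definition — the same mechanism the updated YAML test uses. This is a sketch, assuming a local node with the bundled analysis-common module; the sample texts are illustrative, and the patterns are the ones from the diff above:

    POST _analyze
    {
      "tokenizer": {
        "type": "simple_pattern",
        "pattern": "[0123456789]{3}"
      },
      "text": "fd-786-335-514-x"
    }

    POST _analyze
    {
      "tokenizer": {
        "type": "simple_pattern_split",
        "pattern": "_"
      },
      "text": "an_underscored_phrase"
    }

The first request should produce the terms 786, 335, and 514; the second should produce an, underscored, and phrase. Repeating either request with the old simplepattern/simplepatternsplit names should now fail with an unknown tokenizer error, since the plugin change above removes those registrations.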