diff --git a/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc index 85ddd236556..69064eaba63 100644 --- a/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/pattern_replace-tokenfilter.asciidoc @@ -4,23 +4,157 @@ <titleabbrev>Pattern replace</titleabbrev> ++++ -The `pattern_replace` token filter allows to easily handle string -replacements based on a regular expression. The regular expression is -defined using the `pattern` parameter, and the replacement string can be -provided using the `replacement` parameter (supporting referencing the -original text, as explained -http://docs.oracle.com/javase/6/docs/api/java/util/regex/Matcher.html#appendReplacement(java.lang.StringBuffer,%20java.lang.String)[here]). +Uses a regular expression to match and replace token substrings. + +The `pattern_replace` filter uses +http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java's +regular expression syntax]. By default, the filter replaces matching +substrings with an empty substring (`""`). + +Regular expressions cannot be anchored to the +beginning or end of a token. Replacement substrings can use Java's +https://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#appendReplacement-java.lang.StringBuffer-java.lang.String-[`$g` syntax] to reference capture groups +from the original token text. [WARNING] -.Beware of Pathological Regular Expressions -======================================== +==== +A poorly-written regular expression may run slowly or return a +StackOverflowError, causing the node running the expression to exit suddenly. -The pattern replace token filter uses -http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java Regular Expressions]. +Read more about +http://www.regular-expressions.info/catastrophic.html[pathological regular +expressions and how to avoid them]. +==== -A badly written regular expression could run very slowly or even throw a -StackOverflowError and cause the node it is running on to exit suddenly. +This filter uses Lucene's +{lucene-analysis-docs}/pattern/PatternReplaceFilter.html[PatternReplaceFilter]. -Read more about http://www.regular-expressions.info/catastrophic.html[pathological regular expressions and how to avoid them]. +[[analysis-pattern-replace-tokenfilter-analyze-ex]] +==== Example -======================================== +The following <<indices-analyze,analyze API>> request uses the `pattern_replace` +filter to prepend `watch` to the substring `dog` in `foxes jump lazy dogs`. + +[source,console] +---- +GET /_analyze +{ + "tokenizer": "whitespace", + "filter": [ + { + "type": "pattern_replace", + "pattern": "(dog)", + "replacement": "watch$1" + } + ], + "text": "foxes jump lazy dogs" +} +---- + +The filter produces the following tokens. + +[source,text] +---- +[ foxes, jump, lazy, watchdogs ] +---- + +//// +[source,console-result] +---- +{ + "tokens": [ + { + "token": "foxes", + "start_offset": 0, + "end_offset": 5, + "type": "word", + "position": 0 + }, + { + "token": "jump", + "start_offset": 6, + "end_offset": 10, + "type": "word", + "position": 1 + }, + { + "token": "lazy", + "start_offset": 11, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "watchdogs", + "start_offset": 16, + "end_offset": 20, + "type": "word", + "position": 3 + } + ] +} +---- +//// + +[[analysis-pattern-replace-tokenfilter-configure-parms]] +==== Configurable parameters + +`all`:: +(Optional, boolean) +If `true`, all substrings matching the `pattern` parameter's regular expression +are replaced. If `false`, the filter replaces only the first matching substring +in each token. Defaults to `true`. + +`pattern`:: +(Required, string) +Regular expression, written in +http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html[Java's +regular expression syntax]. The filter replaces token substrings matching this +pattern with the substring in the `replacement` parameter. + +`replacement`:: +(Optional, string) +Replacement substring. Defaults to an empty substring (`""`). + +[[analysis-pattern-replace-tokenfilter-customize]] +==== Customize and add to an analyzer + +To customize the `pattern_replace` filter, duplicate it to create the basis +for a new custom token filter. You can modify the filter using its configurable +parameters. + +The following <<indices-create-index,create index API>> request +configures a new <<analysis-custom-analyzer,custom analyzer>> using a custom +`pattern_replace` filter, `my_pattern_replace_filter`. + +The `my_pattern_replace_filter` filter uses the regular expression `[£|€]` to +match and remove the currency symbols `£` and `€`. The filter's `all` +parameter is `false`, meaning only the first matching symbol in each token is +removed. + +[source,console] +---- +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "keyword", + "filter": [ + "my_pattern_replace_filter" + ] + } + }, + "filter": { + "my_pattern_replace_filter": { + "type": "pattern_replace", + "pattern": "[£|€]", + "replacement": "", + "all": false + } + } + } + } +} +---- \ No newline at end of file