From e1eebea846d698158f690abb60bbfce53b956e2d Mon Sep 17 00:00:00 2001
From: James Rodewig
Date: Mon, 16 Mar 2020 11:37:06 -0400
Subject: [PATCH] [DOCS] Reformat `remove_duplicates` token filter (#53608)

Makes the following changes to the `remove_duplicates` token filter docs:

* Rewrites description and adds Lucene link
* Adds detailed analyze example
* Adds custom analyzer example
---
 .../remove-duplicates-tokenfilter.asciidoc    | 150 +++++++++++++++++-
 1 file changed, 148 insertions(+), 2 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc
index e9dbf1ed153..76f43bffc72 100644
--- a/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/remove-duplicates-tokenfilter.asciidoc
@@ -4,5 +4,151 @@
 <titleabbrev>Remove duplicates</titleabbrev>
 ++++
 
-A token filter of type `remove_duplicates` that drops identical tokens at the
-same position.
+Removes duplicate tokens in the same position.
+
+The `remove_duplicates` filter uses Lucene's
+{lucene-analysis-docs}/miscellaneous/RemoveDuplicatesTokenFilter.html[RemoveDuplicatesTokenFilter].
+
+[[analysis-remove-duplicates-tokenfilter-analyze-ex]]
+==== Example
+
+To see how the `remove_duplicates` filter works, you first need to produce a
+token stream containing duplicate tokens in the same position.
+
+The following <<indices-analyze,analyze API>> request uses the
+<<analysis-keyword-repeat-tokenfilter,`keyword_repeat`>> and
+<<analysis-stemmer-tokenfilter,`stemmer`>> filters to create stemmed and
+unstemmed tokens for `jumping dog`.
+
+[source,console]
+----
+GET _analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    "keyword_repeat",
+    "stemmer"
+  ],
+  "text": "jumping dog"
+}
+----
+
+The API returns the following response. Note that the `dog` token in position
+`1` is duplicated.
+
+[source,console-result]
+----
+{
+  "tokens": [
+    {
+      "token": "jumping",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "jump",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "dog",
+      "start_offset": 8,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    },
+    {
+      "token": "dog",
+      "start_offset": 8,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    }
+  ]
+}
+----
+
+To remove one of the duplicate `dog` tokens, add the `remove_duplicates` filter
+to the previous analyze API request.
+
+[source,console]
+----
+GET _analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    "keyword_repeat",
+    "stemmer",
+    "remove_duplicates"
+  ],
+  "text": "jumping dog"
+}
+----
+
+The API returns the following response. There is now only one `dog` token in
+position `1`.
+
+[source,console-result]
+----
+{
+  "tokens": [
+    {
+      "token": "jumping",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "jump",
+      "start_offset": 0,
+      "end_offset": 7,
+      "type": "word",
+      "position": 0
+    },
+    {
+      "token": "dog",
+      "start_offset": 8,
+      "end_offset": 11,
+      "type": "word",
+      "position": 1
+    }
+  ]
+}
+----
+
+[[analysis-remove-duplicates-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+`remove_duplicates` filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>.
+
+This custom analyzer uses the `keyword_repeat` and `stemmer` filters to create a
+stemmed and unstemmed version of each token in a stream. The `remove_duplicates`
+filter then removes any duplicate tokens in the same position.
+
+[source,console]
+----
+PUT my_index
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "my_custom_analyzer": {
+          "tokenizer": "standard",
+          "filter": [
+            "keyword_repeat",
+            "stemmer",
+            "remove_duplicates"
+          ]
+        }
+      }
+    }
+  }
+}
+----
\ No newline at end of file
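
To verify the analyzer configured by this patch, you can run sample text through it with the analyze API. This is a minimal sketch rather than part of the patch above; it assumes the `my_index` settings from the preceding request have been applied.

[source,console]
----
GET my_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "jumping dog"
}
----

With the `standard` tokenizer and the `keyword_repeat`, `stemmer`, and `remove_duplicates` filters, the response should contain the `jumping` and `jump` tokens in position `0` and a single `dog` token in position `1`.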