diff --git a/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc index e53a198df55..4bbe60e52be 100644 --- a/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/length-tokenfilter.asciidoc @@ -1,16 +1,170 @@ [[analysis-length-tokenfilter]] -=== Length Token Filter +=== Length token filter +++++ +Length +++++ -A token filter of type `length` that removes words that are too long or -too short for the stream. +Removes tokens shorter or longer than specified character lengths. +For example, you can use the `length` filter to exclude tokens shorter than 2 +characters and tokens longer than 5 characters. -The following are settings that can be set for a `length` token filter -type: +This filter uses Lucene's +https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/miscellaneous/LengthFilter.html[LengthFilter]. -[cols="<,<",options="header",] -|=========================================================== -|Setting |Description -|`min` |The minimum number. Defaults to `0`. -|`max` |The maximum number. Defaults to `Integer.MAX_VALUE`, which is `2^31-1` or 2147483647. -|=========================================================== +[TIP] +==== +The `length` filter removes entire tokens. If you'd prefer to shorten tokens to +a specific length, use the <<analysis-truncate-tokenfilter,`truncate`>> filter. 
+==== +[[analysis-length-tokenfilter-analyze-ex]] +==== Example + +The following <<indices-analyze,analyze API>> request uses the `length` +filter to remove tokens longer than 4 characters: + +[source,console] +-------------------------------------------------- +GET _analyze +{ + "tokenizer": "whitespace", + "filter": [ + { + "type": "length", + "min": 0, + "max": 4 + } + ], + "text": "the quick brown fox jumps over the lazy dog" +} +-------------------------------------------------- + +The filter produces the following tokens: + +[source,text] +-------------------------------------------------- +[ the, fox, over, the, lazy, dog ] +-------------------------------------------------- + +///////////////////// +[source,console-result] +-------------------------------------------------- +{ + "tokens": [ + { + "token": "the", + "start_offset": 0, + "end_offset": 3, + "type": "word", + "position": 0 + }, + { + "token": "fox", + "start_offset": 16, + "end_offset": 19, + "type": "word", + "position": 3 + }, + { + "token": "over", + "start_offset": 26, + "end_offset": 30, + "type": "word", + "position": 5 + }, + { + "token": "the", + "start_offset": 31, + "end_offset": 34, + "type": "word", + "position": 6 + }, + { + "token": "lazy", + "start_offset": 35, + "end_offset": 39, + "type": "word", + "position": 7 + }, + { + "token": "dog", + "start_offset": 40, + "end_offset": 43, + "type": "word", + "position": 8 + } + ] +} +-------------------------------------------------- +///////////////////// + +[[analysis-length-tokenfilter-analyzer-ex]] +==== Add to an analyzer + +The following <<indices-create-index,create index API>> request uses the +`length` filter to configure a new +<<analysis-custom-analyzer,custom analyzer>>. 
+ +[source,console] +-------------------------------------------------- +PUT length_example +{ + "settings": { + "analysis": { + "analyzer": { + "standard_length": { + "tokenizer": "standard", + "filter": [ "length" ] + } + } + } + } +} +-------------------------------------------------- + +[[analysis-length-tokenfilter-configure-parms]] +==== Configurable parameters + +`min`:: +(Optional, integer) +Minimum character length of a token. Shorter tokens are excluded from the +output. Defaults to `0`. + +`max`:: +(Optional, integer) +Maximum character length of a token. Longer tokens are excluded from the output. +Defaults to `Integer.MAX_VALUE`, which is `2^31-1` or `2147483647`. + +[[analysis-length-tokenfilter-customize]] +==== Customize + +To customize the `length` filter, duplicate it to create the basis +for a new custom token filter. You can modify the filter using its configurable +parameters. + +For example, the following request creates a custom `length` filter that removes +tokens shorter than 2 characters and tokens longer than 10 characters: + +[source,console] +-------------------------------------------------- +PUT length_custom_example +{ + "settings": { + "analysis": { + "analyzer": { + "whitespace_length_2_to_10_char": { + "tokenizer": "whitespace", + "filter": [ "length_2_to_10_char" ] + } + }, + "filter": { + "length_2_to_10_char": { + "type": "length", + "min": 2, + "max": 10 + } + } + } + } +} +--------------------------------------------------