diff --git a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc index 5f6bec96dba..d806bb2e9ac 100644 --- a/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/shingle-tokenfilter.asciidoc @@ -4,46 +4,507 @@ Shingle ++++ -NOTE: Shingles are generally used to help speed up phrase queries. Rather -than building filter chains by hand, you may find it easier to use the -<> option on a text field. +Add shingles, or word https://en.wikipedia.org/wiki/N-gram[n-grams], to a token +stream by concatenating adjacent tokens. By default, the `shingle` token filter +outputs two-word shingles and unigrams. -A token filter of type `shingle` that constructs shingles (token -n-grams) from a token stream. In other words, it creates combinations of -tokens as a single token. For example, the sentence "please divide this -sentence into shingles" might be tokenized into shingles "please -divide", "divide this", "this sentence", "sentence into", and "into -shingles". +For example, many tokenizers convert `the lazy dog` to `[ the, lazy, dog ]`. You +can use the `shingle` filter to add two-word shingles to this stream: +`[ the, the lazy, lazy, lazy dog, dog ]`. -This filter handles position increments > 1 by inserting filler tokens -(tokens with termtext "_"). It does not handle a position increment of -0. +TIP: Shingles are often used to help speed up phrase queries, such as +<>. Rather than creating shingles +using the `shingle` filter, we recommend you use the +<> mapping parameter on the appropriate +<> field instead. -The following are settings that can be set for a `shingle` token filter -type: +This filter uses Lucene's +{lucene-analysis-docs}/shingle/ShingleFilter.html[ShingleFilter]. 
-[cols="<,<",options="header",] -|======================================================================= -|Setting |Description -|`max_shingle_size` |The maximum shingle size. Defaults to `2`. +[[analysis-shingle-tokenfilter-analyze-ex]] +==== Example -|`min_shingle_size` |The minimum shingle size. Defaults to `2`. +The following <> request uses the `shingle` +filter to add two-word shingles to the token stream for `quick brown fox jumps`: -|`output_unigrams` |If `true` the output will contain the input tokens -(unigrams) as well as the shingles. Defaults to `true`. +[source,console] +---- +GET /_analyze +{ + "tokenizer": "whitespace", + "filter": [ "shingle" ], + "text": "quick brown fox jumps" +} +---- -|`output_unigrams_if_no_shingles` |If `output_unigrams` is `false` the -output will contain the input tokens (unigrams) if no shingles are -available. Note if `output_unigrams` is set to `true` this setting has -no effect. Defaults to `false`. +The filter produces the following tokens: -|`token_separator` |The string to use when joining adjacent tokens to -form a shingle. Defaults to `" "`. -|`filler_token` | The string to use as a replacement for each position -at which there is no actual token in the stream. For instance this string is -used if the position increment is greater than one when a `stop` filter is used -together with the `shingle` filter. Defaults to `"_"` -|======================================================================= +[source,text] +---- +[ quick, quick brown, brown, brown fox, fox, fox jumps, jumps ] +---- -The index level setting `index.max_shingle_diff` controls the maximum allowed -difference between `max_shingle_size` and `min_shingle_size`. 
+//// +[source,console-result] +---- +{ + "tokens": [ + { + "token": "quick", + "start_offset": 0, + "end_offset": 5, + "type": "word", + "position": 0 + }, + { + "token": "quick brown", + "start_offset": 0, + "end_offset": 11, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "brown", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "brown fox", + "start_offset": 6, + "end_offset": 15, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "fox", + "start_offset": 12, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "fox jumps", + "start_offset": 12, + "end_offset": 21, + "type": "shingle", + "position": 2, + "positionLength": 2 + }, + { + "token": "jumps", + "start_offset": 16, + "end_offset": 21, + "type": "word", + "position": 3 + } + ] +} +---- +//// + +To produce shingles of 2-3 words, add the following arguments to the analyze API +request: + +* `min_shingle_size`: `2` +* `max_shingle_size`: `3` + +[source,console] +---- +GET /_analyze +{ + "tokenizer": "whitespace", + "filter": [ + { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 3 + } + ], + "text": "quick brown fox jumps" +} +---- + +The filter produces the following tokens: + +[source,text] +---- +[ quick, quick brown, quick brown fox, brown, brown fox, brown fox jumps, fox, fox jumps, jumps ] +---- + +//// +[source, console-result] +---- +{ + "tokens": [ + { + "token": "quick", + "start_offset": 0, + "end_offset": 5, + "type": "word", + "position": 0 + }, + { + "token": "quick brown", + "start_offset": 0, + "end_offset": 11, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "quick brown fox", + "start_offset": 0, + "end_offset": 15, + "type": "shingle", + "position": 0, + "positionLength": 3 + }, + { + "token": "brown", + "start_offset": 6, + "end_offset": 11, + "type": "word", + "position": 1 + }, + { + "token": "brown fox", + 
"start_offset": 6, + "end_offset": 15, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "brown fox jumps", + "start_offset": 6, + "end_offset": 21, + "type": "shingle", + "position": 1, + "positionLength": 3 + }, + { + "token": "fox", + "start_offset": 12, + "end_offset": 15, + "type": "word", + "position": 2 + }, + { + "token": "fox jumps", + "start_offset": 12, + "end_offset": 21, + "type": "shingle", + "position": 2, + "positionLength": 2 + }, + { + "token": "jumps", + "start_offset": 16, + "end_offset": 21, + "type": "word", + "position": 3 + } + ] +} +---- +//// + +To only include shingles in the output, add an `output_unigrams` argument of +`false` to the request. + +[source,console] +---- +GET /_analyze +{ + "tokenizer": "whitespace", + "filter": [ + { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 3, + "output_unigrams": false + } + ], + "text": "quick brown fox jumps" +} +---- + +The filter produces the following tokens: + +[source,text] +---- +[ quick brown, quick brown fox, brown fox, brown fox jumps, fox jumps ] +---- + +//// +[source, console-result] +---- +{ + "tokens": [ + { + "token": "quick brown", + "start_offset": 0, + "end_offset": 11, + "type": "shingle", + "position": 0 + }, + { + "token": "quick brown fox", + "start_offset": 0, + "end_offset": 15, + "type": "shingle", + "position": 0, + "positionLength": 2 + }, + { + "token": "brown fox", + "start_offset": 6, + "end_offset": 15, + "type": "shingle", + "position": 1 + }, + { + "token": "brown fox jumps", + "start_offset": 6, + "end_offset": 21, + "type": "shingle", + "position": 1, + "positionLength": 2 + }, + { + "token": "fox jumps", + "start_offset": 12, + "end_offset": 21, + "type": "shingle", + "position": 2 + } + ] +} +---- +//// + +[[analysis-shingle-tokenfilter-analyzer-ex]] +==== Add to an analyzer + +The following <> request uses the +`shingle` filter to configure a new <>. 
+ +[source,console] +---- +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "standard_shingle": { + "tokenizer": "standard", + "filter": [ "shingle" ] + } + } + } + } +} +---- + +[[analysis-shingle-tokenfilter-configure-parms]] +==== Configurable parameters + +`max_shingle_size`:: +(Optional, integer) +Maximum number of tokens to concatenate when creating shingles. Defaults to `2`. ++ +NOTE: This value cannot be lower than the `min_shingle_size` argument, which +defaults to `2`. The difference between this value and the `min_shingle_size` +argument cannot exceed the <> +index-level setting, which defaults to `3`. + +`min_shingle_size`:: +(Optional, integer) +Minimum number of tokens to concatenate when creating shingles. Defaults to `2`. ++ +NOTE: This value cannot exceed the `max_shingle_size` argument, which defaults +to `2`. The difference between the `max_shingle_size` argument and this value +cannot exceed the <> +index-level setting, which defaults to `3`. + +`output_unigrams`:: +(Optional, boolean) +If `true`, the output includes the original input tokens. If `false`, the output +only includes shingles; the original input tokens are removed. Defaults to +`true`. + +`output_unigrams_if_no_shingles`:: +If `true`, the output includes the original input tokens only if no shingles are +produced; if shingles are produced, the output only includes shingles. Defaults +to `false`. ++ +IMPORTANT: If both this and the `output_unigrams` parameter are `true`, only the +`output_unigrams` argument is used. + +`token_separator`:: +(Optional, string) +Separator used to concatenate adjacent tokens to form a shingle. Defaults to a +space (`" "`). + +`filler_token`:: ++ +-- +(Optional, string) +String used in shingles as a replacement for empty positions that do not contain +a token. This filler token is only used in shingles, not original unigrams. +Defaults to an underscore (`_`). 
+ +Some token filters, such as the `stop` filter, create empty positions when +removing stop words with a position increment greater than one. + +.*Example* +[%collapsible] +==== +In the following <> request, the `stop` filter +removes the stop word `a` from `fox jumps a lazy dog`, creating an empty +position. The subsequent `shingle` filter replaces this empty position with a +plus sign (`+`) in shingles. + +[source,console] +---- +GET /_analyze +{ + "tokenizer": "whitespace", + "filter": [ + { + "type": "stop", + "stopwords": [ "a" ] + }, + { + "type": "shingle", + "filler_token": "+" + } + ], + "text": "fox jumps a lazy dog" +} +---- + +The filter produces the following tokens: + +[source,text] +---- +[ fox, fox jumps, jumps, jumps +, + lazy, lazy, lazy dog, dog ] +---- + +//// +[source, console-result] +---- +{ + "tokens" : [ + { + "token" : "fox", + "start_offset" : 0, + "end_offset" : 3, + "type" : "word", + "position" : 0 + }, + { + "token" : "fox jumps", + "start_offset" : 0, + "end_offset" : 9, + "type" : "shingle", + "position" : 0, + "positionLength" : 2 + }, + { + "token" : "jumps", + "start_offset" : 4, + "end_offset" : 9, + "type" : "word", + "position" : 1 + }, + { + "token" : "jumps +", + "start_offset" : 4, + "end_offset" : 12, + "type" : "shingle", + "position" : 1, + "positionLength" : 2 + }, + { + "token" : "+ lazy", + "start_offset" : 12, + "end_offset" : 16, + "type" : "shingle", + "position" : 2, + "positionLength" : 2 + }, + { + "token" : "lazy", + "start_offset" : 12, + "end_offset" : 16, + "type" : "word", + "position" : 3 + }, + { + "token" : "lazy dog", + "start_offset" : 12, + "end_offset" : 20, + "type" : "shingle", + "position" : 3, + "positionLength" : 2 + }, + { + "token" : "dog", + "start_offset" : 17, + "end_offset" : 20, + "type" : "word", + "position" : 4 + } + ] +} +---- +//// +==== +-- + +[[analysis-shingle-tokenfilter-customize]] +==== Customize + +To customize the `shingle` filter, duplicate it to create the basis for a new 
+custom token filter. You can modify the filter using its configurable +parameters. + +For example, the following <> request +uses a custom `shingle` filter, `my_shingle_filter`, to configure a new +<>. + +The `my_shingle_filter` filter uses a `min_shingle_size` of `2` and a +`max_shingle_size` of `5`, meaning it produces shingles of 2-5 words. +The filter also includes an `output_unigrams` argument of `false`, meaning that +only shingles are included in the output. + +[source,console] +---- +PUT /my_index +{ + "settings": { + "analysis": { + "analyzer": { + "en": { + "tokenizer": "standard", + "filter": [ "my_shingle_filter" ] + } + }, + "filter": { + "my_shingle_filter": { + "type": "shingle", + "min_shingle_size": 2, + "max_shingle_size": 5, + "output_unigrams": false + } + } + } + } +} +---- diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc index 4fb31e31e85..6a9c1a988ea 100644 --- a/docs/reference/index-modules.asciidoc +++ b/docs/reference/index-modules.asciidoc @@ -165,10 +165,12 @@ specific index module: The maximum allowed difference between min_gram and max_gram for NGramTokenizer and NGramTokenFilter. Defaults to `1`. +[[index-max-shingle-diff]] `index.max_shingle_diff`:: - The maximum allowed difference between max_shingle_size and min_shingle_size for ShingleTokenFilter. - Defaults to `3`. + The maximum allowed difference between max_shingle_size and min_shingle_size + for the <>. Defaults to + `3`. `index.blocks.read_only`::