diff --git a/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
index e68c8df6387..0e029fa2374 100644
--- a/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc
@@ -1,90 +1,52 @@
 [[analysis-common-grams-tokenfilter]]
-=== Common Grams Token Filter
+=== Common grams token filter
+++++
+<titleabbrev>Common grams</titleabbrev>
+++++
 
-Token filter that generates bigrams for frequently occurring terms.
-Single terms are still indexed. It can be used as an alternative to the
-<<analysis-stop-tokenfilter,Stop Token Filter>> when we don't want to completely ignore common terms.
+Generates https://en.wikipedia.org/wiki/Bigram[bigrams] for a specified set of
+common words.
 
-For example, the text "the quick brown is a fox" will be tokenized as
-"the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a",
-"a_fox", "fox". Assuming "the", "is" and "a" are common words.
+For example, you can specify `is` and `the` as common words. This filter then
+converts the tokens `[the, quick, fox, is, brown]` to `[the, the_quick, quick,
+fox, fox_is, is, is_brown, brown]`.
 
-When `query_mode` is enabled, the token filter removes common words and
-single terms followed by a common word. This parameter should be enabled
-in the search analyzer.
+You can use the `common_grams` filter in place of the
+<<analysis-stop-tokenfilter,stop token filter>> when you don't want to
+completely ignore common words.
 
-For example, the query "the quick brown is a fox" will be tokenized as
-"the_quick", "quick", "brown_is", "is_a", "a_fox", "fox".
+This filter uses Lucene's
+https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html[CommonGramsFilter].
 
-The following are settings that can be set:
+[[analysis-common-grams-analyze-ex]]
+==== Example
 
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`common_words` |A list of common words to use.
-
-|`common_words_path` |A path (either relative to `config` location, or
-absolute) to a list of common words. Each word should be in its own
-"line" (separated by a line break). The file must be UTF-8 encoded.
-
-|`ignore_case` |If true, common words matching will be case insensitive
-(defaults to `false`).
-
-|`query_mode` |Generates bigrams then removes common words and single
-terms followed by a common word (defaults to `false`).
-|=======================================================================
-
-Note, `common_words` or `common_words_path` field is required.
-
-Here is an example:
+The following <<indices-analyze,analyze API>> request creates bigrams for `is`
+and `the`:
 
 [source,console]
 --------------------------------------------------
-PUT /common_grams_example
+GET /_analyze
 {
-  "settings": {
-    "analysis": {
-      "analyzer": {
-        "index_grams": {
-          "tokenizer": "whitespace",
-          "filter": ["common_grams"]
-        },
-        "search_grams": {
-          "tokenizer": "whitespace",
-          "filter": ["common_grams_query"]
-        }
-      },
-      "filter": {
-        "common_grams": {
-          "type": "common_grams",
-          "common_words": ["the", "is", "a"]
-        },
-        "common_grams_query": {
-          "type": "common_grams",
-          "query_mode": true,
-          "common_words": ["the", "is", "a"]
-        }
-      }
-    }
-  }
+  "tokenizer" : "whitespace",
+  "filter" : [
+    "common_grams", {
+      "type": "common_grams",
+      "common_words": ["is", "the"]
+    }
+  ],
+  "text" : "the quick fox is brown"
 }
 --------------------------------------------------
-You can see the output by using e.g. the `_analyze` endpoint:
+The filter produces the following tokens:
 
-[source,console]
+[source,text]
 --------------------------------------------------
-POST /common_grams_example/_analyze
-{
-  "analyzer" : "index_grams",
-  "text" : "the quick brown is a fox"
-}
+[ the, the_quick, quick, fox, fox_is, is, is_brown, brown ]
 --------------------------------------------------
-// TEST[continued]
-
-And the response will be:
+/////////////////////
 
 [source,console-result]
 --------------------------------------------------
 {
@@ -112,57 +74,155 @@ And the response will be:
       "position" : 1
     },
     {
-      "token" : "brown",
+      "token" : "fox",
       "start_offset" : 10,
-      "end_offset" : 15,
+      "end_offset" : 13,
       "type" : "word",
       "position" : 2
     },
     {
-      "token" : "brown_is",
+      "token" : "fox_is",
       "start_offset" : 10,
-      "end_offset" : 18,
+      "end_offset" : 16,
       "type" : "gram",
       "position" : 2,
       "positionLength" : 2
     },
     {
       "token" : "is",
-      "start_offset" : 16,
-      "end_offset" : 18,
+      "start_offset" : 14,
+      "end_offset" : 16,
       "type" : "word",
       "position" : 3
     },
     {
-      "token" : "is_a",
-      "start_offset" : 16,
-      "end_offset" : 20,
+      "token" : "is_brown",
+      "start_offset" : 14,
+      "end_offset" : 22,
       "type" : "gram",
       "position" : 3,
       "positionLength" : 2
     },
     {
-      "token" : "a",
-      "start_offset" : 19,
-      "end_offset" : 20,
+      "token" : "brown",
+      "start_offset" : 17,
+      "end_offset" : 22,
       "type" : "word",
       "position" : 4
-    },
-    {
-      "token" : "a_fox",
-      "start_offset" : 19,
-      "end_offset" : 24,
-      "type" : "gram",
-      "position" : 4,
-      "positionLength" : 2
-    },
-    {
-      "token" : "fox",
-      "start_offset" : 21,
-      "end_offset" : 24,
-      "type" : "word",
-      "position" : 5
     }
   ]
 }
 --------------------------------------------------
+/////////////////////
+
+[[analysis-common-grams-tokenfilter-analyzer-ex]]
+==== Add to an analyzer
+
+The following <<indices-create-index,create index API>> request uses the
+`common_grams` filter to configure a new
+<<analysis-custom-analyzer,custom analyzer>>:
+
+[source,console]
+--------------------------------------------------
+PUT /common_grams_example
+{
+  "settings": {
+    "analysis": {
+      "analyzer": {
+        "index_grams": {
+          "tokenizer": "whitespace",
+          "filter": ["common_grams"]
+        }
+      },
+      "filter": {
+        "common_grams": {
+          "type": "common_grams",
+          "common_words": ["a", "is", "the"]
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+
+[[analysis-common-grams-tokenfilter-configure-parms]]
+==== Configurable parameters
+
+`common_words`::
++
+--
+(Required+++*+++, array of strings)
+A list of tokens. The filter generates bigrams for these tokens.
+
+Either this or the `common_words_path` parameter is required.
+--
+
+`common_words_path`::
++
+--
+(Required+++*+++, string)
+Path to a file containing a list of tokens. The filter generates bigrams for
+these tokens.
+
+This path must be absolute or relative to the `config` location. The file must
+be UTF-8 encoded. Each token in the file must be separated by a line break.
+
+Either this or the `common_words` parameter is required.
+--
+
+`ignore_case`::
+(Optional, boolean)
+If `true`, matches for common words are case-insensitive.
+Defaults to `false`.
+
+`query_mode`::
++
+--
+(Optional, boolean)
+If `true`, the filter excludes the following tokens from the output:
+
+* Unigrams for common words
+* Unigrams for terms followed by common words
+
+Defaults to `false`. We recommend enabling this parameter for
+<<search-analyzer,search analyzers>>.
+
+For example, you can enable this parameter and specify `is` and `the` as
+common words. This filter converts the tokens `[the, quick, fox, is, brown]` to
+`[the_quick, quick, fox_is, is_brown]`.
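+
+For a quick check of this behavior, you could run a sketch like the following
+<<indices-analyze,analyze API>> request. The inline filter definition is
+illustrative and is not one of this page's original examples:
+
+[source,console]
+--------------------------------------------------
+GET /_analyze
+{
+  "tokenizer": "whitespace",
+  "filter": [
+    {
+      "type": "common_grams",
+      "common_words": ["is", "the"],
+      "query_mode": true
+    }
+  ],
+  "text": "the quick fox is brown"
+}
+--------------------------------------------------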
+-- + +[[analysis-common-grams-tokenfilter-customize]] +==== Customize + +To customize the `common_grams` filter, duplicate it to create the basis +for a new custom token filter. You can modify the filter using its configurable +parameters. + +For example, the following request creates a custom `common_grams` filter with +`ignore_case` and `query_mode` set to `true`: + +[source,console] +-------------------------------------------------- +PUT /common_grams_example +{ + "settings": { + "analysis": { + "analyzer": { + "index_grams": { + "tokenizer": "whitespace", + "filter": ["common_grams_query"] + } + }, + "filter": { + "common_grams_query": { + "type": "common_grams", + "common_words": ["a", "is", "the"], + "ignore_case": true, + "query_mode": true + } + } + } + } +} +--------------------------------------------------
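+
+To verify the customized filter, you could run a hedged check like the
+following <<indices-analyze,analyze API>> request. It is a sketch rather than
+part of the original page, and assumes the `common_grams_example` index and
+`index_grams` analyzer created above:
+
+[source,console]
+--------------------------------------------------
+GET /common_grams_example/_analyze
+{
+  "analyzer": "index_grams",
+  "text": "The quick fox is brown"
+}
+--------------------------------------------------
+
+Because `ignore_case` is `true`, the capitalized `The` matches the common word
+`the`. Because `query_mode` is `true`, the filter omits unigrams for common
+words and for terms followed by them.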