diff --git a/docs/reference/analysis/tokenfilters.asciidoc b/docs/reference/analysis/tokenfilters.asciidoc
index 14a67b2fc04..b33356d3c0b 100644
--- a/docs/reference/analysis/tokenfilters.asciidoc
+++ b/docs/reference/analysis/tokenfilters.asciidoc
@@ -1,7 +1,7 @@
 [[analysis-tokenfilters]]
 == Token Filters
 
-Token filters accept a stream of tokens from a
+Token filters accept a stream of tokens from a <<analysis-tokenizers,tokenizer>>
 and can modify tokens (eg lowercasing), delete tokens (eg
 remove stopwords) or add tokens (eg synonyms).
 
@@ -71,6 +71,10 @@ include::tokenfilters/common-grams-tokenfilter.asciidoc[]
 
 include::tokenfilters/normalization-tokenfilter.asciidoc[]
 
+include::tokenfilters/cjk-width-tokenfilter.asciidoc[]
+
+include::tokenfilters/cjk-bigram-tokenfilter.asciidoc[]
+
 include::tokenfilters/delimited-payload-tokenfilter.asciidoc[]
 
 include::tokenfilters/keep-words-tokenfilter.asciidoc[]
diff --git a/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc
new file mode 100644
index 00000000000..4805d3dc950
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc
@@ -0,0 +1,42 @@
+[[analysis-cjk-bigram-tokenfilter]]
+=== CJK Bigram Token Filter
+
+The `cjk_bigram` token filter forms bigrams out of the CJK
+terms that are generated by the <<analysis-standard-tokenizer,`standard` tokenizer>>
+or the `icu_tokenizer` (see <<analysis-icu-plugin,ICU analysis plugin>>).
+
+By default, when a CJK character has no adjacent characters to form a bigram,
+it is output in unigram form. If you always want to output both unigrams and
+bigrams, set the `output_unigrams` flag to `true`. This can be used for a
+combined unigram+bigram approach.
+
+Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
+`hangul`, but bigrams can be disabled for particular scripts with the
+`ignored_scripts` parameter. All non-CJK input is passed through unmodified.
+
+[source,js]
+--------------------------------------------------
+{
+    "index" : {
+        "analysis" : {
+            "analyzer" : {
+                "han_bigrams" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["han_bigrams_filter"]
+                }
+            },
+            "filter" : {
+                "han_bigrams_filter" : {
+                    "type" : "cjk_bigram",
+                    "ignored_scripts": [
+                        "hiragana",
+                        "katakana",
+                        "hangul"
+                    ],
+                    "output_unigrams" : true
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
diff --git a/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc
new file mode 100644
index 00000000000..11bdf0f77dc
--- /dev/null
+++ b/docs/reference/analysis/tokenfilters/cjk-width-tokenfilter.asciidoc
@@ -0,0 +1,12 @@
+[[analysis-cjk-width-tokenfilter]]
+=== CJK Width Token Filter
+
+The `cjk_width` token filter normalizes CJK width differences:
+
+* Folds fullwidth ASCII variants into the equivalent basic Latin
+* Folds halfwidth Katakana variants into the equivalent Kana
+
+NOTE: This token filter can be viewed as a subset of NFKC/NFKD
+Unicode normalization. See the <<analysis-icu-plugin,ICU analysis plugin>>
+for full normalization support.