CONSOLEify analysis docs

Converts the analysis docs that were marked as json into `CONSOLE`
format. A few of them were in yaml but marked as json for historical
reasons. I added more complete examples for a few of the less obvious
ones.

Relates to #18160
Nik Everett 2017-04-02 11:15:26 -04:00
parent 01b807f98e
commit ad69503dce
15 changed files with 439 additions and 158 deletions

View File

@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [
'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc',
'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc',
'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc',
'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc',
'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc',
'reference/cat/snapshots.asciidoc',
'reference/cat/templates.asciidoc',
'reference/cat/thread_pool.asciidoc',
@ -124,10 +112,14 @@ integTestCluster {
configFile 'scripts/my_map_script.painless'
configFile 'scripts/my_combine_script.painless'
configFile 'scripts/my_reduce_script.painless'
configFile 'analysis/example_word_list.txt'
configFile 'analysis/hyphenation_patterns.xml'
configFile 'analysis/synonym.txt'
configFile 'analysis/stemmer_override.txt'
configFile 'userdict_ja.txt'
configFile 'KeywordTokenizer.rbbi'
extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'
// Whitelist reindexing from the local node so we can test it.
setting 'reindex.remote.whitelist', '127.0.0.1:*'
}

View File

@ -8,17 +8,21 @@ equivalents, if one exists. Example:
[source,js]
--------------------------------------------------
"index" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "asciifolding"]
PUT /asciifold_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "asciifolding"]
}
}
}
}
}
--------------------------------------------------
// CONSOLE
Accepts a `preserve_original` setting which defaults to `false` but, if
`true`, will emit the original token as well as the folded token. For
@ -26,20 +30,24 @@ example:
[source,js]
--------------------------------------------------
"index" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "my_ascii_folding"]
}
},
"filter" : {
"my_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : true
PUT /asciifold_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "my_ascii_folding"]
}
},
"filter" : {
"my_ascii_folding" : {
"type" : "asciifolding",
"preserve_original" : true
}
}
}
}
}
--------------------------------------------------
// CONSOLE
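To see the effect of `preserve_original` you can run some text through the
`_analyze` API (a sketch, not part of the original example; `default` is the
analyzer defined above):
[source,js]
--------------------------------------------------
POST /asciifold_example/_analyze
{
  "analyzer" : "default",
  "text" : "açaí à la carte"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
The response should contain both the accented originals (for example `açaí`)
and their folded equivalents (`acai`).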

View File

@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
[source,js]
--------------------------------------------------
PUT /cjk_bigram_example
{
"index" : {
"settings" : {
"analysis" : {
"analyzer" : {
"han_bigrams" : {
@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
}
}
--------------------------------------------------
// CONSOLE
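As a quick check (a sketch; `han_bigrams` is the analyzer defined above), you
can inspect the bigrams it produces:
[source,js]
--------------------------------------------------
POST /cjk_bigram_example/_analyze
{
  "analyzer" : "han_bigrams",
  "text" : "東京都"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
The han characters should come back as overlapping bigrams such as `東京` and
`京都`.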

View File

@ -41,21 +41,33 @@ Here is an example:
[source,js]
--------------------------------------------------
index :
analysis :
analyzer :
index_grams :
tokenizer : whitespace
filter : [common_grams]
search_grams :
tokenizer : whitespace
filter : [common_grams_query]
filter :
common_grams :
type : common_grams
common_words: [a, an, the]
common_grams_query :
type : common_grams
query_mode: true
common_words: [a, an, the]
PUT /common_grams_example
{
"settings": {
"analysis": {
"my_analyzer": {
"index_grams": {
"tokenizer": "whitespace",
"filter": ["common_grams"]
},
"search_grams": {
"tokenizer": "whitespace",
"filter": ["common_grams_query"]
}
},
"filter": {
"common_grams": {
"type": "common_grams",
"common_words": ["a", "an", "the"]
},
"common_grams_query": {
"type": "common_grams",
"query_mode": true,
"common_words": ["a", "an", "the"]
}
}
}
}
}
--------------------------------------------------
// CONSOLE
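You can check the output with a request like this (a sketch; `index_grams` is
the analyzer defined above):
[source,js]
--------------------------------------------------
POST /common_grams_example/_analyze
{
  "analyzer" : "index_grams",
  "text" : "the quick brown is a fox"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
The output should include grams built around the common words, such as
`the_quick` and `a_fox`, alongside the plain terms.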

View File

@ -1,5 +1,5 @@
[[analysis-compound-word-tokenfilter]]
=== Compound Word Token Filter
=== Compound Word Token Filters
The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
decompose compound words found in many Germanic languages into word parts.
@ -84,20 +84,31 @@ Here is an example:
[source,js]
--------------------------------------------------
index :
analysis :
analyzer :
myAnalyzer2 :
type : custom
tokenizer : standard
filter : [myTokenFilter1, myTokenFilter2]
filter :
myTokenFilter1 :
type : dictionary_decompounder
word_list: [one, two, three]
myTokenFilter2 :
type : hyphenation_decompounder
word_list_path: path/to/words.txt
hyphenation_patterns_path: path/to/fop.xml
max_subword_size : 22
PUT /compound_word_example
{
"index": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["dictionary_decompounder", "hyphenation_decompounder"]
}
},
"filter": {
"dictionary_decompounder": {
"type": "dictionary_decompounder",
"word_list": ["one", "two", "three"]
},
"hyphenation_decompounder": {
"type" : "hyphenation_decompounder",
"word_list_path": "analysis/example_word_list.txt",
"hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
"max_subword_size": 22
}
}
}
}
}
--------------------------------------------------
// CONSOLE
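As an illustration only (the word lists above are toy examples), analyzing a
token built from the dictionary words should show the decompounded parts:
[source,js]
--------------------------------------------------
POST /compound_word_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "onetwothree"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
The `dictionary_decompounder` should emit the original token plus `one`, `two`
and `three` as subwords.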

View File

@ -9,20 +9,24 @@ example:
[source,js]
--------------------------------------------------
"index" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "elision"]
}
},
"filter" : {
"elision" : {
"type" : "elision",
"articles" : ["l", "m", "t", "qu", "n", "s", "j"]
PUT /elision_example
{
"settings" : {
"analysis" : {
"analyzer" : {
"default" : {
"tokenizer" : "standard",
"filter" : ["standard", "elision"]
}
},
"filter" : {
"elision" : {
"type" : "elision",
"articles" : ["l", "m", "t", "qu", "n", "s", "j"]
}
}
}
}
}
--------------------------------------------------
// CONSOLE
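A quick way to try it (a sketch; `default` is the analyzer configured above):
[source,js]
--------------------------------------------------
POST /elision_example/_analyze
{
  "analyzer" : "default",
  "text" : "l'avion"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
The leading `l'` should be elided, leaving a single `avion` token.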

View File

@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up).
For example, assuming the default hunspell location is used, the
following directory layout will define the `en_US` dictionary:
[source,js]
[source,txt]
--------------------------------------------------
- conf
|-- hunspell
@ -42,24 +42,28 @@ settings:
[source,js]
--------------------------------------------------
PUT /hunspell_example
{
"analysis" : {
"analyzer" : {
"en" : {
"tokenizer" : "standard",
"filter" : [ "lowercase", "en_US" ]
}
},
"filter" : {
"en_US" : {
"type" : "hunspell",
"locale" : "en_US",
"dedup" : true
"settings": {
"analysis" : {
"analyzer" : {
"en" : {
"tokenizer" : "standard",
"filter" : [ "lowercase", "en_US" ]
}
},
"filter" : {
"en_US" : {
"type" : "hunspell",
"locale" : "en_US",
"dedup" : true
}
}
}
}
}
--------------------------------------------------
// CONSOLE
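Assuming the `en_US` dictionary files are available in the hunspell directory
(as configured for the docs tests above), a sketch of exercising the `en`
analyzer looks like:
[source,js]
--------------------------------------------------
POST /hunspell_example/_analyze
{
  "analyzer" : "en",
  "text" : "dogs"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
which should stem `dogs` down to `dog`.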
The hunspell token filter accepts four options:

View File

@ -1,7 +1,7 @@
[[analysis-keep-types-tokenfilter]]
=== Keep Types Token Filter
A token filter of type `keep_types` that only keeps tokens with a token type
A token filter of type `keep_types` that only keeps tokens with a token type
contained in a predefined set.
@ -14,24 +14,61 @@ types:: a list of types to keep
[float]
=== Settings example
You can set it up like:
[source,js]
--------------------------------------------------
PUT /keep_types_example
{
"index" : {
"settings" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "extract_numbers"]
},
}
},
"filter" : {
"extract_numbers" : {
"type" : "keep_types",
"types" : [ "<NUM>" ]
},
}
}
}
}
}
--------------------------------------------------
// CONSOLE
And test it like:
[source,js]
--------------------------------------------------
POST /keep_types_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "this is just 1 a test"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "1",
"start_offset": 13,
"end_offset": 14,
"type": "<NUM>",
"position": 3
}
]
}
--------------------------------------------------
// TESTRESPONSE
Note how only the `<NUM>` token is in the output.

View File

@ -20,17 +20,18 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
[source,js]
--------------------------------------------------
PUT /keep_words_example
{
"index" : {
"settings" : {
"analysis" : {
"analyzer" : {
"my_analyzer" : {
"example_1" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "words_till_three"]
},
"my_analyzer1" : {
"example_2" : {
"tokenizer" : "standard",
"filter" : ["standard", "lowercase", "words_on_file"]
"filter" : ["standard", "lowercase", "words_in_file"]
}
},
"filter" : {
@ -38,12 +39,13 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
"type" : "keep",
"keep_words" : [ "one", "two", "three"]
},
"words_on_file" : {
"words_in_file" : {
"type" : "keep",
"keep_words_path" : "/path/to/word/file"
"keep_words_path" : "analysis/example_word_list.txt"
}
}
}
}
}
--------------------------------------------------
// CONSOLE
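And you can test it like this (a sketch; `example_1` is the first analyzer
defined above):
[source,js]
--------------------------------------------------
POST /keep_words_example/_analyze
{
  "analyzer" : "example_1",
  "text" : "one two four"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
Only `one` and `two` should survive the filter.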

View File

@ -19,19 +19,124 @@ in the text.
`false`.
|=======================================================================
Here is an example:
You can configure it like:
[source,js]
--------------------------------------------------
index :
analysis :
analyzer :
myAnalyzer :
type : custom
tokenizer : standard
filter : [lowercase, protwords, porter_stem]
filter :
protwords :
type : keyword_marker
keywords_path : analysis/protwords.txt
PUT /keyword_marker_example
{
"settings": {
"analysis": {
"analyzer": {
"protect_cats": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "protect_cats", "porter_stem"]
},
"normal": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "porter_stem"]
}
},
"filter": {
"protect_cats": {
"type": "keyword_marker",
"keywords": ["cats"]
}
}
}
}
}
--------------------------------------------------
// CONSOLE
And test it with:
[source,js]
--------------------------------------------------
POST /keyword_marker_example/_analyze
{
"analyzer" : "protect_cats",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cats",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
Compare this to the `normal` analyzer, which stems `cats` to `cat`:
[source,js]
--------------------------------------------------
POST /keyword_marker_example/_analyze
{
"analyzer" : "normal",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
Response:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cat",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE

View File

@ -9,18 +9,85 @@ subsequent stemmer will be indexed twice. Therefore, consider adding a
`unique` filter with `only_on_same_position` set to `true` to drop
unnecessary duplicates.
Here is an example:
Here is an example of using the `keyword_repeat` token filter to
preserve both the stemmed and unstemmed versions of tokens:
[source,js]
--------------------------------------------------
index :
analysis :
analyzer :
myAnalyzer :
type : custom
tokenizer : standard
filter : [lowercase, keyword_repeat, porter_stem, unique_stem]
unique_stem:
type: unique
only_on_same_position : true
PUT /keyword_repeat_example
{
"settings": {
"analysis": {
"analyzer": {
"stemmed_and_unstemmed": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "keyword_repeat", "porter_stem", "unique_stem"]
}
},
"filter": {
"unique_stem": {
"type": "unique",
"only_on_same_position": true
}
}
}
}
}
--------------------------------------------------
// CONSOLE
And you can test it with:
[source,js]
--------------------------------------------------
POST /keyword_repeat_example/_analyze
{
"analyzer" : "stemmed_and_unstemmed",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cats",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "cat",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
This preserves both the `cat` and `cats` tokens. Compare this to the example
in <<analysis-keyword-marker-tokenfilter>>.

View File

@ -18,15 +18,25 @@ Here is an example:
[source,js]
--------------------------------------------------
index :
analysis :
analyzer :
myAnalyzer :
type : custom
tokenizer : standard
filter : [lowercase, five_token_limit]
filter :
five_token_limit :
type : limit
max_token_count : 5
PUT /limit_example
{
"settings": {
"analysis": {
"analyzer": {
"limit_example": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "five_token_limit"]
}
},
"filter": {
"five_token_limit": {
"type": "limit",
"max_token_count": 5
}
}
}
}
}
--------------------------------------------------
// CONSOLE
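As a rough check (a sketch; `limit_example` is the analyzer defined above),
analyzing more than five words should only return tokens for the first five:
[source,js]
--------------------------------------------------
POST /limit_example/_analyze
{
  "analyzer" : "limit_example",
  "text" : "one two three four five six seven eight"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]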

View File

@ -10,28 +10,30 @@ custom analyzer
[source,js]
--------------------------------------------------
index :
analysis :
analyzer :
myAnalyzer2 :
type : custom
tokenizer : myTokenizer1
filter : [myTokenFilter1, myGreekLowerCaseFilter]
char_filter : [my_html]
tokenizer :
myTokenizer1 :
type : standard
max_token_length : 900
filter :
myTokenFilter1 :
type : stop
stopwords : [stop1, stop2, stop3, stop4]
myGreekLowerCaseFilter :
type : lowercase
language : greek
char_filter :
my_html :
type : html_strip
escaped_tags : [xxx, yyy]
read_ahead : 1024
PUT /lowercase_example
{
"settings": {
"analysis": {
"analyzer": {
"standard_lowercase_example": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase"]
},
"greek_lowercase_example": {
"type": "custom",
"tokenizer": "standard",
"filter": ["greek_lowercase"]
}
},
"filter": {
"greek_lowercase": {
"type": "lowercase",
"language": "greek"
}
}
}
}
}
--------------------------------------------------
// CONSOLE
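For example (a sketch; `greek_lowercase_example` is the analyzer defined
above), Greek text can be lowercased with the language-specific filter:
[source,js]
--------------------------------------------------
POST /lowercase_example/_analyze
{
  "analyzer" : "greek_lowercase_example",
  "text" : "ΑΥΤΗ ΕΙΝΑΙ ΜΙΑ ΔΟΚΙΜΗ"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]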

View File

@ -0,0 +1,4 @@
test
list
of
words

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
<!-- Example hyphenation patterns file. -->
<hyphenation-info>
<hyphen-char value="-"/>
<hyphen-min before="2" after="2"/>
<classes>
aA
</classes>
<exceptions>
</exceptions>
<patterns>
.a2
</patterns>
</hyphenation-info>