CONSOLEify analysis docs

Converts the analysis docs that were marked as json into `CONSOLE`
format. A few of them were in yaml but marked as json for historical
reasons. I added more complete examples for a few of the less
obvious-sounding ones.
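
For context, "CONSOLE" format means each snippet is a complete, copy-pastable request tagged with `// CONSOLE`, roughly like this illustrative sketch (the index name here is made up, not taken from any of the files below):

[source,js]
--------------------------------------------------
PUT /my_example
{
  "settings": {
    "number_of_shards": 1
  }
}
--------------------------------------------------
// CONSOLE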

Relates to #18160
Nik Everett 2017-04-02 11:15:26 -04:00
parent 01b807f98e
commit ad69503dce
15 changed files with 439 additions and 158 deletions

docs/build.gradle

@@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [
  'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc',
  'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc',
  'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc',
-  'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc',
  'reference/cat/snapshots.asciidoc',
  'reference/cat/templates.asciidoc',
  'reference/cat/thread_pool.asciidoc',
@@ -124,10 +112,14 @@ integTestCluster {
  configFile 'scripts/my_map_script.painless'
  configFile 'scripts/my_combine_script.painless'
  configFile 'scripts/my_reduce_script.painless'
+  configFile 'analysis/example_word_list.txt'
+  configFile 'analysis/hyphenation_patterns.xml'
  configFile 'analysis/synonym.txt'
  configFile 'analysis/stemmer_override.txt'
  configFile 'userdict_ja.txt'
  configFile 'KeywordTokenizer.rbbi'
+  extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
+  extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'
  // Whitelist reindexing from the local node so we can test it.
  setting 'reindex.remote.whitelist', '127.0.0.1:*'
}

docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc

@@ -8,17 +8,21 @@ equivalents, if one exists. Example:
[source,js]
--------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "asciifolding"]
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "asciifolding"]
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
Accepts `preserve_original` setting which defaults to false but if true
will keep the original token as well as emit the folded token. For
@@ -26,20 +30,24 @@ example:
[source,js]
--------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "my_ascii_folding"]
-            }
-        },
-        "filter" : {
-            "my_ascii_folding" : {
-                "type" : "asciifolding",
-                "preserve_original" : true
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "my_ascii_folding"]
+                }
+            },
+            "filter" : {
+                "my_ascii_folding" : {
+                    "type" : "asciifolding",
+                    "preserve_original" : true
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
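
Not part of this commit, but as a sketch of how the converted snippet can be exercised: with the `asciifold_example` index above in place, the `_analyze` API shows the folding (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /asciifold_example/_analyze
{
  "analyzer" : "default",
  "text" : "façade café"
}
--------------------------------------------------

With the plain `asciifolding` filter this should come back as `facade` and `cafe`; with `my_ascii_folding` and `preserve_original` enabled, the accented originals should be emitted as well.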

docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc

@@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
[source,js]
--------------------------------------------------
+PUT /cjk_bigram_example
{
-    "index" : {
+    "settings" : {
        "analysis" : {
            "analyzer" : {
                "han_bigrams" : {
@@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
    }
}
--------------------------------------------------
+// CONSOLE
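
Not part of this commit: a sketch of exercising the `han_bigrams` analyzer defined above via `_analyze` (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /cjk_bigram_example/_analyze
{
  "analyzer" : "han_bigrams",
  "text" : "東京都"
}
--------------------------------------------------

For Han input like this, the filter should emit overlapping bigrams such as `東京` and `京都`.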

docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc

@@ -41,21 +41,33 @@ Here is an example:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            index_grams :
-                tokenizer : whitespace
-                filter : [common_grams]
-            search_grams :
-                tokenizer : whitespace
-                filter : [common_grams_query]
-        filter :
-            common_grams :
-                type : common_grams
-                common_words: [a, an, the]
-            common_grams_query :
-                type : common_grams
-                query_mode: true
-                common_words: [a, an, the]
+PUT /common_grams_example
+{
+    "settings": {
+        "analysis": {
+            "my_analyzer": {
+                "index_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams"]
+                },
+                "search_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams_query"]
+                }
+            },
+            "filter": {
+                "common_grams": {
+                    "type": "common_grams",
+                    "common_words": ["a", "an", "the"]
+                },
+                "common_grams_query": {
+                    "type": "common_grams",
+                    "query_mode": true,
+                    "common_words": ["a", "an", "the"]
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
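
Not part of this commit: assuming the `index_grams` analyzer above is registered under the index's `analyzer` settings, `_analyze` shows the effect of the common-word bigrams (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /common_grams_example/_analyze
{
  "analyzer" : "index_grams",
  "text" : "the quick brown fox"
}
--------------------------------------------------

Alongside the single terms this should produce a `the_quick` bigram; the `search_grams` analyzer, with `query_mode` enabled, should keep the bigram but drop the bare common word.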

docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc

@@ -1,5 +1,5 @@
[[analysis-compound-word-tokenfilter]]
-=== Compound Word Token Filter
+=== Compound Word Token Filters
The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
decompose compound words found in many German languages into word parts.
@@ -84,20 +84,31 @@ Here is an example:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : standard
-                filter : [myTokenFilter1, myTokenFilter2]
-        filter :
-            myTokenFilter1 :
-                type : dictionary_decompounder
-                word_list: [one, two, three]
-            myTokenFilter2 :
-                type : hyphenation_decompounder
-                word_list_path: path/to/words.txt
-                hyphenation_patterns_path: path/to/fop.xml
-                max_subword_size : 22
+PUT /compound_word_example
+{
+    "index": {
+        "analysis": {
+            "analyzer": {
+                "my_analyzer": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["dictionary_decompounder", "hyphenation_decompounder"]
+                }
+            },
+            "filter": {
+                "dictionary_decompounder": {
+                    "type": "dictionary_decompounder",
+                    "word_list": ["one", "two", "three"]
+                },
+                "hyphenation_decompounder": {
+                    "type" : "hyphenation_decompounder",
+                    "word_list_path": "analysis/example_word_list.txt",
+                    "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
+                    "max_subword_size": 22
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
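
Not part of this commit: a sketch of exercising the `my_analyzer` chain above; the sample token is an assumption, and the exact output depends on the word list and hyphenation patterns configured:

[source,js]
--------------------------------------------------
POST /compound_word_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "onetwothree"
}
--------------------------------------------------

The `dictionary_decompounder` should keep the original token and add the sub-words it finds in its `word_list` (`one`, `two`, `three`).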

docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc

@@ -9,20 +9,24 @@ example:
[source,js]
--------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "elision"]
-            }
-        },
-        "filter" : {
-            "elision" : {
-                "type" : "elision",
-                "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+PUT /elision_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "elision"]
+                }
+            },
+            "filter" : {
+                "elision" : {
+                    "type" : "elision",
+                    "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
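
Not part of this commit: a quick `_analyze` sketch against the `elision_example` index above (the French sample text is an assumption):

[source,js]
--------------------------------------------------
POST /elision_example/_analyze
{
  "analyzer" : "default",
  "text" : "l'avion"
}
--------------------------------------------------

Because `l` is in the `articles` list, the elided article should be stripped, leaving just `avion`.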

docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc

@@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up).
For example, assuming the default hunspell location is used, the
following directory layout will define the `en_US` dictionary:
-[source,js]
+[source,txt]
--------------------------------------------------
- conf
    |-- hunspell
@@ -42,24 +42,28 @@ settings:
[source,js]
--------------------------------------------------
+PUT /hunspell_example
{
-    "analysis" : {
-        "analyzer" : {
-            "en" : {
-                "tokenizer" : "standard",
-                "filter" : [ "lowercase", "en_US" ]
-            }
-        },
-        "filter" : {
-            "en_US" : {
-                "type" : "hunspell",
-                "locale" : "en_US",
-                "dedup" : true
+    "settings": {
+        "analysis" : {
+            "analyzer" : {
+                "en" : {
+                    "tokenizer" : "standard",
+                    "filter" : [ "lowercase", "en_US" ]
+                }
+            },
+            "filter" : {
+                "en_US" : {
+                    "type" : "hunspell",
+                    "locale" : "en_US",
+                    "dedup" : true
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
The hunspell token filter accepts four options:
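
Not part of this commit: once the `en_US` dictionary is installed (which is what the new `extraConfigFile` lines in `docs/build.gradle` arrange for the docs test cluster), the `en` analyzer above could be exercised like this; the sample text is an assumption:

[source,js]
--------------------------------------------------
POST /hunspell_example/_analyze
{
  "analyzer" : "en",
  "text" : "foxes"
}
--------------------------------------------------

Hunspell stemming against the `en_US` dictionary should reduce `foxes` to `fox`.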

docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc

@@ -1,7 +1,7 @@
[[analysis-keep-types-tokenfilter]]
=== Keep Types Token Filter
A token filter of type `keep_types` that only keeps tokens with a token type
contained in a predefined set.
@@ -14,24 +14,61 @@ types:: a list of types to keep
[float]
=== Settings example
+You can set it up like:
[source,js]
--------------------------------------------------
+PUT /keep_types_example
{
-    "index" : {
+    "settings" : {
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["standard", "lowercase", "extract_numbers"]
-                },
+                }
            },
            "filter" : {
                "extract_numbers" : {
                    "type" : "keep_types",
                    "types" : [ "<NUM>" ]
-                },
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
And test it like:
[source,js]
--------------------------------------------------
POST /keep_types_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "this is just 1 a test"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "1",
"start_offset": 13,
"end_offset": 14,
"type": "<NUM>",
"position": 3
}
]
}
--------------------------------------------------
// TESTRESPONSE
Note how only the `<NUM>` token is in the output.

docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc

@@ -20,17 +20,18 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
[source,js]
--------------------------------------------------
+PUT /keep_words_example
{
-    "index" : {
+    "settings" : {
        "analysis" : {
            "analyzer" : {
-                "my_analyzer" : {
+                "example_1" : {
                    "tokenizer" : "standard",
                    "filter" : ["standard", "lowercase", "words_till_three"]
                },
-                "my_analyzer1" : {
+                "example_2" : {
                    "tokenizer" : "standard",
-                    "filter" : ["standard", "lowercase", "words_on_file"]
+                    "filter" : ["standard", "lowercase", "words_in_file"]
                }
            },
            "filter" : {
@@ -38,12 +39,13 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
                    "type" : "keep",
                    "keep_words" : [ "one", "two", "three"]
                },
-                "words_on_file" : {
+                "words_in_file" : {
                    "type" : "keep",
-                    "keep_words_path" : "/path/to/word/file"
+                    "keep_words_path" : "analysis/example_word_list.txt"
                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
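
Not part of this commit: a sketch of checking the `example_1` analyzer above with `_analyze` (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /keep_words_example/_analyze
{
  "analyzer" : "example_1",
  "text" : "one four two five three"
}
--------------------------------------------------

Only `one`, `two`, and `three` should survive, since the other tokens are not in `keep_words`.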

docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc

@@ -19,19 +19,124 @@ in the text.
`false`.
|=======================================================================
-Here is an example:
+You can configure it like:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, protwords, porter_stem]
-        filter :
-            protwords :
-                type : keyword_marker
-                keywords_path : analysis/protwords.txt
+PUT /keyword_marker_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "protect_cats": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "protect_cats", "porter_stem"]
+                },
+                "normal": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "porter_stem"]
+                }
+            },
+            "filter": {
+                "protect_cats": {
+                    "type": "keyword_marker",
+                    "keywords": ["cats"]
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
And test it with:
[source,js]
--------------------------------------------------
POST /keyword_marker_example/_analyze
{
"analyzer" : "protect_cats",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cats",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
As compared to the `normal` analyzer which has `cats` stemmed to `cat`:
[source,js]
--------------------------------------------------
POST /keyword_marker_example/_analyze
{
"analyzer" : "normal",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
Response:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cat",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE

docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc

@@ -9,18 +9,85 @@ subsequent stemmer will be indexed twice. Therefore, consider adding a
`unique` filter with `only_on_same_position` set to `true` to drop
unnecessary duplicates.
-Here is an example:
+Here is an example of using the `keyword_repeat` token filter to
+preserve both the stemmed and unstemmed version of tokens:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, keyword_repeat, porter_stem, unique_stem]
-        unique_stem:
-            type: unique
-            only_on_same_position : true
+PUT /keyword_repeat_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "stemmed_and_unstemmed": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "keyword_repeat", "porter_stem", "unique_stem"]
+                }
+            },
+            "filter": {
+                "unique_stem": {
+                    "type": "unique",
+                    "only_on_same_position": true
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
And you can test it with:
[source,js]
--------------------------------------------------
POST /keyword_repeat_example/_analyze
{
"analyzer" : "stemmed_and_unstemmed",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cats",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "cat",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
Which preserves both the `cat` and `cats` tokens. Compare this to the example
on the <<analysis-keyword-marker-tokenfilter>>.

docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc

@@ -18,15 +18,25 @@ Here is an example:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, five_token_limit]
-        filter :
-            five_token_limit :
-                type : limit
-                max_token_count : 5
+PUT /limit_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "limit_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "five_token_limit"]
+                }
+            },
+            "filter": {
+                "five_token_limit": {
+                    "type": "limit",
+                    "max_token_count": 5
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
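
Not part of this commit: a sketch of verifying the `limit_example` analyzer above (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /limit_example/_analyze
{
  "analyzer" : "limit_example",
  "text" : "one two three four five six seven"
}
--------------------------------------------------

With `max_token_count` set to 5, only the first five tokens (`one` through `five`) should come back.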

docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc

@@ -10,28 +10,30 @@ custom analyzer
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : myTokenizer1
-                filter : [myTokenFilter1, myGreekLowerCaseFilter]
-                char_filter : [my_html]
-        tokenizer :
-            myTokenizer1 :
-                type : standard
-                max_token_length : 900
-        filter :
-            myTokenFilter1 :
-                type : stop
-                stopwords : [stop1, stop2, stop3, stop4]
-            myGreekLowerCaseFilter :
-                type : lowercase
-                language : greek
-        char_filter :
-            my_html :
-                type : html_strip
-                escaped_tags : [xxx, yyy]
-                read_ahead : 1024
+PUT /lowercase_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "standard_lowercase_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase"]
+                },
+                "greek_lowercase_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["greek_lowercase"]
+                }
+            },
+            "filter": {
+                "greek_lowercase": {
+                    "type": "lowercase",
+                    "language": "greek"
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
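
Not part of this commit: a sketch of exercising the Greek variant above (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /lowercase_example/_analyze
{
  "analyzer" : "greek_lowercase_example",
  "text" : "ΟΔΥΣΣΕΑΣ"
}
--------------------------------------------------

The output should be lowercased with Greek-specific normalization (for example around the final sigma) that the plain `lowercase` filter does not apply.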

analysis/example_word_list.txt (new file)

@@ -0,0 +1,4 @@
+test
+list
+of
+words

analysis/hyphenation_patterns.xml (new file)

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+<!-- Example hyphenation patterns file. -->
+<hyphenation-info>
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+<classes>
+aA
+</classes>
+<exceptions>
+</exceptions>
+<patterns>
+.a2
+</patterns>
+</hyphenation-info>