CONSOLEify analysis docs

Converts the analysis docs that were marked as json into `CONSOLE`
format. A few of them were in yaml but marked as json for historical
reasons. I added more complete examples for a few of the less
obvious-sounding ones.
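
For context, "CONSOLE" format means each snippet is a complete, copy-pastable request tagged with `// CONSOLE`, roughly like this illustrative sketch (the index name here is made up, not taken from any of the files below):

[source,js]
--------------------------------------------------
PUT /my_example
{
  "settings": {
    "number_of_shards": 1
  }
}
--------------------------------------------------
// CONSOLE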

Relates to #18160
Nik Everett 2017-04-02 11:15:26 -04:00
parent 01b807f98e
commit ad69503dce
15 changed files with 439 additions and 158 deletions

docs/build.gradle

@@ -53,18 +53,6 @@ buildRestTests.expectedUnconvertedCandidates = [
  'reference/aggregations/pipeline/serial-diff-aggregation.asciidoc',
  'reference/aggregations/pipeline/stats-bucket-aggregation.asciidoc',
  'reference/aggregations/pipeline/sum-bucket-aggregation.asciidoc',
-  'reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/elision-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc',
-  'reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc',
  'reference/cat/snapshots.asciidoc',
  'reference/cat/templates.asciidoc',
  'reference/cat/thread_pool.asciidoc',
@@ -124,10 +112,14 @@ integTestCluster {
  configFile 'scripts/my_map_script.painless'
  configFile 'scripts/my_combine_script.painless'
  configFile 'scripts/my_reduce_script.painless'
+  configFile 'analysis/example_word_list.txt'
+  configFile 'analysis/hyphenation_patterns.xml'
  configFile 'analysis/synonym.txt'
  configFile 'analysis/stemmer_override.txt'
  configFile 'userdict_ja.txt'
  configFile 'KeywordTokenizer.rbbi'
+  extraConfigFile 'hunspell/en_US/en_US.aff', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff'
+  extraConfigFile 'hunspell/en_US/en_US.dic', '../core/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic'
  // Whitelist reindexing from the local node so we can test it.
  setting 'reindex.remote.whitelist', '127.0.0.1:*'
}

docs/reference/analysis/tokenfilters/asciifolding-tokenfilter.asciidoc

@@ -8,17 +8,21 @@ equivalents, if one exists. Example:
[source,js]
--------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "asciifolding"]
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "asciifolding"]
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
Accepts `preserve_original` setting which defaults to false but if true
will keep the original token as well as emit the folded token. For
@@ -26,20 +30,24 @@ example:
[source,js]
--------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "my_ascii_folding"]
-            }
-        },
-        "filter" : {
-            "my_ascii_folding" : {
-                "type" : "asciifolding",
-                "preserve_original" : true
+PUT /asciifold_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "my_ascii_folding"]
+                }
+            },
+            "filter" : {
+                "my_ascii_folding" : {
+                    "type" : "asciifolding",
+                    "preserve_original" : true
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
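
Not part of this commit, but as a sketch of how the converted snippet can be exercised: with the `asciifold_example` index above in place, the `_analyze` API shows the folding (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /asciifold_example/_analyze
{
  "analyzer" : "default",
  "text" : "façade café"
}
--------------------------------------------------

With the plain `asciifolding` filter this should come back as `facade` and `cafe`; with `my_ascii_folding` and `preserve_original` enabled, the accented originals should be emitted as well.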

docs/reference/analysis/tokenfilters/cjk-bigram-tokenfilter.asciidoc

@@ -16,8 +16,9 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
[source,js]
--------------------------------------------------
+PUT /cjk_bigram_example
{
-    "index" : {
+    "settings" : {
        "analysis" : {
            "analyzer" : {
                "han_bigrams" : {
@@ -40,3 +41,4 @@ Bigrams are generated for characters in `han`, `hiragana`, `katakana` and
    }
}
--------------------------------------------------
+// CONSOLE
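
Not part of this commit: a sketch of exercising the `han_bigrams` analyzer defined above via `_analyze` (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /cjk_bigram_example/_analyze
{
  "analyzer" : "han_bigrams",
  "text" : "東京都"
}
--------------------------------------------------

For Han input like this, the filter should emit overlapping bigrams such as `東京` and `京都`.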

docs/reference/analysis/tokenfilters/common-grams-tokenfilter.asciidoc

@@ -41,21 +41,33 @@ Here is an example:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            index_grams :
-                tokenizer : whitespace
-                filter : [common_grams]
-            search_grams :
-                tokenizer : whitespace
-                filter : [common_grams_query]
-        filter :
-            common_grams :
-                type : common_grams
-                common_words: [a, an, the]
-            common_grams_query :
-                type : common_grams
-                query_mode: true
-                common_words: [a, an, the]
+PUT /common_grams_example
+{
+    "settings": {
+        "analysis": {
+            "my_analyzer": {
+                "index_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams"]
+                },
+                "search_grams": {
+                    "tokenizer": "whitespace",
+                    "filter": ["common_grams_query"]
+                }
+            },
+            "filter": {
+                "common_grams": {
+                    "type": "common_grams",
+                    "common_words": ["a", "an", "the"]
+                },
+                "common_grams_query": {
+                    "type": "common_grams",
+                    "query_mode": true,
+                    "common_words": ["a", "an", "the"]
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
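
Not part of this commit: assuming the `index_grams` analyzer above is registered under the index's `analyzer` settings, `_analyze` shows the effect of the common-word bigrams (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /common_grams_example/_analyze
{
  "analyzer" : "index_grams",
  "text" : "the quick brown fox"
}
--------------------------------------------------

Alongside the single terms this should produce a `the_quick` bigram; the `search_grams` analyzer, with `query_mode` enabled, should keep the bigram but drop the bare common word.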

docs/reference/analysis/tokenfilters/compound-word-tokenfilter.asciidoc

@@ -1,5 +1,5 @@
[[analysis-compound-word-tokenfilter]]
-=== Compound Word Token Filter
+=== Compound Word Token Filters
The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
decompose compound words found in many German languages into word parts.
@@ -84,20 +84,31 @@ Here is an example:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : standard
-                filter : [myTokenFilter1, myTokenFilter2]
-        filter :
-            myTokenFilter1 :
-                type : dictionary_decompounder
-                word_list: [one, two, three]
-            myTokenFilter2 :
-                type : hyphenation_decompounder
-                word_list_path: path/to/words.txt
-                hyphenation_patterns_path: path/to/fop.xml
-                max_subword_size : 22
+PUT /compound_word_example
+{
+    "index": {
+        "analysis": {
+            "analyzer": {
+                "my_analyzer": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["dictionary_decompounder", "hyphenation_decompounder"]
+                }
+            },
+            "filter": {
+                "dictionary_decompounder": {
+                    "type": "dictionary_decompounder",
+                    "word_list": ["one", "two", "three"]
+                },
+                "hyphenation_decompounder": {
+                    "type" : "hyphenation_decompounder",
+                    "word_list_path": "analysis/example_word_list.txt",
+                    "hyphenation_patterns_path": "analysis/hyphenation_patterns.xml",
+                    "max_subword_size": 22
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
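
Not part of this commit: a sketch of exercising the `my_analyzer` chain above; the sample token is an assumption, and the exact output depends on the word list and hyphenation patterns configured:

[source,js]
--------------------------------------------------
POST /compound_word_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "onetwothree"
}
--------------------------------------------------

The `dictionary_decompounder` should keep the original token and add the sub-words it finds in its `word_list` (`one`, `two`, `three`).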

docs/reference/analysis/tokenfilters/elision-tokenfilter.asciidoc

@@ -9,20 +9,24 @@ example:
[source,js]
--------------------------------------------------
-"index" : {
-    "analysis" : {
-        "analyzer" : {
-            "default" : {
-                "tokenizer" : "standard",
-                "filter" : ["standard", "elision"]
-            }
-        },
-        "filter" : {
-            "elision" : {
-                "type" : "elision",
-                "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+PUT /elision_example
+{
+    "settings" : {
+        "analysis" : {
+            "analyzer" : {
+                "default" : {
+                    "tokenizer" : "standard",
+                    "filter" : ["standard", "elision"]
+                }
+            },
+            "filter" : {
+                "elision" : {
+                    "type" : "elision",
+                    "articles" : ["l", "m", "t", "qu", "n", "s", "j"]
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
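
Not part of this commit: a quick `_analyze` sketch against the `elision_example` index above (the French sample text is an assumption):

[source,js]
--------------------------------------------------
POST /elision_example/_analyze
{
  "analyzer" : "default",
  "text" : "l'avion"
}
--------------------------------------------------

Because `l` is in the `articles` list, the elided article should be stripped, leaving just `avion`.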

docs/reference/analysis/tokenfilters/hunspell-tokenfilter.asciidoc

@@ -10,7 +10,7 @@ one or more `*.dic` files (all of which will automatically be picked up).
For example, assuming the default hunspell location is used, the
following directory layout will define the `en_US` dictionary:
-[source,js]
+[source,txt]
--------------------------------------------------
- conf
    |-- hunspell
@@ -42,24 +42,28 @@ settings:
[source,js]
--------------------------------------------------
+PUT /hunspell_example
{
-    "analysis" : {
-        "analyzer" : {
-            "en" : {
-                "tokenizer" : "standard",
-                "filter" : [ "lowercase", "en_US" ]
-            }
-        },
-        "filter" : {
-            "en_US" : {
-                "type" : "hunspell",
-                "locale" : "en_US",
-                "dedup" : true
+    "settings": {
+        "analysis" : {
+            "analyzer" : {
+                "en" : {
+                    "tokenizer" : "standard",
+                    "filter" : [ "lowercase", "en_US" ]
+                }
+            },
+            "filter" : {
+                "en_US" : {
+                    "type" : "hunspell",
+                    "locale" : "en_US",
+                    "dedup" : true
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
The hunspell token filter accepts four options:
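
Not part of this commit: once the `en_US` dictionary is installed (which is what the new `extraConfigFile` lines in `docs/build.gradle` arrange for the docs test cluster), the `en` analyzer above could be exercised like this; the sample text is an assumption:

[source,js]
--------------------------------------------------
POST /hunspell_example/_analyze
{
  "analyzer" : "en",
  "text" : "foxes"
}
--------------------------------------------------

Hunspell stemming against the `en_US` dictionary should reduce `foxes` to `fox`.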

docs/reference/analysis/tokenfilters/keep-types-tokenfilter.asciidoc

@@ -1,7 +1,7 @@
[[analysis-keep-types-tokenfilter]]
=== Keep Types Token Filter
A token filter of type `keep_types` that only keeps tokens with a token type
contained in a predefined set.
@@ -14,24 +14,61 @@ types:: a list of types to keep
[float]
=== Settings example
+You can set it up like:
[source,js]
--------------------------------------------------
+PUT /keep_types_example
{
-    "index" : {
+    "settings" : {
        "analysis" : {
            "analyzer" : {
                "my_analyzer" : {
                    "tokenizer" : "standard",
                    "filter" : ["standard", "lowercase", "extract_numbers"]
-                },
+                }
            },
            "filter" : {
                "extract_numbers" : {
                    "type" : "keep_types",
                    "types" : [ "<NUM>" ]
-                },
+                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
And test it like:
[source,js]
--------------------------------------------------
POST /keep_types_example/_analyze
{
"analyzer" : "my_analyzer",
"text" : "this is just 1 a test"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "1",
"start_offset": 13,
"end_offset": 14,
"type": "<NUM>",
"position": 3
}
]
}
--------------------------------------------------
// TESTRESPONSE
Note how only the `<NUM>` token is in the output.

docs/reference/analysis/tokenfilters/keep-words-tokenfilter.asciidoc

@@ -20,17 +20,18 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
[source,js]
--------------------------------------------------
+PUT /keep_words_example
{
-    "index" : {
+    "settings" : {
        "analysis" : {
            "analyzer" : {
-                "my_analyzer" : {
+                "example_1" : {
                    "tokenizer" : "standard",
                    "filter" : ["standard", "lowercase", "words_till_three"]
                },
-                "my_analyzer1" : {
+                "example_2" : {
                    "tokenizer" : "standard",
-                    "filter" : ["standard", "lowercase", "words_on_file"]
+                    "filter" : ["standard", "lowercase", "words_in_file"]
                }
            },
            "filter" : {
@@ -38,12 +39,13 @@ keep_words_case:: a boolean indicating whether to lower case the words (defaults
                    "type" : "keep",
                    "keep_words" : [ "one", "two", "three"]
                },
-                "words_on_file" : {
+                "words_in_file" : {
                    "type" : "keep",
-                    "keep_words_path" : "/path/to/word/file"
+                    "keep_words_path" : "analysis/example_word_list.txt"
                }
            }
        }
    }
}
--------------------------------------------------
+// CONSOLE
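
Not part of this commit: a sketch of checking the `example_1` analyzer above with `_analyze` (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /keep_words_example/_analyze
{
  "analyzer" : "example_1",
  "text" : "one four two five three"
}
--------------------------------------------------

Only `one`, `two`, and `three` should survive, since the other tokens are not in `keep_words`.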

docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc

@@ -19,19 +19,124 @@ in the text.
`false`.
|=======================================================================
-Here is an example:
+You can configure it like:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, protwords, porter_stem]
-        filter :
-            protwords :
-                type : keyword_marker
-                keywords_path : analysis/protwords.txt
+PUT /keyword_marker_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "protect_cats": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "protect_cats", "porter_stem"]
+                },
+                "normal": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "porter_stem"]
+                }
+            },
+            "filter": {
+                "protect_cats": {
+                    "type": "keyword_marker",
+                    "keywords": ["cats"]
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
And test it with:
[source,js]
--------------------------------------------------
POST /keyword_marker_example/_analyze
{
"analyzer" : "protect_cats",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cats",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
As compared to the `normal` analyzer which has `cats` stemmed to `cat`:
[source,js]
--------------------------------------------------
POST /keyword_marker_example/_analyze
{
"analyzer" : "normal",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
Response:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cat",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE

docs/reference/analysis/tokenfilters/keyword-repeat-tokenfilter.asciidoc

@@ -9,18 +9,85 @@ subsequent stemmer will be indexed twice. Therefore, consider adding a
`unique` filter with `only_on_same_position` set to `true` to drop
unnecessary duplicates.
-Here is an example:
+Here is an example of using the `keyword_repeat` token filter to
+preserve both the stemmed and unstemmed version of tokens:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, keyword_repeat, porter_stem, unique_stem]
-        unique_stem:
-            type: unique
-            only_on_same_position : true
+PUT /keyword_repeat_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "stemmed_and_unstemmed": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "keyword_repeat", "porter_stem", "unique_stem"]
+                }
+            },
+            "filter": {
+                "unique_stem": {
+                    "type": "unique",
+                    "only_on_same_position": true
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
And you can test it with:
[source,js]
--------------------------------------------------
POST /keyword_repeat_example/_analyze
{
"analyzer" : "stemmed_and_unstemmed",
"text" : "I like cats"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]
And it'd respond:
[source,js]
--------------------------------------------------
{
"tokens": [
{
"token": "i",
"start_offset": 0,
"end_offset": 1,
"type": "<ALPHANUM>",
"position": 0
},
{
"token": "like",
"start_offset": 2,
"end_offset": 6,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "cats",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "cat",
"start_offset": 7,
"end_offset": 11,
"type": "<ALPHANUM>",
"position": 2
}
]
}
--------------------------------------------------
// TESTRESPONSE
Which preserves both the `cat` and `cats` tokens. Compare this to the example
on the <<analysis-keyword-marker-tokenfilter>>.

docs/reference/analysis/tokenfilters/limit-token-count-tokenfilter.asciidoc

@@ -18,15 +18,25 @@ Here is an example:
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer :
-                type : custom
-                tokenizer : standard
-                filter : [lowercase, five_token_limit]
-        filter :
-            five_token_limit :
-                type : limit
-                max_token_count : 5
+PUT /limit_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "limit_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase", "five_token_limit"]
+                }
+            },
+            "filter": {
+                "five_token_limit": {
+                    "type": "limit",
+                    "max_token_count": 5
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
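
Not part of this commit: a sketch of verifying the `limit_example` analyzer above (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /limit_example/_analyze
{
  "analyzer" : "limit_example",
  "text" : "one two three four five six seven"
}
--------------------------------------------------

With `max_token_count` set to 5, only the first five tokens (`one` through `five`) should come back.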

docs/reference/analysis/tokenfilters/lowercase-tokenfilter.asciidoc

@@ -10,28 +10,30 @@ custom analyzer
[source,js]
--------------------------------------------------
-index :
-    analysis :
-        analyzer :
-            myAnalyzer2 :
-                type : custom
-                tokenizer : myTokenizer1
-                filter : [myTokenFilter1, myGreekLowerCaseFilter]
-                char_filter : [my_html]
-        tokenizer :
-            myTokenizer1 :
-                type : standard
-                max_token_length : 900
-        filter :
-            myTokenFilter1 :
-                type : stop
-                stopwords : [stop1, stop2, stop3, stop4]
-            myGreekLowerCaseFilter :
-                type : lowercase
-                language : greek
-        char_filter :
-            my_html :
-                type : html_strip
-                escaped_tags : [xxx, yyy]
-                read_ahead : 1024
+PUT /lowercase_example
+{
+    "settings": {
+        "analysis": {
+            "analyzer": {
+                "standard_lowercase_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["lowercase"]
+                },
+                "greek_lowercase_example": {
+                    "type": "custom",
+                    "tokenizer": "standard",
+                    "filter": ["greek_lowercase"]
+                }
+            },
+            "filter": {
+                "greek_lowercase": {
+                    "type": "lowercase",
+                    "language": "greek"
+                }
+            }
+        }
+    }
+}
--------------------------------------------------
+// CONSOLE
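
Not part of this commit: a sketch of exercising the Greek variant above (the sample text is an assumption):

[source,js]
--------------------------------------------------
POST /lowercase_example/_analyze
{
  "analyzer" : "greek_lowercase_example",
  "text" : "ΟΔΥΣΣΕΑΣ"
}
--------------------------------------------------

The output should be lowercased with Greek-specific normalization (for example around the final sigma) that the plain `lowercase` filter does not apply.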

analysis/example_word_list.txt (new file)

@@ -0,0 +1,4 @@
+test
+list
+of
+words

analysis/hyphenation_patterns.xml (new file)

@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE hyphenation-info SYSTEM "hyphenation.dtd">
+<!-- Example hyphenation patterns file. -->
+<hyphenation-info>
+<hyphen-char value="-"/>
+<hyphen-min before="2" after="2"/>
+<classes>
+aA
+</classes>
+<exceptions>
+</exceptions>
+<patterns>
+.a2
+</patterns>
+</hyphenation-info>