From 6d2c40e546b33eb658c31dfc612880af43c6a413 Mon Sep 17 00:00:00 2001
From: Nik Everett <nik9000@gmail.com>
Date: Thu, 17 Aug 2017 09:02:10 -0400
Subject: [PATCH] Enforce that responses in docs are valid json (#26249)

All of the snippets in our docs marked with `// TESTRESPONSE` are
checked against the response from Elasticsearch but, due to the
way they are implemented, they are actually parsed as YAML instead
of JSON. Luckily, all valid JSON is valid YAML! Unfortunately,
that means that invalid JSON has snuck into the examples!

This adds a step during the build to parse them as JSON and fail
the build if they don't parse.

But no! It isn't quite that simple. The displayed text of some of
these responses looks like:
```
{
    ...
    "aggregations": {
        "range": {
            "buckets": [
                {
                    "to": 1.4436576E12,
                    "to_as_string": "10-2015",
                    "doc_count": 7,
                    "key": "*-10-2015"
                },
                {
                    "from": 1.4436576E12,
                    "from_as_string": "10-2015",
                    "doc_count": 0,
                    "key": "10-2015-*"
                }
            ]
        }
    }
}
```

Note the `...`, which isn't valid JSON, but we like it anyway and want
it in the output. We use substitution rules to convert the `...`
into the response we expect. That yields a response that looks like:
```
{
    "took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits,
    "aggregations": {
        "range": {
            "buckets": [
                {
                    "to": 1.4436576E12,
                    "to_as_string": "10-2015",
                    "doc_count": 7,
                    "key": "*-10-2015"
                },
                {
                    "from": 1.4436576E12,
                    "from_as_string": "10-2015",
                    "doc_count": 0,
                    "key": "10-2015-*"
                }
            ]
        }
    }
}
```

That is what the tests consume but it isn't valid JSON! Oh no! We don't
want to go update all the substitution rules because that'd be huge and,
ultimately, wouldn't buy much. So we quote the `$body.took` bits before
parsing the JSON.

Note the responses that we use for the `_cat` APIs are all converted into
regexes and there is no expectation that they are valid JSON.

Closes #26233
---
 .../gradle/doc/SnippetsTask.groovy            | 21 +++++++++++++++++++
 docs/plugins/analysis-kuromoji.asciidoc       |  1 -
 .../diversified-sampler-aggregation.asciidoc  | 20 +++++++++---------
 docs/reference/cat.asciidoc                   |  8 +++----
 docs/reference/cat/indices.asciidoc           |  4 ++--
 docs/reference/cat/nodeattrs.asciidoc         |  2 +-
 docs/reference/cat/nodes.asciidoc             |  2 +-
 docs/reference/cat/recovery.asciidoc          |  6 +++---
 docs/reference/cat/shards.asciidoc            | 10 ++++-----
 docs/reference/cat/thread_pool.asciidoc       |  2 +-
 docs/reference/docs/update.asciidoc           |  2 +-
 docs/reference/getting-started.asciidoc       |  4 ++--
 12 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy b/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy
index 94af22f4aa2..7132361e163 100644
--- a/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy
+++ b/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy
@@ -19,6 +19,10 @@
 
 package org.elasticsearch.gradle.doc
 
+import groovy.json.JsonException
+import groovy.json.JsonParserType
+import groovy.json.JsonSlurper
+
 import org.gradle.api.DefaultTask
 import org.gradle.api.InvalidUserDataException
 import org.gradle.api.file.ConfigurableFileTree
@@ -117,6 +121,23 @@ public class SnippetsTask extends DefaultTask {
                             + "contain `curl`.")
                     }
                 }
+                if (snippet.testResponse && snippet.language == 'js') {
+                    String quoted = snippet.contents
+                        // quote values starting with $
+                        .replaceAll(/([:,])\s*(\$[^ ,\n}]+)/, '$1 "$2"')
+                        // quote fields starting with $
+                        .replaceAll(/(\$[^ ,\n}]+)\s*:/, '"$1":')
+                    JsonSlurper slurper =
+                        new JsonSlurper(type: JsonParserType.INDEX_OVERLAY)
+                    try {
+                        slurper.parseText(quoted)
+                    } catch (JsonException e) {
+                        throw new InvalidUserDataException("Invalid json "
+                            + "in $snippet. The error is:\n${e.message}.\n"
+                            + "After substitutions and munging, the json "
+                            + "looks like:\n$quoted", e)
+                    }
+                }
                 perSnippet(snippet)
                 snippet = null
             }
diff --git a/docs/plugins/analysis-kuromoji.asciidoc b/docs/plugins/analysis-kuromoji.asciidoc
index 7a702295dd9..383df5afb48 100644
--- a/docs/plugins/analysis-kuromoji.asciidoc
+++ b/docs/plugins/analysis-kuromoji.asciidoc
@@ -160,7 +160,6 @@ The above `analyze` request returns the following:
 
 [source,js]
 --------------------------------------------------
-# Result
 {
   "tokens" : [ {
     "token" : "東京",
diff --git a/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc b/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc
index cca87d5e166..970a2ffdc1e 100644
--- a/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc
+++ b/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc
@@ -6,16 +6,16 @@ The `diversified_sampler` aggregation adds the ability to limit the number of ma
 
 NOTE: Any good market researcher will tell you that when working with samples of data it is important
 that the sample represents a healthy variety of opinions rather than being skewed by any single voice.
-The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, 
-a large spike in a timeline or an over-active forum spammer).  
+The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography,
+a large spike in a timeline or an over-active forum spammer).
 
 
 .Example use cases:
 * Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches
 * Removing bias from analytics by ensuring fair representation of content from different sources
 * Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms`
- 
-A choice of `field` or `script` setting is used to provide values used for de-duplication and the `max_docs_per_value` setting controls the maximum 
+
+A choice of `field` or `script` setting is used to provide values used for de-duplication and the `max_docs_per_value` setting controls the maximum
 number of documents collected on any one shard which share a common value. The default setting for `max_docs_per_value` is 1.
 
 The aggregation will throw an error if the choice of `field` or `script` produces multiple values for a single document (de-duplication using multi-valued fields is not supported due to efficiency concerns).
@@ -39,7 +39,7 @@ POST /stackoverflow/_search?size=0
         "my_unbiased_sample": {
             "diversified_sampler": {
                 "shard_size": 200,
-                "field" : "author"   
+                "field" : "author"
             },
             "aggs": {
                 "keywords": {
@@ -89,7 +89,7 @@ Response:
 
 ==== Scripted example:
 
-In this scenario we might want to diversify on a combination of field values. We can use a `script` to produce a hash of the 
+In this scenario we might want to diversify on a combination of field values. We can use a `script` to produce a hash of the
 multiple values in a tags field to ensure we don't have a sample that consists of the same repeated combinations of tags.
 
 [source,js]
@@ -109,7 +109,7 @@ POST /stackoverflow/_search?size=0
                 "script" : {
                     "lang": "painless",
                     "source": "doc['tags'].values.hashCode()"
-                }   
+                }
             },
             "aggs": {
                 "keywords": {
@@ -150,7 +150,7 @@ Response:
                         "doc_count": 3,
                         "score": 1.34,
                         "bg_count": 200
-                    },
+                    }
                 ]
             }
         }
@@ -175,11 +175,11 @@ The default setting is "1".
 
 The optional `execution_hint` setting can influence the management of the values used for de-duplication.
 Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows:
- 
+
  - hold field values directly (`map`)
  - hold ordinals of the field as determined by the Lucene index (`global_ordinals`)
  - hold hashes of the field values - with potential for hash collisions (`bytes_hash`)
- 
+
 The default setting is to use `global_ordinals` if this information is available from the Lucene index and reverting to `map` if not.
 The `bytes_hash` setting may prove faster in some cases but introduces the possibility of false positives in de-duplication logic due to the possibility of hash collisions.
 Please note that Elasticsearch will ignore the choice of execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints.
diff --git a/docs/reference/cat.asciidoc b/docs/reference/cat.asciidoc
index b201a9b1ca8..31e0bf61707 100644
--- a/docs/reference/cat.asciidoc
+++ b/docs/reference/cat.asciidoc
@@ -35,7 +35,7 @@ GET /_cat/master?v
 
 Might respond with:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 id                     host      ip        node
 u_n93zwxThWHi1PDBJAGAg 127.0.0.1 127.0.0.1 u_n93zw
@@ -57,7 +57,7 @@ GET /_cat/master?help
 
 Might respond respond with:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 id   |   | node id
 host | h | host name
@@ -81,7 +81,7 @@ GET /_cat/nodes?h=ip,port,heapPercent,name
 
 Responds with:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 127.0.0.1 9300 27 sLBaIGK
 --------------------------------------------------
@@ -197,7 +197,7 @@ GET _cat/templates?v&s=order:desc,index_patterns
 
 returns:
 
-[source,sh]
+[source,txt]
 --------------------------------------------------
 name                  index_patterns order version
 pizza_pepperoni       [*pepperoni*]  2
diff --git a/docs/reference/cat/indices.asciidoc b/docs/reference/cat/indices.asciidoc
index 0c840071bb9..746d0b4bb58 100644
--- a/docs/reference/cat/indices.asciidoc
+++ b/docs/reference/cat/indices.asciidoc
@@ -97,7 +97,7 @@ GET /_cat/indices/twitter?pri&v&h=health,index,pri,rep,docs.count,mt
 
 Might look like:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 health index   pri rep docs.count mt pri.mt
 yellow twitter   1   1 1200       16     16
@@ -115,7 +115,7 @@ GET /_cat/indices?v&h=i,tm&s=tm:desc
 
 Might look like:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 i         tm
 twitter   8.1gb
diff --git a/docs/reference/cat/nodeattrs.asciidoc b/docs/reference/cat/nodeattrs.asciidoc
index 18feeba8d03..196f142cc35 100644
--- a/docs/reference/cat/nodeattrs.asciidoc
+++ b/docs/reference/cat/nodeattrs.asciidoc
@@ -49,7 +49,7 @@ GET /_cat/nodeattrs?v&h=name,pid,attr,value
 
 Might look like:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 name    pid   attr     value
 EK_AsJb 19566 testattr test
diff --git a/docs/reference/cat/nodes.asciidoc b/docs/reference/cat/nodes.asciidoc
index 60e204410c5..74b2d0cc2bc 100644
--- a/docs/reference/cat/nodes.asciidoc
+++ b/docs/reference/cat/nodes.asciidoc
@@ -53,7 +53,7 @@ GET /_cat/nodes?v&h=id,ip,port,v,m
 
 Might look like:
 
-["source","js",subs="attributes,callouts"]
+["source","txt",subs="attributes,callouts"]
 --------------------------------------------------
 id   ip        port  v         m
 veJR 127.0.0.1 59938 {version} *
diff --git a/docs/reference/cat/recovery.asciidoc b/docs/reference/cat/recovery.asciidoc
index 4c981f206c7..c4288f882e2 100644
--- a/docs/reference/cat/recovery.asciidoc
+++ b/docs/reference/cat/recovery.asciidoc
@@ -21,7 +21,7 @@ GET _cat/recovery?v
 
 The response of this request will be something like:
 
-[source,js]
+[source,txt]
 ---------------------------------------------------------------------------
 index   shard time type  stage source_host source_node target_host target_node repository snapshot files files_recovered files_percent files_total bytes bytes_recovered bytes_percent bytes_total translog_ops translog_ops_recovered translog_ops_percent
 twitter 0     13ms store done  n/a         n/a         127.0.0.1   node-0      n/a        n/a      0     0               100%          13          0     0               100%          9928        0            0                      100.0%
@@ -48,7 +48,7 @@ GET _cat/recovery?v&h=i,s,t,ty,st,shost,thost,f,fp,b,bp
 
 This will return a line like:
 
-[source,js]
+[source,txt]
 ----------------------------------------------------------------------------
 i       s t      ty   st    shost       thost       f     fp      b bp
 twitter 0 1252ms peer done  192.168.1.1 192.168.1.2 0     100.0%  0 100.0%
@@ -76,7 +76,7 @@ GET _cat/recovery?v&h=i,s,t,ty,st,rep,snap,f,fp,b,bp
 
 This will show a recovery of type snapshot in the response
 
-[source,js]
+[source,txt]
 --------------------------------------------------------------------------------
 i       s t      ty       st    rep     snap   f  fp   b     bp
 twitter 0 1978ms snapshot done  twitter snap_1 79 8.0% 12086 9.0%
diff --git a/docs/reference/cat/shards.asciidoc b/docs/reference/cat/shards.asciidoc
index 6d786a315c7..f63e37c6a3d 100644
--- a/docs/reference/cat/shards.asciidoc
+++ b/docs/reference/cat/shards.asciidoc
@@ -16,7 +16,7 @@ GET _cat/shards
 
 This will return
 
-[source,js]
+[source,txt]
 ---------------------------------------------------------------------------
 twitter 0 p STARTED 3014 31.1mb 192.168.56.10 H5dfFeA
 ---------------------------------------------------------------------------
@@ -42,7 +42,7 @@ GET _cat/shards/twitt*
 
 Which will return the following
 
-[source,js]
+[source,txt]
 ---------------------------------------------------------------------------
 twitter 0 p STARTED 3014 31.1mb 192.168.56.10 H5dfFeA
 ---------------------------------------------------------------------------
@@ -68,7 +68,7 @@ GET _cat/shards
 
 A relocating shard will be shown as follows
 
-[source,js]
+[source,txt]
 ---------------------------------------------------------------------------
 twitter 0 p RELOCATING 3014 31.1mb 192.168.56.10 H5dfFeA -> -> 192.168.56.30 bGG90GE
 ---------------------------------------------------------------------------
@@ -90,7 +90,7 @@ GET _cat/shards
 
 You can get the initializing state in the response like this
 
-[source,js]
+[source,txt]
 ---------------------------------------------------------------------------
 twitter 0 p STARTED      3014 31.1mb 192.168.56.10 H5dfFeA
 twitter 0 r INITIALIZING    0 14.3mb 192.168.56.30 bGG90GE
@@ -112,7 +112,7 @@ GET _cat/shards?h=index,shard,prirep,state,unassigned.reason
 
 The reason for an unassigned shard will be listed as the last field
 
-[source,js]
+[source,txt]
 ---------------------------------------------------------------------------
 twitter 0 p STARTED    3014 31.1mb 192.168.56.10 H5dfFeA
 twitter 0 r STARTED    3014 31.1mb 192.168.56.30 bGG90GE
diff --git a/docs/reference/cat/thread_pool.asciidoc b/docs/reference/cat/thread_pool.asciidoc
index 721d85e46a0..163a729e51c 100644
--- a/docs/reference/cat/thread_pool.asciidoc
+++ b/docs/reference/cat/thread_pool.asciidoc
@@ -92,7 +92,7 @@ GET /_cat/thread_pool/generic?v&h=id,name,active,rejected,completed
 
 which looks like:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 id                     name    active rejected completed
 0EWUhXeBQtaVGlexUeVwMg generic      0        0        70
diff --git a/docs/reference/docs/update.asciidoc b/docs/reference/docs/update.asciidoc
index 37091c47e0b..9e3a537e96b 100644
--- a/docs/reference/docs/update.asciidoc
+++ b/docs/reference/docs/update.asciidoc
@@ -170,7 +170,7 @@ the request was ignored.
    "_type": "type1",
    "_id": "1",
    "_version": 6,
-   "result": noop
+   "result": "noop"
 }
 --------------------------------------------------
 // TESTRESPONSE
diff --git a/docs/reference/getting-started.asciidoc b/docs/reference/getting-started.asciidoc
index 3b34a93a6d6..bb7908a80c7 100755
--- a/docs/reference/getting-started.asciidoc
+++ b/docs/reference/getting-started.asciidoc
@@ -373,7 +373,7 @@ PUT /customer/doc/1?pretty
 
 And the response:
 
-[source,sh]
+[source,js]
 --------------------------------------------------
 {
   "_index" : "customer",
@@ -672,7 +672,7 @@ GET /_cat/indices?v
 
 And the response:
 
-[source,js]
+[source,txt]
 --------------------------------------------------
 health status index uuid                   pri rep docs.count docs.deleted store.size pri.store.size
 yellow open   bank  l7sSYV2cQXmu6_4rJWVIww   5   1       1000            0    128.6kb        128.6kb