From 6d2c40e546b33eb658c31dfc612880af43c6a413 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Thu, 17 Aug 2017 09:02:10 -0400 Subject: [PATCH] Enforce that responses in docs are valid json (#26249) All of the snippets in our docs marked with `// TESTRESPONSE` are checked against the response from Elasticsearch but, due to the way they are implemented, they are actually parsed as YAML instead of JSON. Luckily, all valid JSON is valid YAML! Unfortunately, that means that invalid JSON has snuck into the examples! This adds a step during the build to parse them as JSON and fail the build if they don't parse. But no! It isn't quite that simple. The displayed text of some of these responses looks like: ``` { ... "aggregations": { "range": { "buckets": [ { "to": 1.4436576E12, "to_as_string": "10-2015", "doc_count": 7, "key": "*-10-2015" }, { "from": 1.4436576E12, "from_as_string": "10-2015", "doc_count": 0, "key": "10-2015-*" } ] } } } ``` Note the `...` which isn't valid json but we like it anyway and want it in the output. We use substitution rules to convert the `...` into the response we expect. That yields a response that looks like: ``` { "took": $body.took,"timed_out": false,"_shards": $body._shards,"hits": $body.hits, "aggregations": { "range": { "buckets": [ { "to": 1.4436576E12, "to_as_string": "10-2015", "doc_count": 7, "key": "*-10-2015" }, { "from": 1.4436576E12, "from_as_string": "10-2015", "doc_count": 0, "key": "10-2015-*" } ] } } } ``` That is what the tests consume but it isn't valid JSON! Oh no! We don't want to go update all the substitution rules because that'd be huge and, ultimately, wouldn't buy much. So we quote the `$body.took` bits before parsing the JSON. Note the responses that we use for the `_cat` APIs are all converted into regexes and there is no expectation that they are valid JSON. 
Closes #26233 --- .../gradle/doc/SnippetsTask.groovy | 21 +++++++++++++++++++ docs/plugins/analysis-kuromoji.asciidoc | 1 - .../diversified-sampler-aggregation.asciidoc | 20 +++++++++--------- docs/reference/cat.asciidoc | 8 +++---- docs/reference/cat/indices.asciidoc | 4 ++-- docs/reference/cat/nodeattrs.asciidoc | 2 +- docs/reference/cat/nodes.asciidoc | 2 +- docs/reference/cat/recovery.asciidoc | 6 +++--- docs/reference/cat/shards.asciidoc | 10 ++++----- docs/reference/cat/thread_pool.asciidoc | 2 +- docs/reference/docs/update.asciidoc | 2 +- docs/reference/getting-started.asciidoc | 4 ++-- 12 files changed, 51 insertions(+), 31 deletions(-) diff --git a/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy b/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy index 94af22f4aa2..7132361e163 100644 --- a/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy +++ b/buildSrc/src/main/groovy/org/elasticsearch/gradle/doc/SnippetsTask.groovy @@ -19,6 +19,10 @@ package org.elasticsearch.gradle.doc +import groovy.json.JsonException +import groovy.json.JsonParserType +import groovy.json.JsonSlurper + import org.gradle.api.DefaultTask import org.gradle.api.InvalidUserDataException import org.gradle.api.file.ConfigurableFileTree @@ -117,6 +121,23 @@ public class SnippetsTask extends DefaultTask { + "contain `curl`.") } } + if (snippet.testResponse && snippet.language == 'js') { + String quoted = snippet.contents + // quote values starting with $ + .replaceAll(/([:,])\s*(\$[^ ,\n}]+)/, '$1 "$2"') + // quote fields starting with $ + .replaceAll(/(\$[^ ,\n}]+)\s*:/, '"$1":') + JsonSlurper slurper = + new JsonSlurper(type: JsonParserType.INDEX_OVERLAY) + try { + slurper.parseText(quoted) + } catch (JsonException e) { + throw new InvalidUserDataException("Invalid json " + + "in $snippet. 
The error is:\n${e.message}.\n" + + "After substitutions and munging, the json " + + "looks like:\n$quoted", e) + } + } perSnippet(snippet) snippet = null } diff --git a/docs/plugins/analysis-kuromoji.asciidoc b/docs/plugins/analysis-kuromoji.asciidoc index 7a702295dd9..383df5afb48 100644 --- a/docs/plugins/analysis-kuromoji.asciidoc +++ b/docs/plugins/analysis-kuromoji.asciidoc @@ -160,7 +160,6 @@ The above `analyze` request returns the following: [source,js] -------------------------------------------------- -# Result { "tokens" : [ { "token" : "東京", diff --git a/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc b/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc index cca87d5e166..970a2ffdc1e 100644 --- a/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/diversified-sampler-aggregation.asciidoc @@ -6,16 +6,16 @@ The `diversified_sampler` aggregation adds the ability to limit the number of ma NOTE: Any good market researcher will tell you that when working with samples of data it is important that the sample represents a healthy variety of opinions rather than being skewed by any single voice. -The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, -a large spike in a timeline or an over-active forum spammer). +The same is true with aggregations and sampling with these diversify settings can offer a way to remove the bias in your content (an over-populated geography, +a large spike in a timeline or an over-active forum spammer). 
.Example use cases: * Tightening the focus of analytics to high-relevance matches rather than the potentially very long tail of low-quality matches * Removing bias from analytics by ensuring fair representation of content from different sources * Reducing the running cost of aggregations that can produce useful results using only samples e.g. `significant_terms` - -A choice of `field` or `script` setting is used to provide values used for de-duplication and the `max_docs_per_value` setting controls the maximum + +A choice of `field` or `script` setting is used to provide values used for de-duplication and the `max_docs_per_value` setting controls the maximum number of documents collected on any one shard which share a common value. The default setting for `max_docs_per_value` is 1. The aggregation will throw an error if the choice of `field` or `script` produces multiple values for a single document (de-duplication using multi-valued fields is not supported due to efficiency concerns). @@ -39,7 +39,7 @@ POST /stackoverflow/_search?size=0 "my_unbiased_sample": { "diversified_sampler": { "shard_size": 200, - "field" : "author" + "field" : "author" }, "aggs": { "keywords": { @@ -89,7 +89,7 @@ Response: ==== Scripted example: -In this scenario we might want to diversify on a combination of field values. We can use a `script` to produce a hash of the +In this scenario we might want to diversify on a combination of field values. We can use a `script` to produce a hash of the multiple values in a tags field to ensure we don't have a sample that consists of the same repeated combinations of tags. [source,js] @@ -109,7 +109,7 @@ POST /stackoverflow/_search?size=0 "script" : { "lang": "painless", "source": "doc['tags'].values.hashCode()" - } + } }, "aggs": { "keywords": { @@ -150,7 +150,7 @@ Response: "doc_count": 3, "score": 1.34, "bg_count": 200 - }, + } ] } } @@ -175,11 +175,11 @@ The default setting is "1". 
The optional `execution_hint` setting can influence the management of the values used for de-duplication. Each option will hold up to `shard_size` values in memory while performing de-duplication but the type of value held can be controlled as follows: - + - hold field values directly (`map`) - hold ordinals of the field as determined by the Lucene index (`global_ordinals`) - hold hashes of the field values - with potential for hash collisions (`bytes_hash`) - + The default setting is to use `global_ordinals` if this information is available from the Lucene index and reverting to `map` if not. The `bytes_hash` setting may prove faster in some cases but introduces the possibility of false positives in de-duplication logic due to the possibility of hash collisions. Please note that Elasticsearch will ignore the choice of execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints. diff --git a/docs/reference/cat.asciidoc b/docs/reference/cat.asciidoc index b201a9b1ca8..31e0bf61707 100644 --- a/docs/reference/cat.asciidoc +++ b/docs/reference/cat.asciidoc @@ -35,7 +35,7 @@ GET /_cat/master?v Might respond with: -[source,js] +[source,txt] -------------------------------------------------- id host ip node u_n93zwxThWHi1PDBJAGAg 127.0.0.1 127.0.0.1 u_n93zw @@ -57,7 +57,7 @@ GET /_cat/master?help Might respond respond with: -[source,js] +[source,txt] -------------------------------------------------- id | | node id host | h | host name @@ -81,7 +81,7 @@ GET /_cat/nodes?h=ip,port,heapPercent,name Responds with: -[source,js] +[source,txt] -------------------------------------------------- 127.0.0.1 9300 27 sLBaIGK -------------------------------------------------- @@ -197,7 +197,7 @@ GET _cat/templates?v&s=order:desc,index_patterns returns: -[source,sh] +[source,txt] -------------------------------------------------- name index_patterns order version pizza_pepperoni [*pepperoni*] 2 diff --git 
a/docs/reference/cat/indices.asciidoc b/docs/reference/cat/indices.asciidoc index 0c840071bb9..746d0b4bb58 100644 --- a/docs/reference/cat/indices.asciidoc +++ b/docs/reference/cat/indices.asciidoc @@ -97,7 +97,7 @@ GET /_cat/indices/twitter?pri&v&h=health,index,pri,rep,docs.count,mt Might look like: -[source,js] +[source,txt] -------------------------------------------------- health index pri rep docs.count mt pri.mt yellow twitter 1 1 1200 16 16 @@ -115,7 +115,7 @@ GET /_cat/indices?v&h=i,tm&s=tm:desc Might look like: -[source,js] +[source,txt] -------------------------------------------------- i tm twitter 8.1gb diff --git a/docs/reference/cat/nodeattrs.asciidoc b/docs/reference/cat/nodeattrs.asciidoc index 18feeba8d03..196f142cc35 100644 --- a/docs/reference/cat/nodeattrs.asciidoc +++ b/docs/reference/cat/nodeattrs.asciidoc @@ -49,7 +49,7 @@ GET /_cat/nodeattrs?v&h=name,pid,attr,value Might look like: -[source,js] +[source,txt] -------------------------------------------------- name pid attr value EK_AsJb 19566 testattr test diff --git a/docs/reference/cat/nodes.asciidoc b/docs/reference/cat/nodes.asciidoc index 60e204410c5..74b2d0cc2bc 100644 --- a/docs/reference/cat/nodes.asciidoc +++ b/docs/reference/cat/nodes.asciidoc @@ -53,7 +53,7 @@ GET /_cat/nodes?v&h=id,ip,port,v,m Might look like: -["source","js",subs="attributes,callouts"] +["source","txt",subs="attributes,callouts"] -------------------------------------------------- id ip port v m veJR 127.0.0.1 59938 {version} * diff --git a/docs/reference/cat/recovery.asciidoc b/docs/reference/cat/recovery.asciidoc index 4c981f206c7..c4288f882e2 100644 --- a/docs/reference/cat/recovery.asciidoc +++ b/docs/reference/cat/recovery.asciidoc @@ -21,7 +21,7 @@ GET _cat/recovery?v The response of this request will be something like: -[source,js] +[source,txt] --------------------------------------------------------------------------- index shard time type stage source_host source_node target_host target_node repository 
snapshot files files_recovered files_percent files_total bytes bytes_recovered bytes_percent bytes_total translog_ops translog_ops_recovered translog_ops_percent twitter 0 13ms store done n/a n/a 127.0.0.1 node-0 n/a n/a 0 0 100% 13 0 0 100% 9928 0 0 100.0% @@ -48,7 +48,7 @@ GET _cat/recovery?v&h=i,s,t,ty,st,shost,thost,f,fp,b,bp This will return a line like: -[source,js] +[source,txt] ---------------------------------------------------------------------------- i s t ty st shost thost f fp b bp twitter 0 1252ms peer done 192.168.1.1 192.168.1.2 0 100.0% 0 100.0% @@ -76,7 +76,7 @@ GET _cat/recovery?v&h=i,s,t,ty,st,rep,snap,f,fp,b,bp This will show a recovery of type snapshot in the response -[source,js] +[source,txt] -------------------------------------------------------------------------------- i s t ty st rep snap f fp b bp twitter 0 1978ms snapshot done twitter snap_1 79 8.0% 12086 9.0% diff --git a/docs/reference/cat/shards.asciidoc b/docs/reference/cat/shards.asciidoc index 6d786a315c7..f63e37c6a3d 100644 --- a/docs/reference/cat/shards.asciidoc +++ b/docs/reference/cat/shards.asciidoc @@ -16,7 +16,7 @@ GET _cat/shards This will return -[source,js] +[source,txt] --------------------------------------------------------------------------- twitter 0 p STARTED 3014 31.1mb 192.168.56.10 H5dfFeA --------------------------------------------------------------------------- @@ -42,7 +42,7 @@ GET _cat/shards/twitt* Which will return the following -[source,js] +[source,txt] --------------------------------------------------------------------------- twitter 0 p STARTED 3014 31.1mb 192.168.56.10 H5dfFeA --------------------------------------------------------------------------- @@ -68,7 +68,7 @@ GET _cat/shards A relocating shard will be shown as follows -[source,js] +[source,txt] --------------------------------------------------------------------------- twitter 0 p RELOCATING 3014 31.1mb 192.168.56.10 H5dfFeA -> -> 192.168.56.30 bGG90GE 
--------------------------------------------------------------------------- @@ -90,7 +90,7 @@ GET _cat/shards You can get the initializing state in the response like this -[source,js] +[source,txt] --------------------------------------------------------------------------- twitter 0 p STARTED 3014 31.1mb 192.168.56.10 H5dfFeA twitter 0 r INITIALIZING 0 14.3mb 192.168.56.30 bGG90GE @@ -112,7 +112,7 @@ GET _cat/shards?h=index,shard,prirep,state,unassigned.reason The reason for an unassigned shard will be listed as the last field -[source,js] +[source,txt] --------------------------------------------------------------------------- twitter 0 p STARTED 3014 31.1mb 192.168.56.10 H5dfFeA twitter 0 r STARTED 3014 31.1mb 192.168.56.30 bGG90GE diff --git a/docs/reference/cat/thread_pool.asciidoc b/docs/reference/cat/thread_pool.asciidoc index 721d85e46a0..163a729e51c 100644 --- a/docs/reference/cat/thread_pool.asciidoc +++ b/docs/reference/cat/thread_pool.asciidoc @@ -92,7 +92,7 @@ GET /_cat/thread_pool/generic?v&h=id,name,active,rejected,completed which looks like: -[source,js] +[source,txt] -------------------------------------------------- id name active rejected completed 0EWUhXeBQtaVGlexUeVwMg generic 0 0 70 diff --git a/docs/reference/docs/update.asciidoc b/docs/reference/docs/update.asciidoc index 37091c47e0b..9e3a537e96b 100644 --- a/docs/reference/docs/update.asciidoc +++ b/docs/reference/docs/update.asciidoc @@ -170,7 +170,7 @@ the request was ignored. 
"_type": "type1", "_id": "1", "_version": 6, - "result": noop + "result": "noop" } -------------------------------------------------- // TESTRESPONSE diff --git a/docs/reference/getting-started.asciidoc b/docs/reference/getting-started.asciidoc index 3b34a93a6d6..bb7908a80c7 100755 --- a/docs/reference/getting-started.asciidoc +++ b/docs/reference/getting-started.asciidoc @@ -373,7 +373,7 @@ PUT /customer/doc/1?pretty And the response: -[source,sh] +[source,js] -------------------------------------------------- { "_index" : "customer", @@ -672,7 +672,7 @@ GET /_cat/indices?v And the response: -[source,js] +[source,txt] -------------------------------------------------- health status index uuid pri rep docs.count docs.deleted store.size pri.store.size yellow open bank l7sSYV2cQXmu6_4rJWVIww 5 1 1000 0 128.6kb 128.6kb