Docs: Test examples that recreate lang analyzers (#29535)

We have a pile of documentation describing how to rebuild the built-in
language analyzers and, previously, our documentation testing framework
made sure that the examples successfully built *an* analyzer, but it
didn't assert that the analyzer built by the documentation matches the
built-in analyzer. Unsurprisingly, some of the examples aren't quite
right.

This adds a mechanism that tests that the analyzers built by the docs
match the built-in analyzers. The mechanism is fairly simple and brutal
but it seems to be working: build a hundred random unicode sequences,
send them through the `_analyze` API with the rebuilt analyzer and then
again through the built-in analyzer, and make sure both calls return
the same results. Each of these calls to `_analyze` takes about 20ms on
my laptop, which seems fine.
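
To make the mechanism concrete, here is a minimal sketch of the token-by-token
comparison, written outside the test framework; the class and method names are
illustrative, and the real implementation is the `CompareAnalyzers` section in
the diff below. Each map stands for one token object from an `_analyze` response.

```
import java.util.Iterator;
import java.util.List;
import java.util.Map;

final class AnalyzerComparison {
    /**
     * Fail if the two token streams differ. Each map is one token from an
     * _analyze response, with keys like "token", "start_offset",
     * "end_offset", and "type".
     */
    static void assertSameTokens(List<Map<String, Object>> first, List<Map<String, Object>> second) {
        Iterator<Map<String, Object>> firstItr = first.iterator();
        Iterator<Map<String, Object>> secondItr = second.iterator();
        while (firstItr.hasNext() && secondItr.hasNext()) {
            Map<String, Object> f = firstItr.next();
            Map<String, Object> s = secondItr.next();
            // Compare the whole token map so offset or type drift is caught,
            // not just differences in the token text.
            if (false == f.equals(s)) {
                throw new AssertionError("tokens differ: " + f + " vs " + s);
            }
        }
        // If one stream still has tokens, the analyzers disagree on length.
        if (firstItr.hasNext() || secondItr.hasNext()) {
            throw new AssertionError("analyzers produced different numbers of tokens");
        }
    }
}
```

Comparing the whole token map rather than just the text catches differences in
offsets and token types as well.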
Nik Everett 2018-05-09 09:23:10 -04:00 committed by GitHub
parent 2228e6e663
commit f9dc86836d
9 changed files with 344 additions and 58 deletions

@ -141,9 +141,11 @@ public class RestTestsFromSnippetsTask extends SnippetsTask {
private static final String SYNTAX = {
String method = /(?<method>GET|PUT|POST|HEAD|OPTIONS|DELETE)/
String pathAndQuery = /(?<pathAndQuery>[^\n]+)/
String badBody = /GET|PUT|POST|HEAD|OPTIONS|DELETE|#/
String badBody = /GET|PUT|POST|HEAD|OPTIONS|DELETE|startyaml|#/
String body = /(?<body>(?:\n(?!$badBody)[^\n]+)+)/
String nonComment = /$method\s+$pathAndQuery$body?/
String rawRequest = /(?:$method\s+$pathAndQuery$body?)/
String yamlRequest = /(?:startyaml(?s)(?<yaml>.+?)(?-s)endyaml)/
String nonComment = /(?:$rawRequest|$yamlRequest)/
String comment = /(?<comment>#.+)/
/(?:$comment|$nonComment)\n+/
}()
@ -333,6 +335,11 @@ public class RestTestsFromSnippetsTask extends SnippetsTask {
// Comment
return
}
String yamlRequest = matcher.group("yaml");
if (yamlRequest != null) {
current.println(yamlRequest)
return
}
String method = matcher.group("method")
String pathAndQuery = matcher.group("pathAndQuery")
String body = matcher.group("body")

@ -68,6 +68,23 @@ for its modifiers:
but rather than the setup defined in `docs/build.gradle` the setup is defined
right in the documentation file.
In addition to the standard CONSOLE syntax, these snippets can contain blocks
of YAML surrounded by markers like this:
```
startyaml
- compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}
endyaml
```
This allows slightly more expressive testing of the snippets. Since that syntax
is not supported by CONSOLE, the usual way to incorporate it is with a
`// TEST[s//]` marker like this:
```
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\nendyaml\n/]
```
Anywhere you can use JSON you can use elements like `$body.path.to.thing`,
which are replaced on the fly with the contents of the thing at `path.to.thing`
in the last response.
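For example, a hypothetical response assertion can use this to normalize a
field that varies between runs:
```
// TESTRESPONSE[s/"took": 5/"took": $body.took/]
```
Here the literal `took` value in the example response is replaced with whatever
the last response actually returned, so the test passes regardless of timing.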

@ -60,6 +60,8 @@ buildRestTests.docs = fileTree(projectDir) {
exclude 'build.gradle'
// That is where the snippets go, not where they come from!
exclude 'build'
// Just syntax examples
exclude 'README.asciidoc'
}
Closure setupTwitter = { String name, int count ->

@ -97,10 +97,11 @@ PUT /arabic_example
}
},
"analyzer": {
"arabic": {
"rebuilt_arabic": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"arabic_stop",
"arabic_normalization",
"arabic_keywords",
@ -113,6 +114,8 @@ PUT /arabic_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"arabic_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: arabic_example, first: arabic, second: rebuilt_arabic}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -144,7 +147,7 @@ PUT /armenian_example
}
},
"analyzer": {
"armenian": {
"rebuilt_armenian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -159,6 +162,8 @@ PUT /armenian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"armenian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: armenian_example, first: armenian, second: rebuilt_armenian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -190,7 +195,7 @@ PUT /basque_example
}
},
"analyzer": {
"basque": {
"rebuilt_basque": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -205,6 +210,8 @@ PUT /basque_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"basque_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: basque_example, first: basque, second: rebuilt_basque}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -236,14 +243,15 @@ PUT /bengali_example
}
},
"analyzer": {
"bengali": {
"rebuilt_bengali": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"bengali_keywords",
"indic_normalization",
"bengali_normalization",
"bengali_stop",
"bengali_keywords",
"bengali_stemmer"
]
}
@ -253,6 +261,8 @@ PUT /bengali_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"bengali_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: bengali_example, first: bengali, second: rebuilt_bengali}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -284,7 +294,7 @@ PUT /brazilian_example
}
},
"analyzer": {
"brazilian": {
"rebuilt_brazilian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -299,6 +309,8 @@ PUT /brazilian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"brazilian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: brazilian_example, first: brazilian, second: rebuilt_brazilian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -330,7 +342,7 @@ PUT /bulgarian_example
}
},
"analyzer": {
"bulgarian": {
"rebuilt_bulgarian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -345,6 +357,8 @@ PUT /bulgarian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"bulgarian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: bulgarian_example, first: bulgarian, second: rebuilt_bulgarian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -380,7 +394,7 @@ PUT /catalan_example
}
},
"analyzer": {
"catalan": {
"rebuilt_catalan": {
"tokenizer": "standard",
"filter": [
"catalan_elision",
@ -396,6 +410,8 @@ PUT /catalan_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"catalan_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: catalan_example, first: catalan, second: rebuilt_catalan}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -415,11 +431,17 @@ PUT /cjk_example
"filter": {
"english_stop": {
"type": "stop",
"stopwords": "_english_" <1>
"stopwords": [ <1>
"a", "and", "are", "as", "at", "be", "but", "by", "for",
"if", "in", "into", "is", "it", "no", "not", "of", "on",
"or", "s", "such", "t", "that", "the", "their", "then",
"there", "these", "they", "this", "to", "was", "will",
"with", "www"
]
}
},
"analyzer": {
"cjk": {
"rebuilt_cjk": {
"tokenizer": "standard",
"filter": [
"cjk_width",
@ -434,8 +456,12 @@ PUT /cjk_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"cjk_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: cjk_example, first: cjk, second: rebuilt_cjk}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
or `stopwords_path` parameters. The default stop words are
*almost* the same as the `_english_` set, but not exactly
the same.
[[czech-analyzer]]
===== `czech` analyzer
@ -463,7 +489,7 @@ PUT /czech_example
}
},
"analyzer": {
"czech": {
"rebuilt_czech": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -478,6 +504,8 @@ PUT /czech_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"czech_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: czech_example, first: czech, second: rebuilt_czech}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -509,7 +537,7 @@ PUT /danish_example
}
},
"analyzer": {
"danish": {
"rebuilt_danish": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -524,6 +552,8 @@ PUT /danish_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"danish_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: danish_example, first: danish, second: rebuilt_danish}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -564,7 +594,7 @@ PUT /dutch_example
}
},
"analyzer": {
"dutch": {
"rebuilt_dutch": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -580,6 +610,8 @@ PUT /dutch_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"dutch_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: dutch_example, first: dutch, second: rebuilt_dutch}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -615,7 +647,7 @@ PUT /english_example
}
},
"analyzer": {
"english": {
"rebuilt_english": {
"tokenizer": "standard",
"filter": [
"english_possessive_stemmer",
@ -631,6 +663,8 @@ PUT /english_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"english_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: english_example, first: english, second: rebuilt_english}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -662,7 +696,7 @@ PUT /finnish_example
}
},
"analyzer": {
"finnish": {
"rebuilt_finnish": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -677,6 +711,8 @@ PUT /finnish_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"finnish_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: finnish_example, first: finnish, second: rebuilt_finnish}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -717,7 +753,7 @@ PUT /french_example
}
},
"analyzer": {
"french": {
"rebuilt_french": {
"tokenizer": "standard",
"filter": [
"french_elision",
@ -733,6 +769,8 @@ PUT /french_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"french_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: french_example, first: french, second: rebuilt_french}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -764,7 +802,7 @@ PUT /galician_example
}
},
"analyzer": {
"galician": {
"rebuilt_galician": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -779,6 +817,8 @@ PUT /galician_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"galician_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: galician_example, first: galician, second: rebuilt_galician}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -810,7 +850,7 @@ PUT /german_example
}
},
"analyzer": {
"german": {
"rebuilt_german": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -826,6 +866,8 @@ PUT /german_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"german_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: german_example, first: german, second: rebuilt_german}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -861,7 +903,7 @@ PUT /greek_example
}
},
"analyzer": {
"greek": {
"rebuilt_greek": {
"tokenizer": "standard",
"filter": [
"greek_lowercase",
@ -876,6 +918,8 @@ PUT /greek_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"greek_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: greek_example, first: greek, second: rebuilt_greek}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -907,14 +951,15 @@ PUT /hindi_example
}
},
"analyzer": {
"hindi": {
"rebuilt_hindi": {
"tokenizer": "standard",
"filter": [
"lowercase",
"decimal_digit",
"hindi_keywords",
"indic_normalization",
"hindi_normalization",
"hindi_stop",
"hindi_keywords",
"hindi_stemmer"
]
}
@ -924,6 +969,8 @@ PUT /hindi_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"hindi_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: hindi_example, first: hindi, second: rebuilt_hindi}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -955,7 +1002,7 @@ PUT /hungarian_example
}
},
"analyzer": {
"hungarian": {
"rebuilt_hungarian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -970,6 +1017,8 @@ PUT /hungarian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"hungarian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: hungarian_example, first: hungarian, second: rebuilt_hungarian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1002,7 +1051,7 @@ PUT /indonesian_example
}
},
"analyzer": {
"indonesian": {
"rebuilt_indonesian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1017,6 +1066,8 @@ PUT /indonesian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"indonesian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: indonesian_example, first: indonesian, second: rebuilt_indonesian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1034,9 +1085,15 @@ PUT /irish_example
"settings": {
"analysis": {
"filter": {
"irish_hyphenation": {
"type": "stop",
"stopwords": [ "h", "n", "t" ],
"ignore_case": true
},
"irish_elision": {
"type": "elision",
"articles": [ "h", "n", "t" ]
"articles": [ "d", "m", "b" ],
"articles_case": true
},
"irish_stop": {
"type": "stop",
@ -1056,12 +1113,13 @@ PUT /irish_example
}
},
"analyzer": {
"irish": {
"rebuilt_irish": {
"tokenizer": "standard",
"filter": [
"irish_stop",
"irish_hyphenation",
"irish_elision",
"irish_lowercase",
"irish_stop",
"irish_keywords",
"irish_stemmer"
]
@ -1072,6 +1130,8 @@ PUT /irish_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"irish_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: irish_example, first: irish, second: rebuilt_irish}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1112,7 +1172,7 @@ PUT /italian_example
}
},
"analyzer": {
"italian": {
"rebuilt_italian": {
"tokenizer": "standard",
"filter": [
"italian_elision",
@ -1128,6 +1188,8 @@ PUT /italian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"italian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: italian_example, first: italian, second: rebuilt_italian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1159,7 +1221,7 @@ PUT /latvian_example
}
},
"analyzer": {
"latvian": {
"rebuilt_latvian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1174,6 +1236,8 @@ PUT /latvian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"latvian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: latvian_example, first: latvian, second: rebuilt_latvian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1205,7 +1269,7 @@ PUT /lithuanian_example
}
},
"analyzer": {
"lithuanian": {
"rebuilt_lithuanian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1220,6 +1284,8 @@ PUT /lithuanian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"lithuanian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: lithuanian_example, first: lithuanian, second: rebuilt_lithuanian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1251,7 +1317,7 @@ PUT /norwegian_example
}
},
"analyzer": {
"norwegian": {
"rebuilt_norwegian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1266,6 +1332,8 @@ PUT /norwegian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"norwegian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: norwegian_example, first: norwegian, second: rebuilt_norwegian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1295,11 +1363,12 @@ PUT /persian_example
}
},
"analyzer": {
"persian": {
"rebuilt_persian": {
"tokenizer": "standard",
"char_filter": [ "zero_width_spaces" ],
"filter": [
"lowercase",
"decimal_digit",
"arabic_normalization",
"persian_normalization",
"persian_stop"
@ -1311,6 +1380,7 @@ PUT /persian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: persian_example, first: persian, second: rebuilt_persian}\nendyaml\n/]
<1> Replaces zero-width non-joiners with an ASCII space.
<2> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
@ -1341,7 +1411,7 @@ PUT /portuguese_example
}
},
"analyzer": {
"portuguese": {
"rebuilt_portuguese": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1356,6 +1426,8 @@ PUT /portuguese_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"portuguese_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: portuguese_example, first: portuguese, second: rebuilt_portuguese}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1387,7 +1459,7 @@ PUT /romanian_example
}
},
"analyzer": {
"romanian": {
"rebuilt_romanian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1402,6 +1474,8 @@ PUT /romanian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"romanian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: romanian_example, first: romanian, second: rebuilt_romanian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1434,7 +1508,7 @@ PUT /russian_example
}
},
"analyzer": {
"russian": {
"rebuilt_russian": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1449,6 +1523,8 @@ PUT /russian_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"russian_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: russian_example, first: russian, second: rebuilt_russian}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1480,11 +1556,12 @@ PUT /sorani_example
}
},
"analyzer": {
"sorani": {
"rebuilt_sorani": {
"tokenizer": "standard",
"filter": [
"sorani_normalization",
"lowercase",
"decimal_digit",
"sorani_stop",
"sorani_keywords",
"sorani_stemmer"
@ -1496,6 +1573,8 @@ PUT /sorani_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"sorani_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: sorani_example, first: sorani, second: rebuilt_sorani}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1527,7 +1606,7 @@ PUT /spanish_example
}
},
"analyzer": {
"spanish": {
"rebuilt_spanish": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1542,6 +1621,8 @@ PUT /spanish_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"spanish_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: spanish_example, first: spanish, second: rebuilt_spanish}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1573,7 +1654,7 @@ PUT /swedish_example
}
},
"analyzer": {
"swedish": {
"rebuilt_swedish": {
"tokenizer": "standard",
"filter": [
"lowercase",
@ -1588,6 +1669,8 @@ PUT /swedish_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"swedish_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: swedish_example, first: swedish, second: rebuilt_swedish}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1623,7 +1706,7 @@ PUT /turkish_example
}
},
"analyzer": {
"turkish": {
"rebuilt_turkish": {
"tokenizer": "standard",
"filter": [
"apostrophe",
@ -1639,6 +1722,8 @@ PUT /turkish_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"turkish_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: turkish_example, first: turkish, second: rebuilt_turkish}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.
<2> This filter should be removed unless there are words which should
@ -1662,10 +1747,11 @@ PUT /thai_example
}
},
"analyzer": {
"thai": {
"rebuilt_thai": {
"tokenizer": "thai",
"filter": [
"lowercase",
"decimal_digit",
"thai_stop"
]
}
@ -1675,5 +1761,7 @@ PUT /thai_example
}
----------------------------------------------------
// CONSOLE
// TEST[s/"thai_keywords",//]
// TEST[s/\n$/\nstartyaml\n - compare_analyzers: {index: thai_example, first: thai, second: rebuilt_thai}\nendyaml\n/]
<1> The default stopwords can be overridden with the `stopwords`
or `stopwords_path` parameters.

@ -20,18 +20,39 @@
package org.elasticsearch.smoketest;
import org.apache.http.HttpHost;
import org.apache.lucene.util.BytesRef;
import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.elasticsearch.Version;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.ConstructingObjectParser;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentLocation;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentParser.Token;
import org.elasticsearch.test.rest.yaml.ClientYamlDocsTestClient;
import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
import org.elasticsearch.test.rest.yaml.ClientYamlTestClient;
import org.elasticsearch.test.rest.yaml.ClientYamlTestExecutionContext;
import org.elasticsearch.test.rest.yaml.ClientYamlTestResponse;
import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestSpec;
import org.elasticsearch.test.rest.yaml.section.ExecutableSection;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.common.xcontent.ConstructingObjectParser.constructorArg;
import static java.util.Collections.emptyMap;
import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
@ -41,7 +62,12 @@ public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
@ParametersFactory
public static Iterable<Object[]> parameters() throws Exception {
return ESClientYamlSuiteTestCase.createParameters();
List<NamedXContentRegistry.Entry> entries = new ArrayList<>(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS.size() + 1);
entries.addAll(ExecutableSection.DEFAULT_EXECUTABLE_CONTEXTS);
entries.add(new NamedXContentRegistry.Entry(ExecutableSection.class,
new ParseField("compare_analyzers"), CompareAnalyzers::parse));
NamedXContentRegistry executeableSectionRegistry = new NamedXContentRegistry(entries);
return ESClientYamlSuiteTestCase.createParameters(executeableSectionRegistry);
}
@Override
@ -64,5 +90,117 @@ public class DocsClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
List<HttpHost> hosts, Version esVersion) throws IOException {
return new ClientYamlDocsTestClient(restSpec, restClient, hosts, esVersion);
}
}
/**
* Compares the results of running two analyzers against many random
* strings. The goal is to figure out if two analyzers are "the same" by
* comparing their results. This is far from perfect but should be fairly
* accurate, especially for gross things like missing {@code decimal_digit}
* token filters, and should be fairly fast because it compares a fairly
* small number of tokens.
*/
private static class CompareAnalyzers implements ExecutableSection {
private static ConstructingObjectParser<CompareAnalyzers, XContentLocation> PARSER =
new ConstructingObjectParser<>("test_analyzer", false, (a, location) -> {
String index = (String) a[0];
String first = (String) a[1];
String second = (String) a[2];
return new CompareAnalyzers(location, index, first, second);
});
static {
PARSER.declareString(constructorArg(), new ParseField("index"));
PARSER.declareString(constructorArg(), new ParseField("first"));
PARSER.declareString(constructorArg(), new ParseField("second"));
}
private static CompareAnalyzers parse(XContentParser parser) throws IOException {
XContentLocation location = parser.getTokenLocation();
CompareAnalyzers section = PARSER.parse(parser, location);
assert parser.currentToken() == Token.END_OBJECT : "End of object required";
parser.nextToken(); // throw out the END_OBJECT to conform with other ExecutableSections
return section;
}
private final XContentLocation location;
private final String index;
private final String first;
private final String second;
private CompareAnalyzers(XContentLocation location, String index, String first, String second) {
this.location = location;
this.index = index;
this.first = first;
this.second = second;
}
@Override
public XContentLocation getLocation() {
return location;
}
@Override
public void execute(ClientYamlTestExecutionContext executionContext) throws IOException {
int size = 100;
int maxLength = 15;
List<String> testText = new ArrayList<>(size);
for (int i = 0; i < size; i++) {
/**
* Build a string with a few unicode sequences separated by
* spaces. The unicode sequences aren't going to be of the same
* code page, which is a shame because it makes the entire
* string less realistic. But this still provides a fairly
* nice string to compare.
*/
int spaces = between(0, 5);
StringBuilder b = new StringBuilder((spaces + 1) * maxLength);
b.append(randomRealisticUnicodeOfCodepointLengthBetween(1, maxLength));
for (int t = 0; t < spaces; t++) {
b.append(' ');
b.append(randomRealisticUnicodeOfCodepointLengthBetween(1, maxLength));
}
testText.add(b.toString()
// Don't look up stashed values
.replace("$", "\\$"));
}
Map<String, Object> body = new HashMap<>(2);
body.put("analyzer", first);
body.put("text", testText);
ClientYamlTestResponse response = executionContext.callApi("indices.analyze", singletonMap("index", index),
singletonList(body), emptyMap());
Iterator<?> firstTokens = ((List<?>) response.evaluate("tokens")).iterator();
body.put("analyzer", second);
response = executionContext.callApi("indices.analyze", singletonMap("index", index),
singletonList(body), emptyMap());
Iterator<?> secondTokens = ((List<?>) response.evaluate("tokens")).iterator();
Object previousFirst = null;
Object previousSecond = null;
while (firstTokens.hasNext()) {
if (false == secondTokens.hasNext()) {
fail(second + " has fewer tokens than " + first + ". "
+ first + " has [" + firstTokens.next() + "] but " + second + " is out of tokens. "
+ first + "'s last token was [" + previousFirst + "] and "
+ second + "'s last token was' [" + previousSecond + "]");
}
Map<?, ?> firstToken = (Map<?, ?>) firstTokens.next();
Map<?, ?> secondToken = (Map<?, ?>) secondTokens.next();
String firstText = (String) firstToken.get("token");
String secondText = (String) secondToken.get("token");
// Check the text and produce an error message with the utf8 sequence if they don't match.
if (false == secondText.equals(firstText)) {
fail("text differs: " + first + " was [" + firstText + "] but " + second + " was [" + secondText
+ "]. In utf8 those are\n" + new BytesRef(firstText) + " and\n" + new BytesRef(secondText));
}
// Now check the whole map just in case the text matches but something else differs
assertEquals(firstToken, secondToken);
previousFirst = firstToken;
previousSecond = secondToken;
}
if (secondTokens.hasNext()) {
fail(second + " has more tokens than " + first + ". "
+ second + " has [" + secondTokens.next() + "] but " + first + " is out of tokens. "
+ first + "'s last token was [" + previousFirst + "] and "
+ second + "'s last token was' [" + previousSecond + "]");
}
}
}
}

@ -121,7 +121,7 @@ public class ClientYamlTestClient {
}
String contentType = entity.getContentType().getValue();
//randomly test the GET with source param instead of GET/POST with body
if (sendBodyAsSourceParam(supportedMethods, contentType)) {
if (sendBodyAsSourceParam(supportedMethods, contentType, entity.getContentLength())) {
logger.debug("sending the request body as source param with GET method");
queryStringParams.put("source", EntityUtils.toString(entity));
queryStringParams.put("source_content_type", contentType);
@ -177,14 +177,25 @@ public class ClientYamlTestClient {
}
}
private static boolean sendBodyAsSourceParam(List<String> supportedMethods, String contentType) {
if (supportedMethods.contains(HttpGet.METHOD_NAME)) {
if (contentType.startsWith(ContentType.APPLICATION_JSON.getMimeType()) ||
contentType.startsWith(YAML_CONTENT_TYPE.getMimeType())) {
return RandomizedTest.rarely();
}
private static boolean sendBodyAsSourceParam(List<String> supportedMethods, String contentType, long contentLength) {
if (false == supportedMethods.contains(HttpGet.METHOD_NAME)) {
// The API doesn't claim to support GET anyway
return false;
}
return false;
if (contentLength < 0) {
// Negative length means "unknown" or "huge" in this case. Either way we can't send it as a parameter
return false;
}
if (contentLength > 2000) {
// Long bodies won't fit in the parameter and will cause a too_long_frame_exception
return false;
}
if (false == contentType.startsWith(ContentType.APPLICATION_JSON.getMimeType())
&& false == contentType.startsWith(YAML_CONTENT_TYPE.getMimeType())) {
// We can only encode JSON or YAML this way.
return false;
}
return RandomizedTest.rarely();
}
private ClientYamlSuiteRestApi restApi(String apiName) {

@ -28,6 +28,7 @@ import org.elasticsearch.client.RestClient;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.test.rest.ESRestTestCase;
import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestApi;
import org.elasticsearch.test.rest.yaml.restspec.ClientYamlSuiteRestSpec;
@ -143,7 +144,19 @@ public abstract class ESClientYamlSuiteTestCase extends ESRestTestCase {
return new ClientYamlTestClient(restSpec, restClient, hosts, esVersion);
}
/**
* Create parameters for this parameterized test. Uses the
* {@link ExecutableSection#XCONTENT_REGISTRY list} of executable sections
* defined in {@link ExecutableSection}.
*/
public static Iterable<Object[]> createParameters() throws Exception {
return createParameters(ExecutableSection.XCONTENT_REGISTRY);
}
/**
* Create parameters for this parameterized test.
*/
public static Iterable<Object[]> createParameters(NamedXContentRegistry executeableSectionRegistry) throws Exception {
String[] paths = resolvePathsProperty(REST_TESTS_SUITE, ""); // default to all tests under the test root
List<Object[]> tests = new ArrayList<>();
Map<String, Set<Path>> yamlSuites = loadSuites(paths);
@ -151,7 +164,7 @@ public abstract class ESClientYamlSuiteTestCase extends ESRestTestCase {
for (String api : yamlSuites.keySet()) {
List<Path> yamlFiles = new ArrayList<>(yamlSuites.get(api));
for (Path yamlFile : yamlFiles) {
ClientYamlTestSuite restTestSuite = ClientYamlTestSuite.parse(api, yamlFile);
ClientYamlTestSuite restTestSuite = ClientYamlTestSuite.parse(executeableSectionRegistry, api, yamlFile);
for (ClientYamlTestSection testSection : restTestSuite.getTestSections()) {
tests.add(new Object[]{ new ClientYamlTestCandidate(restTestSuite, testSection) });
}

@ -21,6 +21,7 @@ package org.elasticsearch.test.rest.yaml.section;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.xcontent.DeprecationHandler;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.yaml.YamlXContent;
@ -40,7 +41,7 @@ import java.util.TreeSet;
* Supports a setup section and multiple test sections.
*/
public class ClientYamlTestSuite {
public static ClientYamlTestSuite parse(String api, Path file) throws IOException {
public static ClientYamlTestSuite parse(NamedXContentRegistry executeableSectionRegistry, String api, Path file) throws IOException {
if (!Files.isRegularFile(file)) {
throw new IllegalArgumentException(file.toAbsolutePath() + " is not a file");
}
@ -64,7 +65,7 @@ public class ClientYamlTestSuite {
}
}
try (XContentParser parser = YamlXContent.yamlXContent.createParser(ExecutableSection.XCONTENT_REGISTRY,
try (XContentParser parser = YamlXContent.yamlXContent.createParser(executeableSectionRegistry,
LoggingDeprecationHandler.INSTANCE, Files.newInputStream(file))) {
return parse(api, filename, parser);
} catch(Exception e) {

@ -26,15 +26,18 @@ import org.elasticsearch.test.rest.yaml.ClientYamlTestExecutionContext;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import static java.util.Collections.unmodifiableList;
/**
* Represents a test fragment that can be executed (e.g. api call, assertion)
*/
public interface ExecutableSection {
/**
* {@link NamedXContentRegistry} needed in the {@link XContentParser} before calling {@link ExecutableSection#parse(XContentParser)}.
* Default list of {@link ExecutableSection}s available for tests.
*/
NamedXContentRegistry XCONTENT_REGISTRY = new NamedXContentRegistry(Arrays.asList(
List<NamedXContentRegistry.Entry> DEFAULT_EXECUTABLE_CONTEXTS = unmodifiableList(Arrays.asList(
new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("do"), DoSection::parse),
new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("set"), SetSection::parse),
new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("match"), MatchAssertion::parse),
@ -46,6 +49,12 @@ public interface ExecutableSection {
new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("lte"), LessThanOrEqualToAssertion::parse),
new NamedXContentRegistry.Entry(ExecutableSection.class, new ParseField("length"), LengthAssertion::parse)));
/**
* {@link NamedXContentRegistry} that parses the default list of
* {@link ExecutableSection}s available for tests.
*/
NamedXContentRegistry XCONTENT_REGISTRY = new NamedXContentRegistry(DEFAULT_EXECUTABLE_CONTEXTS);
static ExecutableSection parse(XContentParser parser) throws IOException {
ParserUtils.advanceToFieldName(parser);
String section = parser.currentName();
@ -60,7 +69,7 @@ public interface ExecutableSection {
}
/**
* Get the location in the test that this was defined.
* Get the location in the test that this was defined.
*/
XContentLocation getLocation();