From b42f66c8acf6368f1d20905786b1d5c17df86075 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 18 Mar 2016 17:01:27 +0100 Subject: [PATCH] Document 5.0 mapping changes. --- .../bucket/nested-aggregation.asciidoc | 2 +- .../reverse-nested-aggregation.asciidoc | 6 +- .../analyzers/keyword-analyzer.asciidoc | 2 +- docs/reference/docs/termvectors.asciidoc | 4 +- docs/reference/docs/update-by-query.asciidoc | 6 +- .../index-modules/similarity.asciidoc | 2 +- docs/reference/indices/aliases.asciidoc | 3 +- docs/reference/indices/create-index.asciidoc | 2 +- .../indices/get-field-mapping.asciidoc | 18 +- docs/reference/indices/put-mapping.asciidoc | 24 ++- docs/reference/mapping.asciidoc | 15 +- .../mapping/dynamic/default-mapping.asciidoc | 7 +- .../mapping/dynamic/field-mapping.asciidoc | 4 +- .../mapping/dynamic/templates.asciidoc | 13 +- .../mapping/fields/all-field.asciidoc | 14 +- .../mapping/fields/parent-field.asciidoc | 6 +- .../mapping/params/analyzer.asciidoc | 6 +- docs/reference/mapping/params/boost.asciidoc | 6 +- .../reference/mapping/params/copy-to.asciidoc | 6 +- .../mapping/params/doc-values.asciidoc | 6 +- .../reference/mapping/params/dynamic.asciidoc | 2 +- .../reference/mapping/params/enabled.asciidoc | 3 +- .../mapping/params/fielddata.asciidoc | 133 ++----------- .../mapping/params/ignore-above.asciidoc | 10 +- .../mapping/params/include-in-all.asciidoc | 14 +- .../mapping/params/index-options.asciidoc | 2 +- docs/reference/mapping/params/index.asciidoc | 46 +---- .../mapping/params/multi-fields.asciidoc | 17 +- docs/reference/mapping/params/norms.asciidoc | 38 +--- .../mapping/params/null-value.asciidoc | 7 +- .../params/position-increment-gap.asciidoc | 2 +- .../mapping/params/properties.asciidoc | 4 +- .../mapping/params/search-analyzer.asciidoc | 2 +- .../mapping/params/similarity.asciidoc | 8 +- docs/reference/mapping/params/store.asciidoc | 4 +- .../mapping/params/term-vector.asciidoc | 2 +- docs/reference/mapping/types.asciidoc | 12 +- 
docs/reference/mapping/types/binary.asciidoc | 2 +- docs/reference/mapping/types/keyword.asciidoc | 111 +++++++++++ docs/reference/mapping/types/object.asciidoc | 7 +- docs/reference/mapping/types/string.asciidoc | 177 +----------------- docs/reference/mapping/types/text.asciidoc | 139 ++++++++++++++ .../mapping/types/token-count.asciidoc | 2 +- .../migration/migrate_5_0/mapping.asciidoc | 30 +++ .../reference/query-dsl/exists-query.asciidoc | 2 +- docs/reference/query-dsl/mlt-query.asciidoc | 10 +- docs/reference/query-dsl/term-query.asciidoc | 19 +- 47 files changed, 430 insertions(+), 527 deletions(-) create mode 100644 docs/reference/mapping/types/keyword.asciidoc create mode 100644 docs/reference/mapping/types/text.asciidoc diff --git a/docs/reference/aggregations/bucket/nested-aggregation.asciidoc b/docs/reference/aggregations/bucket/nested-aggregation.asciidoc index f5872bdc5dc..89142df13e7 100644 --- a/docs/reference/aggregations/bucket/nested-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/nested-aggregation.asciidoc @@ -16,7 +16,7 @@ price for the product. The mapping could look like: "resellers" : { <1> "type" : "nested", "properties" : { - "name" : { "type" : "string" }, + "name" : { "type" : "text" }, "price" : { "type" : "double" } } } diff --git a/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc b/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc index 03bcdb0a18c..9dba1f2adf0 100644 --- a/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/reverse-nested-aggregation.asciidoc @@ -22,12 +22,12 @@ the issue documents as nested documents. 
The mapping could look like: "issue" : { "properties" : { - "tags" : { "type" : "string" } + "tags" : { "type" : "text" }, "comments" : { <1> "type" : "nested" "properties" : { - "username" : { "type" : "string", "index" : "not_analyzed" }, - "comment" : { "type" : "string" } + "username" : { "type" : "keyword" }, + "comment" : { "type" : "text" } } } } diff --git a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc index 7704895c9da..815037596cf 100644 --- a/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc +++ b/docs/reference/analysis/analyzers/keyword-analyzer.asciidoc @@ -4,4 +4,4 @@ An analyzer of type `keyword` that "tokenizes" an entire stream as a single token. This is useful for data like zip codes, ids and so on. Note, when using mapping definitions, it might make more sense to simply -mark the field as `not_analyzed`. +map the field as a <<keyword,`keyword`>>. diff --git a/docs/reference/docs/termvectors.asciidoc b/docs/reference/docs/termvectors.asciidoc index 0e108430f85..34fcaaecda1 100644 --- a/docs/reference/docs/termvectors.asciidoc +++ b/docs/reference/docs/termvectors.asciidoc @@ -136,13 +136,13 @@ curl -s -XPUT 'http://localhost:9200/twitter/' -d '{ "tweet": { "properties": { "text": { - "type": "string", + "type": "text", "term_vector": "with_positions_offsets_payloads", "store" : true, "analyzer" : "fulltext_analyzer" }, "fullname": { - "type": "string", + "type": "text", "term_vector": "with_positions_offsets_payloads", "analyzer" : "fulltext_analyzer" } diff --git a/docs/reference/docs/update-by-query.asciidoc b/docs/reference/docs/update-by-query.asciidoc index 13b5f6fc0eb..c0336491d9a 100644 --- a/docs/reference/docs/update-by-query.asciidoc +++ b/docs/reference/docs/update-by-query.asciidoc @@ -281,7 +281,7 @@ PUT test "test": { "dynamic": false, <1> "properties": { - "text": {"type": "string"} + "text": {"type": "text"} } } } @@ -300,8 +300,8 @@ POST test/test?refresh PUT
test/_mapping/test <2> { "properties": { - "text": {"type": "string"}, - "flag": {"type": "string", "analyzer": "keyword"} + "text": {"type": "text"}, + "flag": {"type": "text", "analyzer": "keyword"} } } -------------------------------------------------- diff --git a/docs/reference/index-modules/similarity.asciidoc b/docs/reference/index-modules/similarity.asciidoc index 3d993a3a6eb..07591dc277b 100644 --- a/docs/reference/index-modules/similarity.asciidoc +++ b/docs/reference/index-modules/similarity.asciidoc @@ -39,7 +39,7 @@ Here we configure the DFRSimilarity so it can be referenced as { "book" : { "properties" : { - "title" : { "type" : "string", "similarity" : "my_similarity" } + "title" : { "type" : "text", "similarity" : "my_similarity" } } } -------------------------------------------------- diff --git a/docs/reference/indices/aliases.asciidoc b/docs/reference/indices/aliases.asciidoc index cb8f652070b..ab4e49e7aab 100644 --- a/docs/reference/indices/aliases.asciidoc +++ b/docs/reference/indices/aliases.asciidoc @@ -116,8 +116,7 @@ curl -XPUT 'http://localhost:9200/test1' -d '{ "type1": { "properties": { "user" : { - "type": "string", - "index": "not_analyzed" + "type": "keyword" } } } diff --git a/docs/reference/indices/create-index.asciidoc b/docs/reference/indices/create-index.asciidoc index 2210870135b..11216fa4c01 100644 --- a/docs/reference/indices/create-index.asciidoc +++ b/docs/reference/indices/create-index.asciidoc @@ -78,7 +78,7 @@ curl -XPOST localhost:9200/test -d '{ "mappings" : { "type1" : { "properties" : { - "field1" : { "type" : "string", "index" : "not_analyzed" } + "field1" : { "type" : "keyword" } } } } diff --git a/docs/reference/indices/get-field-mapping.asciidoc b/docs/reference/indices/get-field-mapping.asciidoc index 2aeb853e9f4..39667dc0874 100644 --- a/docs/reference/indices/get-field-mapping.asciidoc +++ b/docs/reference/indices/get-field-mapping.asciidoc @@ -22,7 +22,7 @@ For which the response is (assuming `text` is a default
string field): "text": { "full_name": "text", "mapping": { - "text": { "type": "string" } + "text": { "type": "text" } } } } @@ -73,13 +73,13 @@ For example, consider the following mapping: { "article": { "properties": { - "id": { "type": "string" }, - "title": { "type": "string"}, - "abstract": { "type": "string"}, + "id": { "type": "text" }, + "title": { "type": "text"}, + "abstract": { "type": "text"}, "author": { "properties": { - "id": { "type": "string" }, - "name": { "type": "string" } + "id": { "type": "text" }, + "name": { "type": "text" } } } } @@ -105,19 +105,19 @@ returns: "abstract": { "full_name": "abstract", "mapping": { - "abstract": { "type": "string" } + "abstract": { "type": "text" } } }, "author.id": { "full_name": "author.id", "mapping": { - "id": { "type": "string" } + "id": { "type": "text" } } }, "name": { "full_name": "author.name", "mapping": { - "name": { "type": "string" } + "name": { "type": "text" } } } } diff --git a/docs/reference/indices/put-mapping.asciidoc b/docs/reference/indices/put-mapping.asciidoc index 7dd2389e824..cc94a08f626 100644 --- a/docs/reference/indices/put-mapping.asciidoc +++ b/docs/reference/indices/put-mapping.asciidoc @@ -12,7 +12,7 @@ PUT twitter <1> "tweet": { "properties": { "message": { - "type": "string" + "type": "text" } } } @@ -23,7 +23,7 @@ PUT twitter/_mapping/user <2> { "properties": { "name": { - "type": "string" + "type": "text" } } } @@ -32,7 +32,7 @@ PUT twitter/_mapping/tweet <3> { "properties": { "user_name": { - "type": "string" + "type": "text" } } } @@ -86,13 +86,12 @@ PUT my_index <1> "name": { "properties": { "first": { - "type": "string" + "type": "text" } } }, "user_id": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" } } } @@ -105,13 +104,12 @@ PUT my_index/_mapping/user "name": { "properties": { "last": { <2> - "type": "string" + "type": "text" } } }, "user_id": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "ignore_above": 100 <3> } } @@ 
-149,7 +147,7 @@ PUT my_index "type_one": { "properties": { "text": { <1> - "type": "string", + "type": "text", "analyzer": "standard" } } @@ -157,7 +155,7 @@ PUT my_index "type_two": { "properties": { "text": { <1> - "type": "string", + "type": "text", "analyzer": "standard" } } @@ -169,7 +167,7 @@ PUT my_index/_mapping/type_one <2> { "properties": { "text": { - "type": "string", + "type": "text", "analyzer": "standard", "search_analyzer": "whitespace" } @@ -180,7 +178,7 @@ PUT my_index/_mapping/type_one?update_all_types <3> { "properties": { "text": { - "type": "string", + "type": "text", "analyzer": "standard", "search_analyzer": "whitespace" } diff --git a/docs/reference/mapping.asciidoc b/docs/reference/mapping.asciidoc index 407f43625fa..8ead0436978 100644 --- a/docs/reference/mapping.asciidoc +++ b/docs/reference/mapping.asciidoc @@ -46,7 +46,7 @@ Fields with the same name in different mapping types in the same index Each field has a data `type` which can be: -* a simple type like <>, <>, <>, +* a simple type like <>, <>, <>, <>, <>, <> or <>. * a type which supports the hierarchical nature of JSON such as <> or <>. @@ -55,7 +55,7 @@ Each field has a data `type` which can be: It is often useful to index the same field in different ways for different purposes. For instance, a `string` field could be <> as -an `analyzed` field for full-text search, and as a `not_analyzed` field for +a `text` field for full-text search, and as a `keyword` field for sorting or aggregations. 
Alternatively, you could index a string field with the <>, the <> analyzer, and the @@ -134,18 +134,17 @@ PUT my_index <1> "user": { <2> "_all": { "enabled": false }, <3> "properties": { <4> - "title": { "type": "string" }, <5> - "name": { "type": "string" }, <5> + "title": { "type": "text" }, <5> + "name": { "type": "text" }, <5> "age": { "type": "integer" } <5> } }, "blogpost": { <2> "properties": { <4> - "title": { "type": "string" }, <5> - "body": { "type": "string" }, <5> + "title": { "type": "text" }, <5> + "body": { "type": "text" }, <5> "user_id": { - "type": "string", <5> - "index": "not_analyzed" + "type": "keyword" <5> }, "created": { "type": "date", <5> diff --git a/docs/reference/mapping/dynamic/default-mapping.asciidoc b/docs/reference/mapping/dynamic/default-mapping.asciidoc index c1e1f8dec66..bef90301f0c 100644 --- a/docs/reference/mapping/dynamic/default-mapping.asciidoc +++ b/docs/reference/mapping/dynamic/default-mapping.asciidoc @@ -56,11 +56,10 @@ PUT _template/logging "strings": { <4> "match_mapping_type": "string", "mapping": { - "type": "string", + "type": "text", "fields": { "raw": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "ignore_above": 256 } } @@ -79,4 +78,4 @@ PUT logs-2015.10.01/event/1 <1> The `logging` template will match any indices beginning with `logs-`. <2> Matching indices will be created with a single primary shard. <3> The `_all` field will be disabled by default for new type mappings. -<4> String fields will be created with an `analyzed` main field, and a `not_analyzed` `.raw` field. +<4> String fields will be created with a `text` main field, and a `keyword` `.raw` field. 
diff --git a/docs/reference/mapping/dynamic/field-mapping.asciidoc b/docs/reference/mapping/dynamic/field-mapping.asciidoc index 585931d5e3f..f8612958f9c 100644 --- a/docs/reference/mapping/dynamic/field-mapping.asciidoc +++ b/docs/reference/mapping/dynamic/field-mapping.asciidoc @@ -22,7 +22,7 @@ string:: Either a <> field (if the value passes <>), a <> or <> field (if the value passes <>) - or an <> <> field. + or an <> field. These are the only <> that are dynamically detected. All other datatypes must be mapped explicitly. @@ -81,7 +81,7 @@ PUT my_index/my_type/1 <1> -------------------------------------------------- // AUTOSENSE -<1> The `create_date` field has been added as a <> field. +<1> The `create_date` field has been added as a <> field. ===== Customising detected date formats diff --git a/docs/reference/mapping/dynamic/templates.asciidoc b/docs/reference/mapping/dynamic/templates.asciidoc index b903f1af066..1137f454ffd 100644 --- a/docs/reference/mapping/dynamic/templates.asciidoc +++ b/docs/reference/mapping/dynamic/templates.asciidoc @@ -52,7 +52,7 @@ can be automatically detected: `boolean`, `date`, `double`, `long`, `object`, `string`. It also accepts `*` to match all datatypes. For example, if we wanted to map all integer fields as `integer` instead of -`long`, and all `string` fields as both `analyzed` and `not_analyzed`, we +`long`, and all `string` fields as both `text` and `keyword`, we could use the following template: [source,js] @@ -74,11 +74,10 @@ PUT my_index "strings": { "match_mapping_type": "string", "mapping": { - "type": "string", + "type": "text", "fields": { "raw": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "ignore_above": 256 } } @@ -99,7 +98,7 @@ PUT my_index/my_type/1 -------------------------------------------------- // AUTOSENSE <1> The `my_integer` field is mapped as an `integer`. -<2> The `my_string` field is mapped as an analyzed `string`, with a `not_analyzed` <>. 
+<2> The `my_string` field is mapped as a `text` field, with a `keyword` <<multi-fields,multi-field>>. [[match-unmatch]] @@ -180,7 +179,7 @@ PUT my_index "path_match": "name.*", "path_unmatch": "*.middle", "mapping": { - "type": "string", + "type": "text", "copy_to": "full_name" } } @@ -221,7 +220,7 @@ PUT my_index "match_mapping_type": "string", "match": "*", "mapping": { - "type": "string", + "type": "text", "analyzer": "{name}" } } diff --git a/docs/reference/mapping/fields/all-field.asciidoc b/docs/reference/mapping/fields/all-field.asciidoc index ae52fc1d0d1..6c5f073aee8 100644 --- a/docs/reference/mapping/fields/all-field.asciidoc +++ b/docs/reference/mapping/fields/all-field.asciidoc @@ -45,7 +45,7 @@ from each field as a string. It does not combine the _terms_ from each field. ============================================================================= -The `_all` field is just a <> field, and accepts the same +The `_all` field is just a <<text,`text`>> field, and accepts the same parameters that other string fields accept, including `analyzer`, `term_vectors`, `index_options`, and `store`. @@ -136,7 +136,7 @@ PUT my_index }, "properties": { "content": { - "type": "string" + "type": "text" } } } @@ -172,11 +172,11 @@ PUT myindex "mytype": { "properties": { "title": { <1> - "type": "string", + "type": "text", "boost": 2 }, "content": { <1> - "type": "string" + "type": "text" } } } @@ -210,15 +210,15 @@ PUT myindex "mytype": { "properties": { "first_name": { - "type": "string", + "type": "text", "copy_to": "full_name" <1> }, "last_name": { - "type": "string", + "type": "text", "copy_to": "full_name" <1> }, "full_name": { - "type": "string" + "type": "text" } } } diff --git a/docs/reference/mapping/fields/parent-field.asciidoc b/docs/reference/mapping/fields/parent-field.asciidoc index 64f4a9934a6..fb066580044 100644 --- a/docs/reference/mapping/fields/parent-field.asciidoc +++ b/docs/reference/mapping/fields/parent-field.asciidoc @@ -127,7 +127,7 @@ global ordinals for the `_parent` field.
Global ordinals, by default, are built lazily: the first parent-child query or aggregation after a refresh will trigger building of global ordinals. This can introduce a significant latency spike for your users. You can use -<> to shift the cost of building global +<> to shift the cost of building global ordinals from query time to refresh time, by mapping the `_parent` field as follows: [source,js] @@ -139,9 +139,7 @@ PUT my_index "my_child": { "_parent": { "type": "my_parent", - "fielddata": { - "loading": "eager_global_ordinals" - } + "eager_global_ordinals": true } } } diff --git a/docs/reference/mapping/params/analyzer.asciidoc b/docs/reference/mapping/params/analyzer.asciidoc index 68009e600db..2a452465e2d 100644 --- a/docs/reference/mapping/params/analyzer.asciidoc +++ b/docs/reference/mapping/params/analyzer.asciidoc @@ -47,10 +47,10 @@ PUT my_index "my_type": { "properties": { "text": { <1> - "type": "string", + "type": "text", "fields": { "english": { <2> - "type": "string", + "type": "text", "analyzer": "english" } } @@ -124,7 +124,7 @@ PUT /my_index "my_type":{ "properties":{ "title": { - "type":"string", + "type":"text", "analyzer":"my_analyzer", <3> "search_analyzer":"my_stop_analyzer", <4> "search_quote_analyzer":"my_analyzer" <5> diff --git a/docs/reference/mapping/params/boost.asciidoc b/docs/reference/mapping/params/boost.asciidoc index 22c0e2e69ea..add6f806844 100644 --- a/docs/reference/mapping/params/boost.asciidoc +++ b/docs/reference/mapping/params/boost.asciidoc @@ -12,11 +12,11 @@ PUT my_index "my_type": { "properties": { "title": { - "type": "string", + "type": "text", "boost": 2 <1> }, "content": { - "type": "string" + "type": "text" } } } @@ -83,4 +83,4 @@ We advise against using index time boosting for the following reasons: byte. This reduces the resolution of the field length normalization factor which can lead to lower quality relevance calculations. 
-================================================== \ No newline at end of file +================================================== diff --git a/docs/reference/mapping/params/copy-to.asciidoc b/docs/reference/mapping/params/copy-to.asciidoc index b437a87424a..863bf1996cd 100644 --- a/docs/reference/mapping/params/copy-to.asciidoc +++ b/docs/reference/mapping/params/copy-to.asciidoc @@ -15,15 +15,15 @@ PUT /my_index "my_type": { "properties": { "first_name": { - "type": "string", + "type": "text", "copy_to": "full_name" <1> }, "last_name": { - "type": "string", + "type": "text", "copy_to": "full_name" <1> }, "full_name": { - "type": "string" + "type": "text" } } } diff --git a/docs/reference/mapping/params/doc-values.asciidoc b/docs/reference/mapping/params/doc-values.asciidoc index 81f9b6e3c64..4ded2212de1 100644 --- a/docs/reference/mapping/params/doc-values.asciidoc +++ b/docs/reference/mapping/params/doc-values.asciidoc @@ -29,12 +29,10 @@ PUT my_index "my_type": { "properties": { "status_code": { <1> - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "session_id": { <2> - "type": "string", - "index": "not_analyzed", + "type": "keyword", "doc_values": false } } diff --git a/docs/reference/mapping/params/dynamic.asciidoc b/docs/reference/mapping/params/dynamic.asciidoc index db73709f4f3..72bbd369d7f 100644 --- a/docs/reference/mapping/params/dynamic.asciidoc +++ b/docs/reference/mapping/params/dynamic.asciidoc @@ -67,7 +67,7 @@ PUT my_index "user": { <2> "properties": { "name": { - "type": "string" + "type": "text" }, "social_networks": { <3> "dynamic": true, diff --git a/docs/reference/mapping/params/enabled.asciidoc b/docs/reference/mapping/params/enabled.asciidoc index 6f72f4da890..7bffcfddf2e 100644 --- a/docs/reference/mapping/params/enabled.asciidoc +++ b/docs/reference/mapping/params/enabled.asciidoc @@ -21,8 +21,7 @@ PUT my_index "session": { "properties": { "user_id": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" 
}, "last_updated": { "type": "date" diff --git a/docs/reference/mapping/params/fielddata.asciidoc b/docs/reference/mapping/params/fielddata.asciidoc index 4d96fb61132..e67b47a831f 100644 --- a/docs/reference/mapping/params/fielddata.asciidoc +++ b/docs/reference/mapping/params/fielddata.asciidoc @@ -12,28 +12,28 @@ documents, we need to be able to look up the document and find the terms that it has in a field. Most fields can use index-time, on-disk <> to support -this type of data access pattern, but `analyzed` string fields do not support -`doc_values`. +this type of data access pattern, but `text` fields do not support `doc_values`. -Instead, `analyzed` strings use a query-time data structure called +Instead, `text` strings use a query-time data structure called `fielddata`. This data structure is built on demand the first time that a field is used for aggregations, sorting, or is accessed in a script. It is built by reading the entire inverted index for each segment from disk, inverting the term ↔︎ document relationship, and storing the result in memory, in the JVM heap. -Loading fielddata is an expensive process so, once it has been loaded, it -remains in memory for the lifetime of the segment. +Loading fielddata is an expensive process so it is disabled by default. Also, +when enabled, once it has been loaded, it remains in memory for the lifetime of +the segment. [WARNING] .Fielddata can fill up your heap space ============================================================================== Fielddata can consume a lot of heap space, especially when loading high -cardinality `analyzed` string fields. Most of the time, it doesn't make sense -to sort or aggregate on `analyzed` string fields (with the notable exception +cardinality `text` fields. Most of the time, it doesn't make sense +to sort or aggregate on `text` fields (with the notable exception of the <> -aggregation). Always think about whether a `not_analyzed` field (which can +aggregation). 
Always think about whether a <<keyword,`keyword`>> field (which can use `doc_values`) would be a better fit for your use case. ============================================================================== @@ -42,71 +42,6 @@ same name in the same index. Its value can be updated on existing fields using the <>. -[[fielddata-format]] -==== `fielddata.format` - -For `analyzed` string fields, the fielddata `format` controls whether -fielddata should be enabled or not. It accepts: `disabled` and `paged_bytes` -(enabled, which is the default). To disable fielddata loading, you can use -the following mapping: - -[source,js] --------------------------------------------------- -PUT my_index -{ - "mappings": { - "my_type": { - "properties": { - "text": { - "type": "string", - "fielddata": { - "format": "disabled" <1> - } - } - } - } - } -} --------------------------------------------------- -// AUTOSENSE -<1> The `text` field cannot be used for sorting, aggregations, or in scripts. - -.Fielddata and other datatypes -[NOTE] -================================================== - -Historically, other field datatypes also used fielddata, but this has been replaced -by index-time, disk-based <>. - -================================================== - - -[[fielddata-loading]] -==== `fielddata.loading` - -This per-field setting controls when fielddata is loaded into memory. It -accepts three options: - -[horizontal] -`lazy`:: - - Fielddata is only loaded into memory when it is needed. (default) - -`eager`:: - - Fielddata is loaded into memory before a new search segment becomes - visible to search. This can reduce the latency that a user may experience - if their search request has to trigger lazy loading from a big segment. - -`eager_global_ordinals`:: - - Loading fielddata into memory is only part of the work that is required. - After loading the fielddata for each segment, Elasticsearch builds the - <> data structure to make a list of all unique terms - across all the segments in a shard.
By default, global ordinals are built - lazily. If the field has a very high cardinality, global ordinals may - take some time to build, in which case you can use eager loading instead. - [[global-ordinals]] .Global ordinals ***************************************** @@ -141,15 +76,10 @@ can move the loading time from the first search request, to the refresh itself. ***************************************** [[field-data-filtering]] -==== `fielddata.filter` +==== `fielddata_frequency_filter` Fielddata filtering can be used to reduce the number of terms loaded into -memory, and thus reduce memory usage. Terms can be filtered by _frequency_ or -by _regular expression_, or a combination of the two: - -Filtering by frequency:: -+ --- +memory, and thus reduce memory usage. Terms can be filtered by _frequency_: The frequency filter allows you to only load terms whose term frequency falls between a `min` and `max` value, which can be expressed an absolute @@ -169,7 +99,7 @@ PUT my_index "my_type": { "properties": { "tag": { - "type": "string", + "type": "text", "fielddata": { "filter": { "frequency": { @@ -186,44 +116,3 @@ PUT my_index } -------------------------------------------------- // AUTOSENSE --- - -Filtering by regex:: -+ --- -Terms can also be filtered by regular expression - only values which -match the regular expression are loaded. Note: the regular expression is -applied to each term in the field, not to the whole field value. 
For -instance, to only load hashtags from a tweet, we can use a regular -expression which matches terms beginning with `#`: - -[source,js] --------------------------------------------------- -PUT my_index -{ - "mappings": { - "my_type": { - "properties": { - "tweet": { - "type": "string", - "analyzer": "whitespace", - "fielddata": { - "filter": { - "regex": { - "pattern": "^#.*" - } - } - } - } - } - } - } -} --------------------------------------------------- -// AUTOSENSE --- - -These filters can be updated on an existing field mapping and will take -effect the next time the fielddata for a segment is loaded. Use the -<> API -to reload the fielddata using the new filters. diff --git a/docs/reference/mapping/params/ignore-above.asciidoc b/docs/reference/mapping/params/ignore-above.asciidoc index 3a8e527860a..a9fa5377144 100644 --- a/docs/reference/mapping/params/ignore-above.asciidoc +++ b/docs/reference/mapping/params/ignore-above.asciidoc @@ -1,12 +1,7 @@ [[ignore-above]] === `ignore_above` -Strings longer than the `ignore_above` setting will not be processed by the -<> and will not be indexed. This is mainly useful for -<> string fields, which are typically used for -filtering, aggregations, and sorting. These are structured fields and it -doesn't usually make sense to allow very long terms to be indexed in these -fields. +Strings longer than the `ignore_above` setting will not be indexed or stored. 
[source,js] -------------------------------------------------- @@ -16,8 +11,7 @@ PUT my_index "my_type": { "properties": { "message": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "ignore_above": 20 <1> } } diff --git a/docs/reference/mapping/params/include-in-all.asciidoc b/docs/reference/mapping/params/include-in-all.asciidoc index 5061c16d374..76a0d14527c 100644 --- a/docs/reference/mapping/params/include-in-all.asciidoc +++ b/docs/reference/mapping/params/include-in-all.asciidoc @@ -14,10 +14,10 @@ PUT my_index "my_type": { "properties": { "title": { <1> - "type": "string" + "type": "text" } "content": { <1> - "type": "string" + "type": "text" }, "date": { <2> "type": "date", @@ -50,18 +50,18 @@ PUT my_index "my_type": { "include_in_all": false, <1> "properties": { - "title": { "type": "string" }, + "title": { "type": "text" }, "author": { "include_in_all": true, <2> "properties": { - "first_name": { "type": "string" }, - "last_name": { "type": "string" } + "first_name": { "type": "text" }, + "last_name": { "type": "text" } } }, "editor": { "properties": { - "first_name": { "type": "string" }, <3> - "last_name": { "type": "string", "include_in_all": true } <3> + "first_name": { "type": "text" }, <3> + "last_name": { "type": "text", "include_in_all": true } <3> } } } diff --git a/docs/reference/mapping/params/index-options.asciidoc b/docs/reference/mapping/params/index-options.asciidoc index f4608714258..9f327adb969 100644 --- a/docs/reference/mapping/params/index-options.asciidoc +++ b/docs/reference/mapping/params/index-options.asciidoc @@ -39,7 +39,7 @@ PUT my_index "my_type": { "properties": { "text": { - "type": "string", + "type": "text", "index_options": "offsets" } } diff --git a/docs/reference/mapping/params/index.asciidoc b/docs/reference/mapping/params/index.asciidoc index 6dd9151814d..e097293d142 100644 --- a/docs/reference/mapping/params/index.asciidoc +++ b/docs/reference/mapping/params/index.asciidoc @@ -1,48 +1,6 @@ 
[[mapping-index]] === `index` -The `index` option controls how field values are indexed and, thus, how they -are searchable. It accepts three values: +The `index` option controls whether field values are indexed. It accepts `true` +or `false`. Fields that are not indexed are not queryable. -[horizontal] -`no`:: - - Do not add this field value to the index. With this setting, the field - will not be queryable. - -`not_analyzed`:: - - Add the field value to the index unchanged, as a single term. This is the - default for all fields that support this option except for - <> fields. `not_analyzed` fields are usually used with - <> for structured search. - -`analyzed`:: - - This option applies only to `string` fields, for which it is the default. - The string field value is first <> to convert the - string into terms (e.g. a list of individual words), which are then - indexed. At search time, the query string is passed through - (<>) the same analyzer to generate terms - in the same format as those in the index. It is this process that enables - <>. - -For example, you can create a `not_analyzed` string field with the following: - -[source,js] --------------------------------------------------- -PUT /my_index -{ - "mappings": { - "my_type": { - "properties": { - "status_code": { - "type": "string", - "index": "not_analyzed" - } - } - } - } -} --------------------------------------------------- -// AUTOSENSE \ No newline at end of file diff --git a/docs/reference/mapping/params/multi-fields.asciidoc b/docs/reference/mapping/params/multi-fields.asciidoc index 994d2fddbc1..8ca2809c7f2 100644 --- a/docs/reference/mapping/params/multi-fields.asciidoc +++ b/docs/reference/mapping/params/multi-fields.asciidoc @@ -3,8 +3,8 @@ It is often useful to index the same field in different ways for different purposes. This is the purpose of _multi-fields_. 
For instance, a `string` -field could be <> as an `analyzed` field for full-text -search, and as a `not_analyzed` field for sorting or aggregations: +field could be mapped as a `text` field for full-text +search, and as a `keyword` field for sorting or aggregations: [source,js] -------------------------------------------------- @@ -14,11 +14,10 @@ PUT /my_index "my_type": { "properties": { "city": { - "type": "string", + "type": "text", "fields": { "raw": { <1> - "type": "string", - "index": "not_analyzed" + "type": "keyword" } } } @@ -57,8 +56,8 @@ GET /my_index/_search } -------------------------------------------------- // AUTOSENSE -<1> The `city.raw` field is a `not_analyzed` version of the `city` field. -<2> The analyzed `city` field can be used for full text search. +<1> The `city.raw` field is a `keyword` version of the `city` field. +<2> The `city` field can be used for full text search. <3> The `city.raw` field can be used for sorting and aggregations NOTE: Multi-fields do not change the original `_source` field. @@ -83,10 +82,10 @@ PUT my_index "my_type": { "properties": { "text": { <1> - "type": "string", + "type": "text", "fields": { "english": { <2> - "type": "string", + "type": "text", "analyzer": "english" } } diff --git a/docs/reference/mapping/params/norms.asciidoc b/docs/reference/mapping/params/norms.asciidoc index f83f93caf56..f6e42219a1f 100644 --- a/docs/reference/mapping/params/norms.asciidoc +++ b/docs/reference/mapping/params/norms.asciidoc @@ -4,14 +4,14 @@ Norms store various normalization factors that are later used at query time in order to compute the score of a document relatively to a query. -Although useful for scoring, norms also require quite a lot of memory +Although useful for scoring, norms also require quite a lot of disk (typically in the order of one byte per document per field in your index, even for documents that don't have this specific field). 
As a consequence, if you don't need scoring on a specific field, you should disable norms on that field. In particular, this is the case for fields that are used solely for filtering or aggregations. -TIP: The `norms.enabled` setting must have the same setting for fields of the +TIP: The `norms` setting must have the same setting for fields of the same name in the same index. Norms can be disabled on existing fields using the <>. @@ -24,10 +24,8 @@ PUT my_index/_mapping/my_type { "properties": { "title": { - "type": "string", - "norms": { - "enabled": false - } + "type": "text", + "norms": false } } } @@ -41,31 +39,3 @@ results since some documents won't have norms anymore while other documents might still have norms. -==== Lazy loading of norms - -Norms can be loaded into memory eagerly (`eager`), whenever a new segment -comes online, or they can loaded lazily (`lazy`, default), only when the field -is queried. - -Eager loading can be configured as follows: - -[source,js] ------------- -PUT my_index/_mapping/my_type -{ - "properties": { - "title": { - "type": "string", - "norms": { - "loading": "eager" - } - } - } -} ------------- -// AUTOSENSE - -TIP: The `norms.loading` setting must have the same setting for fields of the -same name in the same index. Its value can be updated on existing fields -using the <>. - diff --git a/docs/reference/mapping/params/null-value.asciidoc b/docs/reference/mapping/params/null-value.asciidoc index 4d70d4a6ac5..b77a2e4da69 100644 --- a/docs/reference/mapping/params/null-value.asciidoc +++ b/docs/reference/mapping/params/null-value.asciidoc @@ -16,8 +16,7 @@ PUT my_index "my_type": { "properties": { "status_code": { - "type": "string", - "index": "not_analyzed", + "type": "keyword", "null_value": "NULL" <1> } } @@ -50,6 +49,4 @@ GET my_index/_search <3> A query for `NULL` returns document 1, but not document 2. IMPORTANT: The `null_value` needs to be the same datatype as the field. 
For -instance, a `long` field cannot have a string `null_value`. String fields -which are `analyzed` will also pass the `null_value` through the configured -analyzer. +instance, a `long` field cannot have a string `null_value`. diff --git a/docs/reference/mapping/params/position-increment-gap.asciidoc b/docs/reference/mapping/params/position-increment-gap.asciidoc index 962e2178469..d2cf1360080 100644 --- a/docs/reference/mapping/params/position-increment-gap.asciidoc +++ b/docs/reference/mapping/params/position-increment-gap.asciidoc @@ -57,7 +57,7 @@ PUT my_index "groups": { "properties": { "names": { - "type": "string", + "type": "text", "position_increment_gap": 0 <1> } } diff --git a/docs/reference/mapping/params/properties.asciidoc b/docs/reference/mapping/params/properties.asciidoc index 7683272ea19..a4f5277649a 100644 --- a/docs/reference/mapping/params/properties.asciidoc +++ b/docs/reference/mapping/params/properties.asciidoc @@ -23,14 +23,14 @@ PUT my_index "manager": { <2> "properties": { "age": { "type": "integer" }, - "name": { "type": "string" } + "name": { "type": "text" } } }, "employees": { <3> "type": "nested", "properties": { "age": { "type": "integer" }, - "name": { "type": "string" } + "name": { "type": "text" } } } } diff --git a/docs/reference/mapping/params/search-analyzer.asciidoc b/docs/reference/mapping/params/search-analyzer.asciidoc index b1aa7b6c688..5a732ee3aac 100644 --- a/docs/reference/mapping/params/search-analyzer.asciidoc +++ b/docs/reference/mapping/params/search-analyzer.asciidoc @@ -41,7 +41,7 @@ PUT /my_index "my_type": { "properties": { "text": { - "type": "string", + "type": "text", "analyzer": "autocomplete", <2> "search_analyzer": "standard" <2> } diff --git a/docs/reference/mapping/params/similarity.asciidoc b/docs/reference/mapping/params/similarity.asciidoc index 0cdbd80a93e..731c557e730 100644 --- a/docs/reference/mapping/params/similarity.asciidoc +++ b/docs/reference/mapping/params/similarity.asciidoc @@ -5,8 +5,8 
@@ Elasticsearch allows you to configure a scoring algorithm or _similarity_ per field. The `similarity` setting provides a simple way of choosing a similarity algorithm other than the default TF/IDF, such as `BM25`. -Similarities are mostly useful for <> fields, especially -`analyzed` string fields, but can also apply to other field types. +Similarities are mostly useful for <> fields, but can also apply +to other field types. Custom similarities can be configured by tuning the parameters of the built-in similarities. For more details about this expert options, see the @@ -37,10 +37,10 @@ PUT my_index "my_type": { "properties": { "default_field": { <1> - "type": "string" + "type": "text" }, "bm25_field": { - "type": "string", + "type": "text", "similarity": "BM25" <2> } } diff --git a/docs/reference/mapping/params/store.asciidoc b/docs/reference/mapping/params/store.asciidoc index 46d57e9d8b5..9f10b25724c 100644 --- a/docs/reference/mapping/params/store.asciidoc +++ b/docs/reference/mapping/params/store.asciidoc @@ -24,7 +24,7 @@ PUT /my_index "my_type": { "properties": { "title": { - "type": "string", + "type": "text", "store": true <1> }, "date": { @@ -32,7 +32,7 @@ PUT /my_index "store": true <1> }, "content": { - "type": "string" + "type": "text" } } } diff --git a/docs/reference/mapping/params/term-vector.asciidoc b/docs/reference/mapping/params/term-vector.asciidoc index 74c4c416d95..136ab084149 100644 --- a/docs/reference/mapping/params/term-vector.asciidoc +++ b/docs/reference/mapping/params/term-vector.asciidoc @@ -35,7 +35,7 @@ PUT my_index "my_type": { "properties": { "text": { - "type": "string", + "type": "text", "term_vector": "with_positions_offsets" } } diff --git a/docs/reference/mapping/types.asciidoc b/docs/reference/mapping/types.asciidoc index 2ac579f273a..30d6bd56b1f 100644 --- a/docs/reference/mapping/types.asciidoc +++ b/docs/reference/mapping/types.asciidoc @@ -7,7 +7,7 @@ document: [float] === Core datatypes -<>:: `string` +string:: <> 
and <> <>:: `long`, `integer`, `short`, `byte`, `double`, `float` <>:: `date` <>:: `boolean` @@ -45,9 +45,9 @@ Attachment datatype:: === Multi-fields It is often useful to index the same field in different ways for different -purposes. For instance, a `string` field could be <> as -an `analyzed` field for full-text search, and as a `not_analyzed` field for -sorting or aggregations. Alternatively, you could index a string field with +purposes. For instance, a `string` field could be mapped as +a `text` field for full-text search, and as a `keyword` field for +sorting or aggregations. Alternatively, you could index a text field with the <>, the <> analyzer, and the <>. @@ -69,6 +69,8 @@ include::types/geo-shape.asciidoc[] include::types/ip.asciidoc[] +include::types/keyword.asciidoc[] + include::types/nested.asciidoc[] include::types/numeric.asciidoc[] @@ -77,6 +79,8 @@ include::types/object.asciidoc[] include::types/string.asciidoc[] +include::types/text.asciidoc[] + include::types/token-count.asciidoc[] diff --git a/docs/reference/mapping/types/binary.asciidoc b/docs/reference/mapping/types/binary.asciidoc index 4e5f6b4bc27..7f82523416f 100644 --- a/docs/reference/mapping/types/binary.asciidoc +++ b/docs/reference/mapping/types/binary.asciidoc @@ -13,7 +13,7 @@ PUT my_index "my_type": { "properties": { "name": { - "type": "string" + "type": "text" }, "blob": { "type": "binary" diff --git a/docs/reference/mapping/types/keyword.asciidoc b/docs/reference/mapping/types/keyword.asciidoc new file mode 100644 index 00000000000..66c7135c37d --- /dev/null +++ b/docs/reference/mapping/types/keyword.asciidoc @@ -0,0 +1,111 @@ +[[keyword]] +=== Keyword datatype + +A field to index structured content such as email addresses, hostnames, status +codes, zip codes or tags. + +They are typically used for filtering (_Find me all blog posts where +++status++ is ++published++_), for sorting, and for aggregations. Keyword +fields are only searchable by their exact value.
+ +If you need to index full text content such as email bodies or product +descriptions, it is likely that you should rather use a <> field. + +Below is an example of a mapping for a keyword field: + +[source,js] +-------------------------------- +PUT my_index +{ + "mappings": { + "my_type": { + "properties": { + "tags": { + "type": "keyword" + } + } + } + } +} +-------------------------------- +// AUTOSENSE + +[[keyword-params]] +==== Parameters for keyword fields + +The following parameters are accepted by `keyword` fields: + +[horizontal] + +<>:: + + Mapping field-level query time boosting. Accepts a floating point number, defaults + to `1.0`. + +<>:: + + Should the field be stored on disk in a column-stride fashion, so that it + can later be used for sorting, aggregations, or scripting? Accepts `true` + (default) or `false`. + +<>:: + + Should global ordinals be loaded eagerly on refresh? Accepts `true` or `false` + (default). Enabling this is a good idea on fields that are frequently used for + terms aggregations. + +<>:: + + Multi-fields allow the same string value to be indexed in multiple ways for + different purposes, such as one field for search and a multi-field for + sorting and aggregations. + +<>:: + + Do not index or analyze any string longer than this value. Defaults to + `2147483647` so that all values would be accepted. + +<>:: + + Whether or not the field value should be included in the + <> field? Accepts `true` or `false`. Defaults + to `false` if <> is set to `false`, or if a parent + <> field sets `include_in_all` to `false`. + Otherwise defaults to `true`. + +<>:: + + Should the field be searchable? Accepts `true` (default) or `false`. + +<>:: + + What information should be stored in the index, for scoring purposes. + Defaults to `docs` but can also be set to `freqs` to take term frequency into account + when computing scores. + +<>:: + + Whether field-length should be taken into account when scoring queries.
+ Accepts `true` or `false` (default). + +<>:: + + Accepts a string value which is substituted for any explicit `null` + values. Defaults to `null`, which means the field is treated as missing. + +<>:: + + Whether the field value should be stored and retrievable separately from + the <> field. Accepts `true` or `false` + (default). + +<>:: + + The <> that should be used at search time on + <> fields. Defaults to the `analyzer` setting. + +<>:: + + Which scoring algorithm or _similarity_ should be used. Defaults + to `classic`, which uses TF/IDF. + diff --git a/docs/reference/mapping/types/object.asciidoc b/docs/reference/mapping/types/object.asciidoc index 0d159d7e1ef..6d35e6aee97 100644 --- a/docs/reference/mapping/types/object.asciidoc +++ b/docs/reference/mapping/types/object.asciidoc @@ -46,16 +46,15 @@ PUT my_index "my_type": { <1> "properties": { "region": { - "type": "string", - "index": "not_analyzed" + "type": "keyword" }, "manager": { <2> "properties": { "age": { "type": "integer" }, "name": { <3> "properties": { - "first": { "type": "string" }, - "last": { "type": "string" } + "first": { "type": "text" }, + "last": { "type": "text" } } } } diff --git a/docs/reference/mapping/types/string.asciidoc b/docs/reference/mapping/types/string.asciidoc index 6ff78aa4732..88ca4e1d920 100644 --- a/docs/reference/mapping/types/string.asciidoc +++ b/docs/reference/mapping/types/string.asciidoc @@ -1,179 +1,4 @@ [[string]] === String datatype -Fields of type `string` accept text values. Strings may be sub-divided into: - -Full text:: -+ --- - -Full text values, like the body of an email, are typically used for text based -relevance searches, such as: _Find the most relevant documents that match a -query for "quick brown fox"_. - -These fields are `analyzed`, that is they are passed through an -<> to convert the string into a list of individual terms -before being indexed. 
The analysis process allows Elasticsearch to search for -individual words _within_ each full text field. Full text fields are not -used for sorting and seldom used for aggregations (although the -<> is a notable exception). - --- - -Keywords:: - -Keywords are exact values like email addresses, hostnames, status codes, or -tags. They are typically used for filtering (_Find me all blog posts where -++status++ is ++published++_), for sorting, and for aggregations. Keyword -fields are `not_analyzed`. Instead, the exact string value is added to the -index as a single term. - -Below is an example of a mapping for a full text (`analyzed`) and a keyword -(`not_analyzed`) string field: - -[source,js] --------------------------------- -PUT my_index -{ - "mappings": { - "my_type": { - "properties": { - "full_name": { <1> - "type": "string" - }, - "status": { - "type": "string", <2> - "index": "not_analyzed" - } - } - } - } -} --------------------------------- -// AUTOSENSE -<1> The `full_name` field is an `analyzed` full text field -- `index:analyzed` is the default. -<2> The `status` field is a `not_analyzed` keyword field. - -Sometimes it is useful to have both a full text (`analyzed`) and a keyword -(`not_analyzed`) version of the same field: one for full text search and the -other for aggregations and sorting. This can be achieved with -<>. - - -[[string-params]] -==== Parameters for string fields - -The following parameters are accepted by `string` fields: - -[horizontal] - -<>:: - - The <> which should be used for - <> string fields, both at index-time and at - search-time (unless overridden by the <>). - Defaults to the default index analyzer, or the - <>. - -<>:: - - Mapping field-level query time boosting. Accepts a floating point number, defaults - to `1.0`. - -<>:: - - Should the field be stored on disk in a column-stride fashion, so that it - can later be used for sorting, aggregations, or scripting? Accepts `true` - or `false`. 
Defaults to `true` for `not_analyzed` fields. Analyzed fields - do not support doc values. - -<>:: - - Can the field use in-memory fielddata for sorting, aggregations, - or scripting? Accepts `disabled` or `paged_bytes` (default). - Not analyzed fields will use <> in preference - to fielddata. - -<>:: - - Multi-fields allow the same string value to be indexed in multiple ways for - different purposes, such as one field for search and a multi-field for - sorting and aggregations, or the same string value analyzed by different - analyzers. - -<>:: - - Do not index or analyze any string longer than this value. Defaults to `0` (disabled). - -<>:: - - Whether or not the field value should be included in the - <> field? Accepts `true` or `false`. Defaults - to `false` if <> is set to `no`, or if a parent - <> field sets `include_in_all` to `false`. - Otherwise defaults to `true`. - -<>:: - - Should the field be searchable? Accepts `analyzed` (default, treat as full-text field), - `not_analyzed` (treat as keyword field) and `no`. - -<>:: - - What information should be stored in the index, for search and highlighting purposes. - Defaults to `positions` for <> fields, and to `docs` for - `not_analyzed` fields. - - -<>:: -+ --- - -Whether field-length should be taken into account when scoring queries. -Defaults depend on the <> setting: - -* `analyzed` fields default to `{ "enabled": true, "loading": "lazy" }`. -* `not_analyzed` fields default to `{ "enabled": false }`. --- - -<>:: - - Accepts a string value which is substituted for any explicit `null` - values. Defaults to `null`, which means the field is treated as missing. - If the field is `analyzed`, the `null_value` will also be analyzed. - -<>:: - - The number of fake term positions which should be inserted between - each element of an array of strings. Defaults to 0. - The number of fake term position which should be inserted between each - element of an array of strings. 
Defaults to the position_increment_gap - configured on the analyzer which defaults to 100. 100 was chosen because it - prevents phrase queries with reasonably large slops (less than 100) from - matching terms across field values. - -<>:: - - Whether the field value should be stored and retrievable separately from - the <> field. Accepts `true` or `false` - (default). - -<>:: - - The <> that should be used at search time on - <> fields. Defaults to the `analyzer` setting. - -<>:: - - The <> that should be used at search time when a - phrase is encountered. Defaults to the `search_analyzer` setting. - -<>:: - - Which scoring algorithm or _similarity_ should be used. Defaults - to `classic`, which uses TF/IDF. - -<>:: - - Whether term vectors should be stored for an <> - field. Defaults to `no`. +NOTE: The `string` field has been removed in favor of the `text` and `keyword` fields. diff --git a/docs/reference/mapping/types/text.asciidoc b/docs/reference/mapping/types/text.asciidoc new file mode 100644 index 00000000000..7798b2c41f0 --- /dev/null +++ b/docs/reference/mapping/types/text.asciidoc @@ -0,0 +1,139 @@ +[[text]] +=== Text datatype + +A field to index full-text values, such as the body of an email or the +description of a product. These fields are `analyzed`, that is they are passed through an +<> to convert the string into a list of individual terms +before being indexed. The analysis process allows Elasticsearch to search for +individual words _within_ each full text field. Text fields are not +used for sorting and seldom used for aggregations (although the +<> +is a notable exception). + +If you need to index structured content such as email addresses, hostnames, status +codes, or tags, it is likely that you should rather use a <> field.
+ +Below is an example of a mapping for a text field: + +[source,js] +-------------------------------- +PUT my_index +{ + "mappings": { + "my_type": { + "properties": { + "full_name": { + "type": "text" + } + } + } + } +} +-------------------------------- +// AUTOSENSE + +Sometimes it is useful to have both a full text (`text`) and a keyword +(`keyword`) version of the same field: one for full text search and the +other for aggregations and sorting. This can be achieved with +<>. + +[[text-params]] +==== Parameters for text fields + +The following parameters are accepted by `text` fields: + +[horizontal] + +<>:: + + The <> which should be used for + <> string fields, both at index-time and at + search-time (unless overridden by the <>). + Defaults to the default index analyzer, or the + <>. + +<>:: + + Mapping field-level query time boosting. Accepts a floating point number, defaults + to `1.0`. + +<>:: + + Should global ordinals be loaded eagerly on refresh? Accepts `true` or `false` + (default). Enabling this is a good idea on fields that are frequently used for + (significant) terms aggregations. + +<>:: + + Can the field use in-memory fielddata for sorting, aggregations, + or scripting? Accepts `true` or `false` (default). + +<>:: + + Expert settings which allow to decide which values to load in memory when `fielddata` + is enabled. By default all values are loaded. + +<>:: + + Multi-fields allow the same string value to be indexed in multiple ways for + different purposes, such as one field for search and a multi-field for + sorting and aggregations, or the same string value analyzed by different + analyzers. + +<>:: + + Whether or not the field value should be included in the + <> field? Accepts `true` or `false`. Defaults + to `false` if <> is set to `no`, or if a parent + <> field sets `include_in_all` to `false`. + Otherwise defaults to `true`. + +<>:: + + Should the field be searchable? Accepts `true` (default) or `false`. 
+ +<>:: + + What information should be stored in the index, for search and highlighting purposes. + Defaults to `positions`. + +<>:: + + Whether field-length should be taken into account when scoring queries. + Accepts `true` (default) or `false`. + +<>:: + + The number of fake term positions which should be inserted between + each element of an array of strings. Defaults to 0. + The number of fake term position which should be inserted between each + element of an array of strings. Defaults to the position_increment_gap + configured on the analyzer which defaults to 100. 100 was chosen because it + prevents phrase queries with reasonably large slops (less than 100) from + matching terms across field values. + +<>:: + + Whether the field value should be stored and retrievable separately from + the <> field. Accepts `true` or `false` + (default). + +<>:: + + The <> that should be used at search time on + <> fields. Defaults to the `analyzer` setting. + +<>:: + + The <> that should be used at search time when a + phrase is encountered. Defaults to the `search_analyzer` setting. + +<>:: + + Which scoring algorithm or _similarity_ should be used. Defaults + to `classic`, which uses TF/IDF. + +<>:: + + Whether term vectors should be stored for an <> + field. Defaults to `no`. 
diff --git a/docs/reference/mapping/types/token-count.asciidoc b/docs/reference/mapping/types/token-count.asciidoc index ca2ed6f0428..baa6d409ead 100644 --- a/docs/reference/mapping/types/token-count.asciidoc +++ b/docs/reference/mapping/types/token-count.asciidoc @@ -15,7 +15,7 @@ PUT my_index "my_type": { "properties": { "name": { <1> - "type": "string", + "type": "text", "fields": { "length": { <2> "type": "token_count", diff --git a/docs/reference/migration/migrate_5_0/mapping.asciidoc b/docs/reference/migration/migrate_5_0/mapping.asciidoc index 768a2438d3e..23298cd733c 100644 --- a/docs/reference/migration/migrate_5_0/mapping.asciidoc +++ b/docs/reference/migration/migrate_5_0/mapping.asciidoc @@ -16,6 +16,26 @@ values. For backwards compatibility purposes, during the 5.x series: with `string` fields are no longer possible with `text`/`keyword` fields such as enabling `term_vectors` on a not-analyzed `keyword` field. +==== Default string mappings + +String mappings now have the following default mappings: + +[source,json] +--------------- +{ + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } +} +--------------- + +This allows you to perform full-text search on the original field name and to sort +and run aggregations on the sub keyword field. + ==== `index` property On all field datatypes (except for the deprecated `string` field), the `index` @@ -35,12 +55,22 @@ now defaults to using `float` instead of `double`. The reasoning is that floats should be more than enough for most cases but would decrease storage requirements significantly. +==== `norms` + +`norms` now takes a boolean instead of an object. This boolean is the replacement +for `norms.enabled`. There is no replacement for `norms.loading` since eager +loading of norms is not useful anymore now that norms are disk-based. + ==== `fielddata.format` Setting `fielddata.format: doc_values` in the mappings used to implicitly enable doc-values on a field.
This no longer works: the only way to enable or disable doc-values is by using the `doc_values` property of mappings. +==== `fielddata.frequency.regex` + +Regex filters are not supported anymore and will be dropped on upgrade. + ==== Source-transform removed The source `transform` feature has been removed. Instead, use an ingest pipeline diff --git a/docs/reference/query-dsl/exists-query.asciidoc b/docs/reference/query-dsl/exists-query.asciidoc index 404dce4a4ae..b484d47f4b6 100644 --- a/docs/reference/query-dsl/exists-query.asciidoc +++ b/docs/reference/query-dsl/exists-query.asciidoc @@ -47,7 +47,7 @@ instance, if the `user` field were mapped as follows: [source,js] -------------------------------------------------- "user": { - "type": "string", + "type": "text", "null_value": "_null_" } -------------------------------------------------- diff --git a/docs/reference/query-dsl/mlt-query.asciidoc b/docs/reference/query-dsl/mlt-query.asciidoc index ce2d34144ee..d2d521f9492 100644 --- a/docs/reference/query-dsl/mlt-query.asciidoc +++ b/docs/reference/query-dsl/mlt-query.asciidoc @@ -116,18 +116,18 @@ curl -s -XPUT 'http://localhost:9200/imdb/' -d '{ "movies": { "properties": { "title": { - "type": "string", + "type": "text", "term_vector": "yes" }, "description": { - "type": "string" + "type": "text" }, "tags": { - "type": "string", + "type": "text", "fields" : { "raw": { - "type" : "string", - "index" : "not_analyzed", + "type" : "text", + "analyzer": "keyword", "term_vector" : "yes" } } diff --git a/docs/reference/query-dsl/term-query.asciidoc b/docs/reference/query-dsl/term-query.asciidoc index 85608ca3aa5..801abf65af8 100644 --- a/docs/reference/query-dsl/term-query.asciidoc +++ b/docs/reference/query-dsl/term-query.asciidoc @@ -49,13 +49,13 @@ GET /_search .Why doesn't the `term` query match my document? 
************************************************** -String fields can be `analyzed` (treated as full text, like the body of an -email), or `not_analyzed` (treated as exact values, like an email address or a -zip code). Exact values (like numbers, dates, and `not_analyzed` strings) have +String fields can be of type `text` (treated as full text, like the body of an +email), or `keyword` (treated as exact values, like an email address or a +zip code). Exact values (like numbers, dates, and keywords) have the exact value specified in the field added to the inverted index in order to make them searchable. -By default, however, `string` fields are `analyzed`. This means that their +However, `text` fields are `analyzed`. This means that their values are first passed through an <> to produce a list of terms, which are then added to the inverted index. @@ -70,7 +70,7 @@ within a big block of full text. The `term` query looks for the *exact* term in the field's inverted index -- it doesn't know anything about the field's analyzer. This makes it useful for -looking up values in `not_analyzed` string fields, or in numeric or date +looking up values in keyword fields, or in numeric or date fields. When querying full text fields, use the <> instead, which understands how the field has been analyzed. @@ -86,11 +86,10 @@ PUT my_index "my_type": { "properties": { "full_text": { - "type": "string" <1> + "type": "text" <1> }, "exact_value": { - "type": "string", - "index": "not_analyzed" <2> + "type": "keyword" <2> } } } @@ -105,8 +104,8 @@ PUT my_index/my_type/1 -------------------------------------------------- // AUTOSENSE -<1> The `full_text` field is `analyzed` by default. -<2> The `exact_value` field is set to be `not_analyzed`. +<1> The `full_text` field is of type `text` and will be analyzed. +<2> The `exact_value` field is of type `keyword` and will NOT be analyzed. <3> The `full_text` inverted index will contain the terms: [`quick`, `foxes`]. 
<4> The `exact_value` inverted index will contain the exact term: [`Quick Foxes!`].