Document 5.0 mapping changes.

This commit is contained in:
Adrien Grand 2016-03-18 17:01:27 +01:00
parent 3764b3ff80
commit b42f66c8ac
47 changed files with 430 additions and 527 deletions

View File

@ -16,7 +16,7 @@ price for the product. The mapping could look like:
"resellers" : { <1>
"type" : "nested",
"properties" : {
"name" : { "type" : "string" },
"name" : { "type" : "text" },
"price" : { "type" : "double" }
}
}

View File

@ -22,12 +22,12 @@ the issue documents as nested documents. The mapping could look like:
"issue" : {
"properties" : {
"tags" : { "type" : "string" }
"tags" : { "type" : "text" },
"comments" : { <1>
"type" : "nested",
"properties" : {
"username" : { "type" : "string", "index" : "not_analyzed" },
"comment" : { "type" : "string" }
"username" : { "type" : "keyword" },
"comment" : { "type" : "text" }
}
}
}

View File

@ -4,4 +4,4 @@
An analyzer of type `keyword` that "tokenizes" an entire stream as a
single token. This is useful for data like zip codes, ids and so on.
Note, when using mapping definitions, it might make more sense to simply
mark the field as `not_analyzed`.
map the field as a <<keyword,`keyword`>>.

View File

@ -136,13 +136,13 @@ curl -s -XPUT 'http://localhost:9200/twitter/' -d '{
"tweet": {
"properties": {
"text": {
"type": "string",
"type": "text",
"term_vector": "with_positions_offsets_payloads",
"store" : true,
"analyzer" : "fulltext_analyzer"
},
"fullname": {
"type": "string",
"type": "text",
"term_vector": "with_positions_offsets_payloads",
"analyzer" : "fulltext_analyzer"
}

View File

@ -281,7 +281,7 @@ PUT test
"test": {
"dynamic": false, <1>
"properties": {
"text": {"type": "string"}
"text": {"type": "text"}
}
}
}
@ -300,8 +300,8 @@ POST test/test?refresh
PUT test/_mapping/test <2>
{
"properties": {
"text": {"type": "string"},
"flag": {"type": "string", "analyzer": "keyword"}
"text": {"type": "text"},
"flag": {"type": "text", "analyzer": "keyword"}
}
}
--------------------------------------------------

View File

@ -39,7 +39,7 @@ Here we configure the DFRSimilarity so it can be referenced as
{
"book" : {
"properties" : {
"title" : { "type" : "string", "similarity" : "my_similarity" }
"title" : { "type" : "text", "similarity" : "my_similarity" }
}
}
--------------------------------------------------

View File

@ -116,8 +116,7 @@ curl -XPUT 'http://localhost:9200/test1' -d '{
"type1": {
"properties": {
"user" : {
"type": "string",
"index": "not_analyzed"
"type": "keyword"
}
}
}

View File

@ -78,7 +78,7 @@ curl -XPOST localhost:9200/test -d '{
"mappings" : {
"type1" : {
"properties" : {
"field1" : { "type" : "string", "index" : "not_analyzed" }
"field1" : { "type" : "keyword" }
}
}
}

View File

@ -22,7 +22,7 @@ For which the response is (assuming `text` is a default string field):
"text": {
"full_name": "text",
"mapping": {
"text": { "type": "string" }
"text": { "type": "text" }
}
}
}
@ -73,13 +73,13 @@ For example, consider the following mapping:
{
"article": {
"properties": {
"id": { "type": "string" },
"title": { "type": "string"},
"abstract": { "type": "string"},
"id": { "type": "text" },
"title": { "type": "text"},
"abstract": { "type": "text"},
"author": {
"properties": {
"id": { "type": "string" },
"name": { "type": "string" }
"id": { "type": "text" },
"name": { "type": "text" }
}
}
}
@ -105,19 +105,19 @@ returns:
"abstract": {
"full_name": "abstract",
"mapping": {
"abstract": { "type": "string" }
"abstract": { "type": "text" }
}
},
"author.id": {
"full_name": "author.id",
"mapping": {
"id": { "type": "string" }
"id": { "type": "text" }
}
},
"name": {
"full_name": "author.name",
"mapping": {
"name": { "type": "string" }
"name": { "type": "text" }
}
}
}

View File

@ -12,7 +12,7 @@ PUT twitter <1>
"tweet": {
"properties": {
"message": {
"type": "string"
"type": "text"
}
}
}
@ -23,7 +23,7 @@ PUT twitter/_mapping/user <2>
{
"properties": {
"name": {
"type": "string"
"type": "text"
}
}
}
@ -32,7 +32,7 @@ PUT twitter/_mapping/tweet <3>
{
"properties": {
"user_name": {
"type": "string"
"type": "text"
}
}
}
@ -86,13 +86,12 @@ PUT my_index <1>
"name": {
"properties": {
"first": {
"type": "string"
"type": "text"
}
}
},
"user_id": {
"type": "string",
"index": "not_analyzed"
"type": "keyword"
}
}
}
@ -105,13 +104,12 @@ PUT my_index/_mapping/user
"name": {
"properties": {
"last": { <2>
"type": "string"
"type": "text"
}
}
},
"user_id": {
"type": "string",
"index": "not_analyzed",
"type": "keyword",
"ignore_above": 100 <3>
}
}
@ -149,7 +147,7 @@ PUT my_index
"type_one": {
"properties": {
"text": { <1>
"type": "string",
"type": "text",
"analyzer": "standard"
}
}
@ -157,7 +155,7 @@ PUT my_index
"type_two": {
"properties": {
"text": { <1>
"type": "string",
"type": "text",
"analyzer": "standard"
}
}
@ -169,7 +167,7 @@ PUT my_index/_mapping/type_one <2>
{
"properties": {
"text": {
"type": "string",
"type": "text",
"analyzer": "standard",
"search_analyzer": "whitespace"
}
@ -180,7 +178,7 @@ PUT my_index/_mapping/type_one?update_all_types <3>
{
"properties": {
"text": {
"type": "string",
"type": "text",
"analyzer": "standard",
"search_analyzer": "whitespace"
}

View File

@ -46,7 +46,7 @@ Fields with the same name in different mapping types in the same index
Each field has a data `type` which can be:
* a simple type like <<string,`string`>>, <<date,`date`>>, <<number,`long`>>,
* a simple type like <<text,`text`>>, <<keyword,`keyword`>>, <<date,`date`>>, <<number,`long`>>,
<<number,`double`>>, <<boolean,`boolean`>> or <<ip,`ip`>>.
* a type which supports the hierarchical nature of JSON such as
<<object,`object`>> or <<nested,`nested`>>.
@ -55,7 +55,7 @@ Each field has a data `type` which can be:
It is often useful to index the same field in different ways for different
purposes. For instance, a `string` field could be <<mapping-index,indexed>> as
an `analyzed` field for full-text search, and as a `not_analyzed` field for
a `text` field for full-text search, and as a `keyword` field for
sorting or aggregations. Alternatively, you could index a string field with
the <<analysis-standard-analyzer,`standard` analyzer>>, the
<<english-analyzer,`english`>> analyzer, and the
@ -134,18 +134,17 @@ PUT my_index <1>
"user": { <2>
"_all": { "enabled": false }, <3>
"properties": { <4>
"title": { "type": "string" }, <5>
"name": { "type": "string" }, <5>
"title": { "type": "text" }, <5>
"name": { "type": "text" }, <5>
"age": { "type": "integer" } <5>
}
},
"blogpost": { <2>
"properties": { <4>
"title": { "type": "string" }, <5>
"body": { "type": "string" }, <5>
"title": { "type": "text" }, <5>
"body": { "type": "text" }, <5>
"user_id": {
"type": "string", <5>
"index": "not_analyzed"
"type": "keyword" <5>
},
"created": {
"type": "date", <5>

View File

@ -56,11 +56,10 @@ PUT _template/logging
"strings": { <4>
"match_mapping_type": "string",
"mapping": {
"type": "string",
"type": "text",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"type": "keyword",
"ignore_above": 256
}
}
@ -79,4 +78,4 @@ PUT logs-2015.10.01/event/1
<1> The `logging` template will match any indices beginning with `logs-`.
<2> Matching indices will be created with a single primary shard.
<3> The `_all` field will be disabled by default for new type mappings.
<4> String fields will be created with an `analyzed` main field, and a `not_analyzed` `.raw` field.
<4> String fields will be created with a `text` main field, and a `keyword` `.raw` field.

View File

@ -22,7 +22,7 @@ string:: Either a <<date,`date`>> field
(if the value passes <<date-detection,date detection>>),
a <<number,`double`>> or <<number,`long`>> field
(if the value passes <<numeric-detection,numeric detection>>)
or an <<mapping-index,`analyzed`>> <<string,`string`>> field.
or a <<text,`text`>> field.
These are the only <<mapping-types,field datatypes>> that are dynamically
detected. All other datatypes must be mapped explicitly.
@ -81,7 +81,7 @@ PUT my_index/my_type/1 <1>
--------------------------------------------------
// AUTOSENSE
<1> The `create_date` field has been added as a <<string,`string`>> field.
<1> The `create_date` field has been added as a <<text,`text`>> field.
===== Customising detected date formats

View File

@ -52,7 +52,7 @@ can be automatically detected: `boolean`, `date`, `double`, `long`, `object`,
`string`. It also accepts `*` to match all datatypes.
For example, if we wanted to map all integer fields as `integer` instead of
`long`, and all `string` fields as both `analyzed` and `not_analyzed`, we
`long`, and all `string` fields as both `text` and `keyword`, we
could use the following template:
[source,js]
@ -74,11 +74,10 @@ PUT my_index
"strings": {
"match_mapping_type": "string",
"mapping": {
"type": "string",
"type": "text",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed",
"type": "keyword",
"ignore_above": 256
}
}
@ -99,7 +98,7 @@ PUT my_index/my_type/1
--------------------------------------------------
// AUTOSENSE
<1> The `my_integer` field is mapped as an `integer`.
<2> The `my_string` field is mapped as an analyzed `string`, with a `not_analyzed` <<multi-fields,multi field>>.
<2> The `my_string` field is mapped as a `text` field, with a `keyword` <<multi-fields,multi field>>.
[[match-unmatch]]
@ -180,7 +179,7 @@ PUT my_index
"path_match": "name.*",
"path_unmatch": "*.middle",
"mapping": {
"type": "string",
"type": "text",
"copy_to": "full_name"
}
}
@ -221,7 +220,7 @@ PUT my_index
"match_mapping_type": "string",
"match": "*",
"mapping": {
"type": "string",
"type": "text",
"analyzer": "{name}"
}
}

View File

@ -45,7 +45,7 @@ from each field as a string. It does not combine the _terms_ from each field.
=============================================================================
The `_all` field is just a <<string,`string`>> field, and accepts the same
The `_all` field is just a <<text,`text`>> field, and accepts the same
parameters that other `text` fields accept, including `analyzer`,
`term_vectors`, `index_options`, and `store`.
@ -136,7 +136,7 @@ PUT my_index
},
"properties": {
"content": {
"type": "string"
"type": "text"
}
}
}
@ -172,11 +172,11 @@ PUT myindex
"mytype": {
"properties": {
"title": { <1>
"type": "string",
"type": "text",
"boost": 2
},
"content": { <1>
"type": "string"
"type": "text"
}
}
}
@ -210,15 +210,15 @@ PUT myindex
"mytype": {
"properties": {
"first_name": {
"type": "string",
"type": "text",
"copy_to": "full_name" <1>
},
"last_name": {
"type": "string",
"type": "text",
"copy_to": "full_name" <1>
},
"full_name": {
"type": "string"
"type": "text"
}
}
}

View File

@ -127,7 +127,7 @@ global ordinals for the `_parent` field.
Global ordinals, by default, are built lazily: the first parent-child query or
aggregation after a refresh will trigger building of global ordinals. This can
introduce a significant latency spike for your users. You can use
<<fielddata-loading,eager_global_ordinals>> to shift the cost of building global
<<global-ordinals,eager_global_ordinals>> to shift the cost of building global
ordinals from query time to refresh time, by mapping the `_parent` field as follows:
[source,js]
@ -139,9 +139,7 @@ PUT my_index
"my_child": {
"_parent": {
"type": "my_parent",
"fielddata": {
"loading": "eager_global_ordinals"
}
"eager_global_ordinals": true
}
}
}

View File

@ -47,10 +47,10 @@ PUT my_index
"my_type": {
"properties": {
"text": { <1>
"type": "string",
"type": "text",
"fields": {
"english": { <2>
"type": "string",
"type": "text",
"analyzer": "english"
}
}
@ -124,7 +124,7 @@ PUT /my_index
"my_type":{
"properties":{
"title": {
"type":"string",
"type":"text",
"analyzer":"my_analyzer", <3>
"search_analyzer":"my_stop_analyzer", <4>
"search_quote_analyzer":"my_analyzer" <5>

View File

@ -12,11 +12,11 @@ PUT my_index
"my_type": {
"properties": {
"title": {
"type": "string",
"type": "text",
"boost": 2 <1>
},
"content": {
"type": "string"
"type": "text"
}
}
}

View File

@ -15,15 +15,15 @@ PUT /my_index
"my_type": {
"properties": {
"first_name": {
"type": "string",
"type": "text",
"copy_to": "full_name" <1>
},
"last_name": {
"type": "string",
"type": "text",
"copy_to": "full_name" <1>
},
"full_name": {
"type": "string"
"type": "text"
}
}
}

View File

@ -29,12 +29,10 @@ PUT my_index
"my_type": {
"properties": {
"status_code": { <1>
"type": "string",
"index": "not_analyzed"
"type": "keyword"
},
"session_id": { <2>
"type": "string",
"index": "not_analyzed",
"type": "keyword",
"doc_values": false
}
}

View File

@ -67,7 +67,7 @@ PUT my_index
"user": { <2>
"properties": {
"name": {
"type": "string"
"type": "text"
},
"social_networks": { <3>
"dynamic": true,

View File

@ -21,8 +21,7 @@ PUT my_index
"session": {
"properties": {
"user_id": {
"type": "string",
"index": "not_analyzed"
"type": "keyword"
},
"last_updated": {
"type": "date"

View File

@ -12,28 +12,28 @@ documents, we need to be able to look up the document and find the terms that
it has in a field.
Most fields can use index-time, on-disk <<doc-values,`doc_values`>> to support
this type of data access pattern, but `analyzed` string fields do not support
`doc_values`.
this type of data access pattern, but `text` fields do not support `doc_values`.
Instead, `analyzed` strings use a query-time data structure called
Instead, `text` fields use a query-time data structure called
`fielddata`. This data structure is built on demand the first time that a
field is used for aggregations, sorting, or is accessed in a script. It is built
by reading the entire inverted index for each segment from disk, inverting the
term ↔︎ document relationship, and storing the result in memory, in the
JVM heap.
Loading fielddata is an expensive process so, once it has been loaded, it
remains in memory for the lifetime of the segment.
Loading fielddata is an expensive process so it is disabled by default. Also,
when enabled, once it has been loaded, it remains in memory for the lifetime of
the segment.
[WARNING]
.Fielddata can fill up your heap space
==============================================================================
Fielddata can consume a lot of heap space, especially when loading high
cardinality `analyzed` string fields. Most of the time, it doesn't make sense
to sort or aggregate on `analyzed` string fields (with the notable exception
cardinality `text` fields. Most of the time, it doesn't make sense
to sort or aggregate on `text` fields (with the notable exception
of the
<<search-aggregations-bucket-significantterms-aggregation,`significant_terms`>>
aggregation). Always think about whether a `not_analyzed` field (which can
aggregation). Always think about whether a <<keyword,`keyword`>> field (which can
use `doc_values`) would be a better fit for your use case.
==============================================================================
@ -42,71 +42,6 @@ same name in the same index. Its value can be updated on existing fields
using the <<indices-put-mapping,PUT mapping API>>.
[[fielddata-format]]
==== `fielddata.format`
For `analyzed` string fields, the fielddata `format` controls whether
fielddata should be enabled or not. It accepts: `disabled` and `paged_bytes`
(enabled, which is the default). To disable fielddata loading, you can use
the following mapping:
[source,js]
--------------------------------------------------
PUT my_index
{
"mappings": {
"my_type": {
"properties": {
"text": {
"type": "string",
"fielddata": {
"format": "disabled" <1>
}
}
}
}
}
}
--------------------------------------------------
// AUTOSENSE
<1> The `text` field cannot be used for sorting, aggregations, or in scripts.
.Fielddata and other datatypes
[NOTE]
==================================================
Historically, other field datatypes also used fielddata, but this has been replaced
by index-time, disk-based <<doc-values,`doc_values`>>.
==================================================
[[fielddata-loading]]
==== `fielddata.loading`
This per-field setting controls when fielddata is loaded into memory. It
accepts three options:
[horizontal]
`lazy`::
Fielddata is only loaded into memory when it is needed. (default)
`eager`::
Fielddata is loaded into memory before a new search segment becomes
visible to search. This can reduce the latency that a user may experience
if their search request has to trigger lazy loading from a big segment.
`eager_global_ordinals`::
Loading fielddata into memory is only part of the work that is required.
After loading the fielddata for each segment, Elasticsearch builds the
<<global-ordinals>> data structure to make a list of all unique terms
across all the segments in a shard. By default, global ordinals are built
lazily. If the field has a very high cardinality, global ordinals may
take some time to build, in which case you can use eager loading instead.
[[global-ordinals]]
.Global ordinals
*****************************************
@ -141,15 +76,10 @@ can move the loading time from the first search request, to the refresh itself.
*****************************************
[[field-data-filtering]]
==== `fielddata.filter`
==== `fielddata_frequency_filter`
Fielddata filtering can be used to reduce the number of terms loaded into
memory, and thus reduce memory usage. Terms can be filtered by _frequency_ or
by _regular expression_, or a combination of the two:
Filtering by frequency::
+
--
memory, and thus reduce memory usage. Terms can be filtered by _frequency_:
The frequency filter allows you to only load terms whose term frequency falls
between a `min` and `max` value, which can be expressed as an absolute
@ -169,7 +99,7 @@ PUT my_index
"my_type": {
"properties": {
"tag": {
"type": "string",
"type": "text",
"fielddata": {
"filter": {
"frequency": {
@ -186,44 +116,3 @@ PUT my_index
}
--------------------------------------------------
// AUTOSENSE
--
Filtering by regex::
+
--
Terms can also be filtered by regular expression - only values which
match the regular expression are loaded. Note: the regular expression is
applied to each term in the field, not to the whole field value. For
instance, to only load hashtags from a tweet, we can use a regular
expression which matches terms beginning with `#`:
[source,js]
--------------------------------------------------
PUT my_index
{
"mappings": {
"my_type": {
"properties": {
"tweet": {
"type": "string",
"analyzer": "whitespace",
"fielddata": {
"filter": {
"regex": {
"pattern": "^#.*"
}
}
}
}
}
}
}
}
--------------------------------------------------
// AUTOSENSE
--
These filters can be updated on an existing field mapping and will take
effect the next time the fielddata for a segment is loaded. Use the
<<indices-clearcache,Clear Cache>> API
to reload the fielddata using the new filters.

View File

@ -1,12 +1,7 @@
[[ignore-above]]
=== `ignore_above`
Strings longer than the `ignore_above` setting will not be processed by the
<<analyzer,analyzer>> and will not be indexed. This is mainly useful for
<<mapping-index,`not_analyzed`>> string fields, which are typically used for
filtering, aggregations, and sorting. These are structured fields and it
doesn't usually make sense to allow very long terms to be indexed in these
fields.
Strings longer than the `ignore_above` setting will not be indexed or stored.
[source,js]
--------------------------------------------------
@ -16,8 +11,7 @@ PUT my_index
"my_type": {
"properties": {
"message": {
"type": "string",
"index": "not_analyzed",
"type": "keyword",
"ignore_above": 20 <1>
}
}

View File

@ -14,10 +14,10 @@ PUT my_index
"my_type": {
"properties": {
"title": { <1>
"type": "string"
"type": "text"
},
"content": { <1>
"type": "string"
"type": "text"
},
"date": { <2>
"type": "date",
@ -50,18 +50,18 @@ PUT my_index
"my_type": {
"include_in_all": false, <1>
"properties": {
"title": { "type": "string" },
"title": { "type": "text" },
"author": {
"include_in_all": true, <2>
"properties": {
"first_name": { "type": "string" },
"last_name": { "type": "string" }
"first_name": { "type": "text" },
"last_name": { "type": "text" }
}
},
"editor": {
"properties": {
"first_name": { "type": "string" }, <3>
"last_name": { "type": "string", "include_in_all": true } <3>
"first_name": { "type": "text" }, <3>
"last_name": { "type": "text", "include_in_all": true } <3>
}
}
}

View File

@ -39,7 +39,7 @@ PUT my_index
"my_type": {
"properties": {
"text": {
"type": "string",
"type": "text",
"index_options": "offsets"
}
}

View File

@ -1,48 +1,6 @@
[[mapping-index]]
=== `index`
The `index` option controls how field values are indexed and, thus, how they
are searchable. It accepts three values:
The `index` option controls whether field values are indexed. It accepts `true`
or `false`. Fields that are not indexed are not queryable.
[horizontal]
`no`::
Do not add this field value to the index. With this setting, the field
will not be queryable.
`not_analyzed`::
Add the field value to the index unchanged, as a single term. This is the
default for all fields that support this option except for
<<string,`string`>> fields. `not_analyzed` fields are usually used with
<<term-level-queries,term-level queries>> for structured search.
`analyzed`::
This option applies only to `string` fields, for which it is the default.
The string field value is first <<analysis,analyzed>> to convert the
string into terms (e.g. a list of individual words), which are then
indexed. At search time, the query string is passed through
(<<search-analyzer,usually>>) the same analyzer to generate terms
in the same format as those in the index. It is this process that enables
<<full-text-queries,full text search>>.
For example, you can create a `not_analyzed` string field with the following:
[source,js]
--------------------------------------------------
PUT /my_index
{
"mappings": {
"my_type": {
"properties": {
"status_code": {
"type": "string",
"index": "not_analyzed"
}
}
}
}
}
--------------------------------------------------
// AUTOSENSE

View File

@ -3,8 +3,8 @@
It is often useful to index the same field in different ways for different
purposes. This is the purpose of _multi-fields_. For instance, a `string`
field could be <<mapping-index,indexed>> as an `analyzed` field for full-text
search, and as a `not_analyzed` field for sorting or aggregations:
field could be mapped as a `text` field for full-text
search, and as a `keyword` field for sorting or aggregations:
[source,js]
--------------------------------------------------
@ -14,11 +14,10 @@ PUT /my_index
"my_type": {
"properties": {
"city": {
"type": "string",
"type": "text",
"fields": {
"raw": { <1>
"type": "string",
"index": "not_analyzed"
"type": "keyword"
}
}
}
@ -57,8 +56,8 @@ GET /my_index/_search
}
--------------------------------------------------
// AUTOSENSE
<1> The `city.raw` field is a `not_analyzed` version of the `city` field.
<2> The analyzed `city` field can be used for full text search.
<1> The `city.raw` field is a `keyword` version of the `city` field.
<2> The `city` field can be used for full text search.
<3> The `city.raw` field can be used for sorting and aggregations
NOTE: Multi-fields do not change the original `_source` field.
@ -83,10 +82,10 @@ PUT my_index
"my_type": {
"properties": {
"text": { <1>
"type": "string",
"type": "text",
"fields": {
"english": { <2>
"type": "string",
"type": "text",
"analyzer": "english"
}
}

View File

@ -4,14 +4,14 @@
Norms store various normalization factors that are later used at query time
in order to compute the score of a document relative to a query.
Although useful for scoring, norms also require quite a lot of memory
Although useful for scoring, norms also require quite a lot of disk
(typically in the order of one byte per document per field in your index, even
for documents that don't have this specific field). As a consequence, if you
don't need scoring on a specific field, you should disable norms on that
field. In particular, this is the case for fields that are used solely for
filtering or aggregations.
TIP: The `norms.enabled` setting must have the same setting for fields of the
TIP: The `norms` setting must have the same value for fields of the
same name in the same index. Norms can be disabled on existing fields using
the <<indices-put-mapping,PUT mapping API>>.
@ -24,10 +24,8 @@ PUT my_index/_mapping/my_type
{
"properties": {
"title": {
"type": "string",
"norms": {
"enabled": false
}
"type": "text",
"norms": false
}
}
}
@ -41,31 +39,3 @@ results since some documents won't have norms anymore while other documents
might still have norms.
==== Lazy loading of norms
Norms can be loaded into memory eagerly (`eager`), whenever a new segment
comes online, or they can loaded lazily (`lazy`, default), only when the field
is queried.
Eager loading can be configured as follows:
[source,js]
------------
PUT my_index/_mapping/my_type
{
"properties": {
"title": {
"type": "string",
"norms": {
"loading": "eager"
}
}
}
}
------------
// AUTOSENSE
TIP: The `norms.loading` setting must have the same setting for fields of the
same name in the same index. Its value can be updated on existing fields
using the <<indices-put-mapping,PUT mapping API>>.

View File

@ -16,8 +16,7 @@ PUT my_index
"my_type": {
"properties": {
"status_code": {
"type": "string",
"index": "not_analyzed",
"type": "keyword",
"null_value": "NULL" <1>
}
}
@ -50,6 +49,4 @@ GET my_index/_search
<3> A query for `NULL` returns document 1, but not document 2.
IMPORTANT: The `null_value` needs to be the same datatype as the field. For
instance, a `long` field cannot have a string `null_value`. String fields
which are `analyzed` will also pass the `null_value` through the configured
analyzer.
instance, a `long` field cannot have a string `null_value`.

View File

@ -57,7 +57,7 @@ PUT my_index
"groups": {
"properties": {
"names": {
"type": "string",
"type": "text",
"position_increment_gap": 0 <1>
}
}

View File

@ -23,14 +23,14 @@ PUT my_index
"manager": { <2>
"properties": {
"age": { "type": "integer" },
"name": { "type": "string" }
"name": { "type": "text" }
}
},
"employees": { <3>
"type": "nested",
"properties": {
"age": { "type": "integer" },
"name": { "type": "string" }
"name": { "type": "text" }
}
}
}

View File

@ -41,7 +41,7 @@ PUT /my_index
"my_type": {
"properties": {
"text": {
"type": "string",
"type": "text",
"analyzer": "autocomplete", <2>
"search_analyzer": "standard" <2>
}

View File

@ -5,8 +5,8 @@ Elasticsearch allows you to configure a scoring algorithm or _similarity_ per
field. The `similarity` setting provides a simple way of choosing a similarity
algorithm other than the default TF/IDF, such as `BM25`.
Similarities are mostly useful for <<string,`string`>> fields, especially
`analyzed` string fields, but can also apply to other field types.
Similarities are mostly useful for <<text,`text`>> fields, but can also apply
to other field types.
Custom similarities can be configured by tuning the parameters of the built-in
similarities. For more details about these expert options, see the
@ -37,10 +37,10 @@ PUT my_index
"my_type": {
"properties": {
"default_field": { <1>
"type": "string"
"type": "text"
},
"bm25_field": {
"type": "string",
"type": "text",
"similarity": "BM25" <2>
}
}

View File

@ -24,7 +24,7 @@ PUT /my_index
"my_type": {
"properties": {
"title": {
"type": "string",
"type": "text",
"store": true <1>
},
"date": {
@ -32,7 +32,7 @@ PUT /my_index
"store": true <1>
},
"content": {
"type": "string"
"type": "text"
}
}
}

View File

@ -35,7 +35,7 @@ PUT my_index
"my_type": {
"properties": {
"text": {
"type": "string",
"type": "text",
"term_vector": "with_positions_offsets"
}
}

View File

@ -7,7 +7,7 @@ document:
[float]
=== Core datatypes
<<string>>:: `string`
string:: <<text,`text`>> and <<keyword,`keyword`>>
<<number>>:: `long`, `integer`, `short`, `byte`, `double`, `float`
<<date>>:: `date`
<<boolean>>:: `boolean`
@ -45,9 +45,9 @@ Attachment datatype::
=== Multi-fields
It is often useful to index the same field in different ways for different
purposes. For instance, a `string` field could be <<mapping-index,indexed>> as
an `analyzed` field for full-text search, and as a `not_analyzed` field for
sorting or aggregations. Alternatively, you could index a string field with
purposes. For instance, a `string` field could be mapped as
a `text` field for full-text search, and as a `keyword` field for
sorting or aggregations. Alternatively, you could index a text field with
the <<analysis-standard-analyzer,`standard` analyzer>>, the
<<english-analyzer,`english`>> analyzer, and the
<<french-analyzer,`french` analyzer>>.
@ -69,6 +69,8 @@ include::types/geo-shape.asciidoc[]
include::types/ip.asciidoc[]
include::types/keyword.asciidoc[]
include::types/nested.asciidoc[]
include::types/numeric.asciidoc[]
@ -77,6 +79,8 @@ include::types/object.asciidoc[]
include::types/string.asciidoc[]
include::types/text.asciidoc[]
include::types/token-count.asciidoc[]

View File

@ -13,7 +13,7 @@ PUT my_index
"my_type": {
"properties": {
"name": {
"type": "string"
"type": "text"
},
"blob": {
"type": "binary"

View File

@ -0,0 +1,111 @@
[[keyword]]
=== Keyword datatype
A field to index structured content such as email addresses, hostnames, status
codes, zip codes or tags.
They are typically used for filtering (_Find me all blog posts where
++status++ is ++published++_), for sorting, and for aggregations. Keyword
fields are only searchable by their exact value.
If you need to index full text content such as email bodies or product
descriptions, it is likely that you should rather use a <<text,`text`>> field.
Below is an example of a mapping for a keyword field:
[source,js]
--------------------------------
PUT my_index
{
"mappings": {
"my_type": {
"properties": {
"tags": {
"type": "keyword"
}
}
}
}
}
--------------------------------
// AUTOSENSE
[[keyword-params]]
==== Parameters for keyword fields
The following parameters are accepted by `keyword` fields:
[horizontal]
<<mapping-boost,`boost`>>::
Mapping field-level query time boosting. Accepts a floating point number, defaults
to `1.0`.
<<doc-values,`doc_values`>>::
Should the field be stored on disk in a column-stride fashion, so that it
can later be used for sorting, aggregations, or scripting? Accepts `true`
(default) or `false`.
<<global-ordinals,`eager_global_ordinals`>>::
Should global ordinals be loaded eagerly on refresh? Accepts `true` or `false`
(default). Enabling this is a good idea on fields that are frequently used for
terms aggregations.
<<multi-fields,`fields`>>::
Multi-fields allow the same string value to be indexed in multiple ways for
different purposes, such as one field for search and a multi-field for
sorting and aggregations.
<<ignore-above,`ignore_above`>>::
Do not index any string longer than this value. Defaults to
`2147483647` so that all values would be accepted.
<<include-in-all,`include_in_all`>>::
Whether or not the field value should be included in the
<<mapping-all-field,`_all`>> field. Accepts `true` or `false`. Defaults
to `false` if <<mapping-index,`index`>> is set to `false`, or if a parent
<<object,`object`>> field sets `include_in_all` to `false`.
Otherwise defaults to `true`.
<<mapping-index,`index`>>::
Should the field be searchable? Accepts `true` (default) or `false`.
<<index-options,`index_options`>>::
What information should be stored in the index, for scoring purposes.
Defaults to `docs` but can also be set to `freqs` to take term frequency into account
when computing scores.
<<norms,`norms`>>::
Whether field-length should be taken into account when scoring queries.
Accepts `true` or `false` (default).
<<null-value,`null_value`>>::
Accepts a string value which is substituted for any explicit `null`
values. Defaults to `null`, which means the field is treated as missing.
<<mapping-store,`store`>>::
Whether the field value should be stored and retrievable separately from
the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
(default).
<<search-analyzer,`search_analyzer`>>::
The <<analyzer,`analyzer`>> that should be used at search time on
<<mapping-index,`analyzed`>> fields. Defaults to the `analyzer` setting.
<<similarity,`similarity`>>::
Which scoring algorithm or _similarity_ should be used. Defaults
to `classic`, which uses TF/IDF.

View File

@ -46,16 +46,15 @@ PUT my_index
"my_type": { <1>
"properties": {
"region": {
"type": "string",
"index": "not_analyzed"
"type": "keyword"
},
"manager": { <2>
"properties": {
"age": { "type": "integer" },
"name": { <3>
"properties": {
"first": { "type": "string" },
"last": { "type": "string" }
"first": { "type": "text" },
"last": { "type": "text" }
}
}
}

View File

@ -1,179 +1,4 @@
[[string]]
=== String datatype
Fields of type `string` accept text values. Strings may be sub-divided into:
Full text::
+
--
Full text values, like the body of an email, are typically used for text based
relevance searches, such as: _Find the most relevant documents that match a
query for "quick brown fox"_.
These fields are `analyzed`, that is they are passed through an
<<analysis,analyzer>> to convert the string into a list of individual terms
before being indexed. The analysis process allows Elasticsearch to search for
individual words _within_ each full text field. Full text fields are not
used for sorting and seldom used for aggregations (although the
<<search-aggregations-bucket-significantterms-aggregation,significant terms aggregation>> is a notable exception).
--
Keywords::
Keywords are exact values like email addresses, hostnames, status codes, or
tags. They are typically used for filtering (_Find me all blog posts where
++status++ is ++published++_), for sorting, and for aggregations. Keyword
fields are `not_analyzed`. Instead, the exact string value is added to the
index as a single term.
Below is an example of a mapping for a full text (`analyzed`) and a keyword
(`not_analyzed`) string field:
[source,js]
--------------------------------
PUT my_index
{
"mappings": {
"my_type": {
"properties": {
"full_name": { <1>
"type": "string"
},
"status": {
"type": "string", <2>
"index": "not_analyzed"
}
}
}
}
}
--------------------------------
// AUTOSENSE
<1> The `full_name` field is an `analyzed` full text field -- `index:analyzed` is the default.
<2> The `status` field is a `not_analyzed` keyword field.
Sometimes it is useful to have both a full text (`analyzed`) and a keyword
(`not_analyzed`) version of the same field: one for full text search and the
other for aggregations and sorting. This can be achieved with
<<multi-fields,multi-fields>>.
[[string-params]]
==== Parameters for string fields
The following parameters are accepted by `string` fields:
[horizontal]
<<analyzer,`analyzer`>>::
The <<analysis,analyzer>> which should be used for
<<mapping-index,`analyzed`>> string fields, both at index-time and at
search-time (unless overridden by the <<search-analyzer,`search_analyzer`>>).
Defaults to the default index analyzer, or the
<<analysis-standard-analyzer,`standard` analyzer>>.
<<mapping-boost,`boost`>>::
Mapping field-level query time boosting. Accepts a floating point number, defaults
to `1.0`.
<<doc-values,`doc_values`>>::
Should the field be stored on disk in a column-stride fashion, so that it
can later be used for sorting, aggregations, or scripting? Accepts `true`
or `false`. Defaults to `true` for `not_analyzed` fields. Analyzed fields
do not support doc values.
<<fielddata,`fielddata`>>::
Can the field use in-memory fielddata for sorting, aggregations,
or scripting? Accepts `disabled` or `paged_bytes` (default).
Not analyzed fields will use <<doc-values,doc values>> in preference
to fielddata.
<<multi-fields,`fields`>>::
Multi-fields allow the same string value to be indexed in multiple ways for
different purposes, such as one field for search and a multi-field for
sorting and aggregations, or the same string value analyzed by different
analyzers.
<<ignore-above,`ignore_above`>>::
Do not index or analyze any string longer than this value. Defaults to `0` (disabled).
<<include-in-all,`include_in_all`>>::
Whether or not the field value should be included in the
<<mapping-all-field,`_all`>> field? Accepts `true` or `false`. Defaults
to `false` if <<mapping-index,`index`>> is set to `no`, or if a parent
<<object,`object`>> field sets `include_in_all` to `false`.
Otherwise defaults to `true`.
<<mapping-index,`index`>>::
Should the field be searchable? Accepts `analyzed` (default, treat as full-text field),
`not_analyzed` (treat as keyword field) and `no`.
<<index-options,`index_options`>>::
What information should be stored in the index, for search and highlighting purposes.
Defaults to `positions` for <<mapping-index,`analyzed`>> fields, and to `docs` for
`not_analyzed` fields.
<<norms,`norms`>>::
+
--
Whether field-length should be taken into account when scoring queries.
Defaults depend on the <<mapping-index,`index`>> setting:
* `analyzed` fields default to `{ "enabled": true, "loading": "lazy" }`.
* `not_analyzed` fields default to `{ "enabled": false }`.
--
<<null-value,`null_value`>>::
Accepts a string value which is substituted for any explicit `null`
values. Defaults to `null`, which means the field is treated as missing.
If the field is `analyzed`, the `null_value` will also be analyzed.
<<position-increment-gap,`position_increment_gap`>>::
The number of fake term positions which should be inserted between
each element of an array of strings. Defaults to 0.
The number of fake term position which should be inserted between each
element of an array of strings. Defaults to the position_increment_gap
configured on the analyzer which defaults to 100. 100 was chosen because it
prevents phrase queries with reasonably large slops (less than 100) from
matching terms across field values.
<<mapping-store,`store`>>::
Whether the field value should be stored and retrievable separately from
the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
(default).
<<search-analyzer,`search_analyzer`>>::
The <<analyzer,`analyzer`>> that should be used at search time on
<<mapping-index,`analyzed`>> fields. Defaults to the `analyzer` setting.
<<search-quote-analyzer,`search_quote_analyzer`>>::
The <<analyzer,`analyzer`>> that should be used at search time when a
phrase is encountered. Defaults to the `search_analyzer` setting.
<<similarity,`similarity`>>::
Which scoring algorithm or _similarity_ should be used. Defaults
to `classic`, which uses TF/IDF.
<<term-vector,`term_vector`>>::
Whether term vectors should be stored for an <<mapping-index,`analyzed`>>
field. Defaults to `no`.
NOTE: The `string` field has been removed in favor of the `text` and `keyword` fields.

View File

@ -0,0 +1,139 @@
[[text]]
=== Text datatype
A field to index full-text values, such as the body of an email or the
description of a product. These fields are `analyzed`, that is they are passed through an
<<analysis,analyzer>> to convert the string into a list of individual terms
before being indexed. The analysis process allows Elasticsearch to search for
individual words _within_ each full text field. Text fields are not
used for sorting and seldom used for aggregations (although the
<<search-aggregations-bucket-significantterms-aggregation,significant terms aggregation>>
is a notable exception).
If you need to index structured content such as email addresses, hostnames, status
codes, or tags, you should probably use a <<keyword,`keyword`>> field instead.
Below is an example of a mapping for a text field:
[source,js]
--------------------------------
PUT my_index
{
"mappings": {
"my_type": {
"properties": {
"full_name": {
"type": "text"
}
}
}
}
}
--------------------------------
// AUTOSENSE
Sometimes it is useful to have both a full text (`text`) and a keyword
(`keyword`) version of the same field: one for full text search and the
other for aggregations and sorting. This can be achieved with
<<multi-fields,multi-fields>>.
[[text-params]]
==== Parameters for text fields
The following parameters are accepted by `text` fields:
[horizontal]
<<analyzer,`analyzer`>>::
The <<analysis,analyzer>> which should be used for
<<mapping-index,`analyzed`>> string fields, both at index-time and at
search-time (unless overridden by the <<search-analyzer,`search_analyzer`>>).
Defaults to the default index analyzer, or the
<<analysis-standard-analyzer,`standard` analyzer>>.
<<mapping-boost,`boost`>>::
Mapping field-level query time boosting. Accepts a floating point number, defaults
to `1.0`.
<<global-ordinals,`eager_global_ordinals`>>::
Should global ordinals be loaded eagerly on refresh? Accepts `true` or `false`
(default). Enabling this is a good idea on fields that are frequently used for
(significant) terms aggregations.
<<fielddata,`fielddata`>>::
Can the field use in-memory fielddata for sorting, aggregations,
or scripting? Accepts `true` or `false` (default).
<<field-data-filtering,`fielddata_frequency_filter`>>::
Expert settings which allow you to decide which values to load in memory when
`fielddata` is enabled. By default all values are loaded.
<<multi-fields,`fields`>>::
Multi-fields allow the same string value to be indexed in multiple ways for
different purposes, such as one field for search and a multi-field for
sorting and aggregations, or the same string value analyzed by different
analyzers.
<<include-in-all,`include_in_all`>>::
Whether or not the field value should be included in the
<<mapping-all-field,`_all`>> field? Accepts `true` or `false`. Defaults
to `false` if <<mapping-index,`index`>> is set to `false`, or if a parent
<<object,`object`>> field sets `include_in_all` to `false`.
Otherwise defaults to `true`.
<<mapping-index,`index`>>::
Should the field be searchable? Accepts `true` (default) or `false`.
<<index-options,`index_options`>>::
What information should be stored in the index, for search and highlighting purposes.
Defaults to `positions`.
<<norms,`norms`>>::
Whether field-length should be taken into account when scoring queries.
Accepts `true` (default) or `false`.
<<position-increment-gap,`position_increment_gap`>>::
The number of fake term positions which should be inserted between each
element of an array of strings. Defaults to the `position_increment_gap`
configured on the analyzer, which defaults to `100`. `100` was chosen because it
prevents phrase queries with reasonably large slops (less than 100) from
matching terms across field values.
<<mapping-store,`store`>>::
Whether the field value should be stored and retrievable separately from
the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
(default).
<<search-analyzer,`search_analyzer`>>::
The <<analyzer,`analyzer`>> that should be used at search time on
<<mapping-index,`analyzed`>> fields. Defaults to the `analyzer` setting.
<<search-quote-analyzer,`search_quote_analyzer`>>::
The <<analyzer,`analyzer`>> that should be used at search time when a
phrase is encountered. Defaults to the `search_analyzer` setting.
<<similarity,`similarity`>>::
Which scoring algorithm or _similarity_ should be used. Defaults
to `classic`, which uses TF/IDF.
<<term-vector,`term_vector`>>::
Whether term vectors should be stored for an <<mapping-index,`analyzed`>>
field. Defaults to `no`.

View File

@ -15,7 +15,7 @@ PUT my_index
"my_type": {
"properties": {
"name": { <1>
"type": "string",
"type": "text",
"fields": {
"length": { <2>
"type": "token_count",

View File

@ -16,6 +16,26 @@ values. For backwards compatibility purposes, during the 5.x series:
with `string` fields are no longer possible with `text`/`keyword` fields
such as enabling `term_vector` on a not-analyzed `keyword` field.
==== Default string mappings
String fields now have the following default mapping:
[source,json]
---------------
{
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
---------------
This allows you to perform full-text search on the original field and to sort
and run aggregations on the `keyword` sub-field.
==== `index` property
On all field datatypes (except for the deprecated `string` field), the `index`
@ -35,12 +55,22 @@ now defaults to using `float` instead of `double`. The reasoning is that
floats should be more than enough for most cases but would decrease storage
requirements significantly.
==== `norms`
`norms` now takes a boolean instead of an object. This boolean is the replacement
for `norms.enabled`. There is no replacement for `norms.loading` since eager
loading of norms is not useful anymore now that norms are disk-based.
==== `fielddata.format`
Setting `fielddata.format: doc_values` in the mappings used to implicitly
enable doc-values on a field. This no longer works: the only way to enable or
disable doc-values is by using the `doc_values` property of mappings.
==== `fielddata.frequency.regex`
Regex filters are not supported anymore and will be dropped on upgrade.
==== Source-transform removed
The source `transform` feature has been removed. Instead, use an ingest pipeline

View File

@ -47,7 +47,7 @@ instance, if the `user` field were mapped as follows:
[source,js]
--------------------------------------------------
"user": {
"type": "string",
"type": "text",
"null_value": "_null_"
}
--------------------------------------------------

View File

@ -116,18 +116,18 @@ curl -s -XPUT 'http://localhost:9200/imdb/' -d '{
"movies": {
"properties": {
"title": {
"type": "string",
"type": "text",
"term_vector": "yes"
},
"description": {
"type": "string"
"type": "text"
},
"tags": {
"type": "string",
"type": "text",
"fields" : {
"raw": {
"type" : "string",
"index" : "not_analyzed",
"type" : "text",
"analyzer": "keyword",
"term_vector" : "yes"
}
}

View File

@ -49,13 +49,13 @@ GET /_search
.Why doesn't the `term` query match my document?
**************************************************
String fields can be `analyzed` (treated as full text, like the body of an
email), or `not_analyzed` (treated as exact values, like an email address or a
zip code). Exact values (like numbers, dates, and `not_analyzed` strings) have
String fields can be of type `text` (treated as full text, like the body of an
email), or `keyword` (treated as exact values, like an email address or a
zip code). Exact values (like numbers, dates, and keywords) have
the exact value specified in the field added to the inverted index in order
to make them searchable.
By default, however, `string` fields are `analyzed`. This means that their
However, `text` fields are `analyzed`. This means that their
values are first passed through an <<analysis,analyzer>> to produce a list of
terms, which are then added to the inverted index.
@ -70,7 +70,7 @@ within a big block of full text.
The `term` query looks for the *exact* term in the field's inverted index --
it doesn't know anything about the field's analyzer. This makes it useful for
looking up values in `not_analyzed` string fields, or in numeric or date
looking up values in keyword fields, or in numeric or date
fields. When querying full text fields, use the
<<query-dsl-match-query,`match` query>> instead, which understands how the field
has been analyzed.
@ -86,11 +86,10 @@ PUT my_index
"my_type": {
"properties": {
"full_text": {
"type": "string" <1>
"type": "text" <1>
},
"exact_value": {
"type": "string",
"index": "not_analyzed" <2>
"type": "keyword" <2>
}
}
}
@ -105,8 +104,8 @@ PUT my_index/my_type/1
--------------------------------------------------
// AUTOSENSE
<1> The `full_text` field is `analyzed` by default.
<2> The `exact_value` field is set to be `not_analyzed`.
<1> The `full_text` field is of type `text` and will be analyzed.
<2> The `exact_value` field is of type `keyword` and will NOT be analyzed.
<3> The `full_text` inverted index will contain the terms: [`quick`, `foxes`].
<4> The `exact_value` inverted index will contain the exact term: [`Quick Foxes!`].