Document 5.0 mapping changes.
parent 3764b3ff80
commit b42f66c8ac
@@ -16,7 +16,7 @@ price for the product. The mapping could look like:
    "resellers" : { <1>
      "type" : "nested",
      "properties" : {
-       "name" : { "type" : "string" },
+       "name" : { "type" : "text" },
        "price" : { "type" : "double" }
      }
    }
@@ -22,12 +22,12 @@ the issue documents as nested documents. The mapping could look like:

    "issue" : {
      "properties" : {
-       "tags" : { "type" : "string" }
+       "tags" : { "type" : "text" }
        "comments" : { <1>
          "type" : "nested"
          "properties" : {
-           "username" : { "type" : "string", "index" : "not_analyzed" },
-           "comment" : { "type" : "string" }
+           "username" : { "type" : "keyword" },
+           "comment" : { "type" : "text" }
          }
        }
    }
@@ -4,4 +4,4 @@
An analyzer of type `keyword` that "tokenizes" an entire stream as a
single token. This is useful for data like zip codes, ids and so on.
Note, when using mapping definitions, it might make more sense to simply
-mark the field as `not_analyzed`.
+map the field as a <<keyword,`keyword`>>.
@@ -136,13 +136,13 @@ curl -s -XPUT 'http://localhost:9200/twitter/' -d '{
    "tweet": {
      "properties": {
        "text": {
-         "type": "string",
+         "type": "text",
          "term_vector": "with_positions_offsets_payloads",
          "store" : true,
          "analyzer" : "fulltext_analyzer"
        },
        "fullname": {
-         "type": "string",
+         "type": "text",
          "term_vector": "with_positions_offsets_payloads",
          "analyzer" : "fulltext_analyzer"
        }
@@ -281,7 +281,7 @@ PUT test
    "test": {
      "dynamic": false, <1>
      "properties": {
-       "text": {"type": "string"}
+       "text": {"type": "text"}
      }
    }
  }
@@ -300,8 +300,8 @@ POST test/test?refresh
PUT test/_mapping/test <2>
{
  "properties": {
-   "text": {"type": "string"},
-   "flag": {"type": "string", "analyzer": "keyword"}
+   "text": {"type": "text"},
+   "flag": {"type": "text", "analyzer": "keyword"}
  }
}
--------------------------------------------------
@@ -39,7 +39,7 @@ Here we configure the DFRSimilarity so it can be referenced as
{
  "book" : {
    "properties" : {
-     "title" : { "type" : "string", "similarity" : "my_similarity" }
+     "title" : { "type" : "text", "similarity" : "my_similarity" }
    }
  }
--------------------------------------------------
@@ -116,8 +116,7 @@ curl -XPUT 'http://localhost:9200/test1' -d '{
    "type1": {
      "properties": {
        "user" : {
-         "type": "string",
-         "index": "not_analyzed"
+         "type": "keyword"
        }
      }
    }
@@ -78,7 +78,7 @@ curl -XPOST localhost:9200/test -d '{
  "mappings" : {
    "type1" : {
      "properties" : {
-       "field1" : { "type" : "string", "index" : "not_analyzed" }
+       "field1" : { "type" : "text" }
      }
    }
  }
@@ -22,7 +22,7 @@ For which the response is (assuming `text` is a default string field):
    "text": {
      "full_name": "text",
      "mapping": {
-       "text": { "type": "string" }
+       "text": { "type": "text" }
      }
    }
}
@@ -73,13 +73,13 @@ For example, consider the following mapping:
{
  "article": {
    "properties": {
-     "id": { "type": "string" },
-     "title": { "type": "string"},
-     "abstract": { "type": "string"},
+     "id": { "type": "text" },
+     "title": { "type": "text"},
+     "abstract": { "type": "text"},
      "author": {
        "properties": {
-         "id": { "type": "string" },
-         "name": { "type": "string" }
+         "id": { "type": "text" },
+         "name": { "type": "text" }
        }
      }
    }
@@ -105,19 +105,19 @@ returns:
    "abstract": {
      "full_name": "abstract",
      "mapping": {
-       "abstract": { "type": "string" }
+       "abstract": { "type": "text" }
      }
    },
    "author.id": {
      "full_name": "author.id",
      "mapping": {
-       "id": { "type": "string" }
+       "id": { "type": "text" }
      }
    },
    "name": {
      "full_name": "author.name",
      "mapping": {
-       "name": { "type": "string" }
+       "name": { "type": "text" }
      }
    }
}
@@ -12,7 +12,7 @@ PUT twitter <1>
    "tweet": {
      "properties": {
        "message": {
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -23,7 +23,7 @@ PUT twitter/_mapping/user <2>
{
  "properties": {
    "name": {
-     "type": "string"
+     "type": "text"
    }
  }
}
@@ -32,7 +32,7 @@ PUT twitter/_mapping/tweet <3>
{
  "properties": {
    "user_name": {
-     "type": "string"
+     "type": "text"
    }
  }
}
@@ -86,13 +86,12 @@ PUT my_index <1>
      "name": {
        "properties": {
          "first": {
-           "type": "string"
+           "type": "text"
          }
        }
      },
      "user_id": {
-       "type": "string",
-       "index": "not_analyzed"
+       "type": "keyword"
      }
    }
  }
@@ -105,13 +104,12 @@ PUT my_index/_mapping/user
      "name": {
        "properties": {
          "last": { <2>
-           "type": "string"
+           "type": "text"
          }
        }
      },
      "user_id": {
-       "type": "string",
-       "index": "not_analyzed",
+       "type": "keyword",
        "ignore_above": 100 <3>
      }
    }
@@ -149,7 +147,7 @@ PUT my_index
    "type_one": {
      "properties": {
        "text": { <1>
-         "type": "string",
+         "type": "text",
          "analyzer": "standard"
        }
      }
@@ -157,7 +155,7 @@ PUT my_index
    "type_two": {
      "properties": {
        "text": { <1>
-         "type": "string",
+         "type": "text",
          "analyzer": "standard"
        }
      }
@@ -169,7 +167,7 @@ PUT my_index/_mapping/type_one <2>
{
  "properties": {
    "text": {
-     "type": "string",
+     "type": "text",
      "analyzer": "standard",
      "search_analyzer": "whitespace"
    }
@@ -180,7 +178,7 @@ PUT my_index/_mapping/type_one?update_all_types <3>
{
  "properties": {
    "text": {
-     "type": "string",
+     "type": "text",
      "analyzer": "standard",
      "search_analyzer": "whitespace"
    }
@@ -46,7 +46,7 @@ Fields with the same name in different mapping types in the same index

Each field has a data `type` which can be:

-* a simple type like <<string,`string`>>, <<date,`date`>>, <<number,`long`>>,
+* a simple type like <<text,`text`>>, <<keyword,`keyword`>>, <<date,`date`>>, <<number,`long`>>,
  <<number,`double`>>, <<boolean,`boolean`>> or <<ip,`ip`>>.
* a type which supports the hierarchical nature of JSON such as
  <<object,`object`>> or <<nested,`nested`>>.
@@ -55,7 +55,7 @@ Each field has a data `type` which can be:

It is often useful to index the same field in different ways for different
purposes. For instance, a `string` field could be <<mapping-index,indexed>> as
-an `analyzed` field for full-text search, and as a `not_analyzed` field for
+a `text` field for full-text search, and as a `keyword` field for
sorting or aggregations. Alternatively, you could index a string field with
the <<analysis-standard-analyzer,`standard` analyzer>>, the
<<english-analyzer,`english`>> analyzer, and the
@@ -134,18 +134,17 @@ PUT my_index <1>
      "user": { <2>
        "_all": { "enabled": false }, <3>
        "properties": { <4>
-         "title": { "type": "string" }, <5>
-         "name": { "type": "string" }, <5>
+         "title": { "type": "text" }, <5>
+         "name": { "type": "text" }, <5>
          "age": { "type": "integer" } <5>
        }
      },
      "blogpost": { <2>
        "properties": { <4>
-         "title": { "type": "string" }, <5>
-         "body": { "type": "string" }, <5>
+         "title": { "type": "text" }, <5>
+         "body": { "type": "text" }, <5>
          "user_id": {
-           "type": "string", <5>
-           "index": "not_analyzed"
+           "type": "keyword" <5>
          },
          "created": {
            "type": "date", <5>
@@ -56,11 +56,10 @@ PUT _template/logging
        "strings": { <4>
          "match_mapping_type": "string",
          "mapping": {
-           "type": "string",
+           "type": "text",
            "fields": {
              "raw": {
-               "type": "string",
-               "index": "not_analyzed",
+               "type": "keyword",
                "ignore_above": 256
              }
            }
@@ -79,4 +78,4 @@ PUT logs-2015.10.01/event/1
<1> The `logging` template will match any indices beginning with `logs-`.
<2> Matching indices will be created with a single primary shard.
<3> The `_all` field will be disabled by default for new type mappings.
-<4> String fields will be created with an `analyzed` main field, and a `not_analyzed` `.raw` field.
+<4> String fields will be created with a `text` main field, and a `keyword` `.raw` field.
@@ -22,7 +22,7 @@ string:: Either a <<date,`date`>> field
    (if the value passes <<date-detection,date detection>>),
    a <<number,`double`>> or <<number,`long`>> field
    (if the value passes <<numeric-detection,numeric detection>>)
-   or an <<mapping-index,`analyzed`>> <<string,`string`>> field.
+   or a <<text,`text`>> field.

These are the only <<mapping-types,field datatypes>> that are dynamically
detected. All other datatypes must be mapped explicitly.
@@ -81,7 +81,7 @@ PUT my_index/my_type/1 <1>
--------------------------------------------------
// AUTOSENSE

-<1> The `create_date` field has been added as a <<string,`string`>> field.
+<1> The `create_date` field has been added as a <<text,`text`>> field.

===== Customising detected date formats
@@ -52,7 +52,7 @@ can be automatically detected: `boolean`, `date`, `double`, `long`, `object`,
`string`. It also accepts `*` to match all datatypes.

For example, if we wanted to map all integer fields as `integer` instead of
-`long`, and all `string` fields as both `analyzed` and `not_analyzed`, we
+`long`, and all `string` fields as both `text` and `keyword`, we
could use the following template:

[source,js]
@@ -74,11 +74,10 @@ PUT my_index
        "strings": {
          "match_mapping_type": "string",
          "mapping": {
-           "type": "string",
+           "type": "text",
            "fields": {
              "raw": {
-               "type": "string",
-               "index": "not_analyzed",
+               "type": "keyword",
                "ignore_above": 256
              }
            }
@@ -99,7 +98,7 @@ PUT my_index/my_type/1
--------------------------------------------------
// AUTOSENSE
<1> The `my_integer` field is mapped as an `integer`.
-<2> The `my_string` field is mapped as an analyzed `string`, with a `not_analyzed` <<multi-fields,multi field>>.
+<2> The `my_string` field is mapped as a `text`, with a `keyword` <<multi-fields,multi field>>.


[[match-unmatch]]
@@ -180,7 +179,7 @@ PUT my_index
          "path_match": "name.*",
          "path_unmatch": "*.middle",
          "mapping": {
-           "type": "string",
+           "type": "text",
            "copy_to": "full_name"
          }
        }
@@ -221,7 +220,7 @@ PUT my_index
          "match_mapping_type": "string",
          "match": "*",
          "mapping": {
-           "type": "string",
+           "type": "text",
            "analyzer": "{name}"
          }
        }
@@ -45,7 +45,7 @@ from each field as a string. It does not combine the _terms_ from each field.

=============================================================================

-The `_all` field is just a <<string,`string`>> field, and accepts the same
+The `_all` field is just a <<text,`text`>> field, and accepts the same
parameters that other string fields accept, including `analyzer`,
`term_vectors`, `index_options`, and `store`.

@@ -136,7 +136,7 @@ PUT my_index
      },
      "properties": {
        "content": {
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -172,11 +172,11 @@ PUT myindex
    "mytype": {
      "properties": {
        "title": { <1>
-         "type": "string",
+         "type": "text",
          "boost": 2
        },
        "content": { <1>
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -210,15 +210,15 @@ PUT myindex
    "mytype": {
      "properties": {
        "first_name": {
-         "type": "string",
+         "type": "text",
          "copy_to": "full_name" <1>
        },
        "last_name": {
-         "type": "string",
+         "type": "text",
          "copy_to": "full_name" <1>
        },
        "full_name": {
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -127,7 +127,7 @@ global ordinals for the `_parent` field.
Global ordinals, by default, are built lazily: the first parent-child query or
aggregation after a refresh will trigger building of global ordinals. This can
introduce a significant latency spike for your users. You can use
-<<fielddata-loading,eager_global_ordinals>> to shift the cost of building global
+<<global-ordinals,eager_global_ordinals>> to shift the cost of building global
ordinals from query time to refresh time, by mapping the `_parent` field as follows:

[source,js]
@@ -139,9 +139,7 @@ PUT my_index
    "my_child": {
      "_parent": {
        "type": "my_parent",
-       "fielddata": {
-         "loading": "eager_global_ordinals"
-       }
+       "eager_global_ordinals": true
      }
    }
  }
@@ -47,10 +47,10 @@ PUT my_index
    "my_type": {
      "properties": {
        "text": { <1>
-         "type": "string",
+         "type": "text",
          "fields": {
            "english": { <2>
-             "type": "string",
+             "type": "text",
              "analyzer": "english"
            }
          }
@@ -124,7 +124,7 @@ PUT /my_index
    "my_type":{
      "properties":{
        "title": {
-         "type":"string",
+         "type":"text",
          "analyzer":"my_analyzer", <3>
          "search_analyzer":"my_stop_analyzer", <4>
          "search_quote_analyzer":"my_analyzer" <5>
@@ -12,11 +12,11 @@ PUT my_index
    "my_type": {
      "properties": {
        "title": {
-         "type": "string",
+         "type": "text",
          "boost": 2 <1>
        },
        "content": {
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -15,15 +15,15 @@ PUT /my_index
    "my_type": {
      "properties": {
        "first_name": {
-         "type": "string",
+         "type": "text",
          "copy_to": "full_name" <1>
        },
        "last_name": {
-         "type": "string",
+         "type": "text",
          "copy_to": "full_name" <1>
        },
        "full_name": {
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -29,12 +29,10 @@ PUT my_index
    "my_type": {
      "properties": {
        "status_code": { <1>
-         "type": "string",
-         "index": "not_analyzed"
+         "type": "keyword"
        },
        "session_id": { <2>
-         "type": "string",
-         "index": "not_analyzed",
+         "type": "keyword",
          "doc_values": false
        }
      }
@@ -67,7 +67,7 @@ PUT my_index
    "user": { <2>
      "properties": {
        "name": {
-         "type": "string"
+         "type": "text"
        },
        "social_networks": { <3>
          "dynamic": true,
@@ -21,8 +21,7 @@ PUT my_index
    "session": {
      "properties": {
        "user_id": {
-         "type": "string",
-         "index": "not_analyzed"
+         "type": "keyword"
        },
        "last_updated": {
          "type": "date"
@@ -12,28 +12,28 @@ documents, we need to be able to look up the document and find the terms that
it has in a field.

Most fields can use index-time, on-disk <<doc-values,`doc_values`>> to support
-this type of data access pattern, but `analyzed` string fields do not support
-`doc_values`.
+this type of data access pattern, but `text` fields do not support `doc_values`.

-Instead, `analyzed` strings use a query-time data structure called
+Instead, `text` strings use a query-time data structure called
`fielddata`. This data structure is built on demand the first time that a
field is used for aggregations, sorting, or is accessed in a script. It is built
by reading the entire inverted index for each segment from disk, inverting the
term ↔︎ document relationship, and storing the result in memory, in the
JVM heap.

-Loading fielddata is an expensive process so, once it has been loaded, it
-remains in memory for the lifetime of the segment.
+Loading fielddata is an expensive process so it is disabled by default. Also,
+when enabled, once it has been loaded, it remains in memory for the lifetime of
+the segment.

[WARNING]
.Fielddata can fill up your heap space
==============================================================================
Fielddata can consume a lot of heap space, especially when loading high
-cardinality `analyzed` string fields. Most of the time, it doesn't make sense
-to sort or aggregate on `analyzed` string fields (with the notable exception
+cardinality `text` fields. Most of the time, it doesn't make sense
+to sort or aggregate on `text` fields (with the notable exception
of the
<<search-aggregations-bucket-significantterms-aggregation,`significant_terms`>>
-aggregation). Always think about whether a `not_analyzed` field (which can
+aggregation). Always think about whether a <<keyword,`keyword`>> field (which can
use `doc_values`) would be a better fit for your use case.
==============================================================================
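As a small illustration of the behaviour described above, here is a minimal sketch (not part of the changed files; index and field names are hypothetical) of enabling fielddata on a `text` field so it can be aggregated or sorted on:

[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "tag": {
          "type": "text",
          "fielddata": true <1>
        }
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE
<1> Fielddata is disabled on `text` fields by default; it must be enabled explicitly before the field can be used for aggregations, sorting, or scripts.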
@@ -42,71 +42,6 @@ same name in the same index. Its value can be updated on existing fields
using the <<indices-put-mapping,PUT mapping API>>.


-[[fielddata-format]]
-==== `fielddata.format`
-
-For `analyzed` string fields, the fielddata `format` controls whether
-fielddata should be enabled or not. It accepts: `disabled` and `paged_bytes`
-(enabled, which is the default). To disable fielddata loading, you can use
-the following mapping:
-
-[source,js]
---------------------------------------------------
-PUT my_index
-{
-  "mappings": {
-    "my_type": {
-      "properties": {
-        "text": {
-          "type": "string",
-          "fielddata": {
-            "format": "disabled" <1>
-          }
-        }
-      }
-    }
-  }
-}
---------------------------------------------------
-// AUTOSENSE
-<1> The `text` field cannot be used for sorting, aggregations, or in scripts.
-
-.Fielddata and other datatypes
-[NOTE]
-==================================================
-
-Historically, other field datatypes also used fielddata, but this has been replaced
-by index-time, disk-based <<doc-values,`doc_values`>>.
-
-==================================================
-
-
-[[fielddata-loading]]
-==== `fielddata.loading`
-
-This per-field setting controls when fielddata is loaded into memory. It
-accepts three options:
-
-[horizontal]
-`lazy`::
-
-    Fielddata is only loaded into memory when it is needed. (default)
-
-`eager`::
-
-    Fielddata is loaded into memory before a new search segment becomes
-    visible to search. This can reduce the latency that a user may experience
-    if their search request has to trigger lazy loading from a big segment.
-
-`eager_global_ordinals`::
-
-    Loading fielddata into memory is only part of the work that is required.
-    After loading the fielddata for each segment, Elasticsearch builds the
-    <<global-ordinals>> data structure to make a list of all unique terms
-    across all the segments in a shard. By default, global ordinals are built
-    lazily. If the field has a very high cardinality, global ordinals may
-    take some time to build, in which case you can use eager loading instead.
-
[[global-ordinals]]
.Global ordinals
*****************************************
|
|||
*****************************************
|
||||
|
||||
[[field-data-filtering]]
|
||||
==== `fielddata.filter`
|
||||
==== `fielddata_frequency_filter`
|
||||
|
||||
Fielddata filtering can be used to reduce the number of terms loaded into
|
||||
memory, and thus reduce memory usage. Terms can be filtered by _frequency_ or
|
||||
by _regular expression_, or a combination of the two:
|
||||
|
||||
Filtering by frequency::
|
||||
+
|
||||
--
|
||||
memory, and thus reduce memory usage. Terms can be filtered by _frequency_:
|
||||
|
||||
The frequency filter allows you to only load terms whose term frequency falls
|
||||
between a `min` and `max` value, which can be expressed an absolute
|
||||
|
@@ -169,7 +99,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "tag": {
-         "type": "string",
+         "type": "text",
          "fielddata": {
            "filter": {
              "frequency": {
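The example above is cut off at the hunk boundary. As a hedged sketch, assuming the frequency filter's `min`, `max` and `min_segment_size` options and using illustrative values only, a complete mapping might look like:

[source,js]
--------------------------------------------------
PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "tag": {
          "type": "text",
          "fielddata": {
            "filter": {
              "frequency": {
                "min": 0.001,
                "max": 0.1,
                "min_segment_size": 500
              }
            }
          }
        }
      }
    }
  }
}
--------------------------------------------------
// AUTOSENSE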
@@ -186,44 +116,3 @@ PUT my_index
}
--------------------------------------------------
// AUTOSENSE
---
-
-Filtering by regex::
-+
---
-Terms can also be filtered by regular expression - only values which
-match the regular expression are loaded. Note: the regular expression is
-applied to each term in the field, not to the whole field value. For
-instance, to only load hashtags from a tweet, we can use a regular
-expression which matches terms beginning with `#`:
-
-[source,js]
---------------------------------------------------
-PUT my_index
-{
-  "mappings": {
-    "my_type": {
-      "properties": {
-        "tweet": {
-          "type": "string",
-          "analyzer": "whitespace",
-          "fielddata": {
-            "filter": {
-              "regex": {
-                "pattern": "^#.*"
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
---------------------------------------------------
-// AUTOSENSE
---
-
-These filters can be updated on an existing field mapping and will take
-effect the next time the fielddata for a segment is loaded. Use the
-<<indices-clearcache,Clear Cache>> API
-to reload the fielddata using the new filters.
@@ -1,12 +1,7 @@
[[ignore-above]]
=== `ignore_above`

-Strings longer than the `ignore_above` setting will not be processed by the
-<<analyzer,analyzer>> and will not be indexed. This is mainly useful for
-<<mapping-index,`not_analyzed`>> string fields, which are typically used for
-filtering, aggregations, and sorting. These are structured fields and it
-doesn't usually make sense to allow very long terms to be indexed in these
-fields.
+Strings longer than the `ignore_above` setting will not be indexed or stored.

[source,js]
--------------------------------------------------
@@ -16,8 +11,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "message": {
-         "type": "string",
-         "index": "not_analyzed",
+         "type": "keyword",
          "ignore_above": 20 <1>
        }
      }
@@ -14,10 +14,10 @@ PUT my_index
    "my_type": {
      "properties": {
        "title": { <1>
-         "type": "string"
+         "type": "text"
        }
        "content": { <1>
-         "type": "string"
+         "type": "text"
        },
        "date": { <2>
          "type": "date",
@@ -50,18 +50,18 @@ PUT my_index
    "my_type": {
      "include_in_all": false, <1>
      "properties": {
-       "title": { "type": "string" },
+       "title": { "type": "text" },
        "author": {
          "include_in_all": true, <2>
          "properties": {
-           "first_name": { "type": "string" },
-           "last_name": { "type": "string" }
+           "first_name": { "type": "text" },
+           "last_name": { "type": "text" }
          }
        },
        "editor": {
          "properties": {
-           "first_name": { "type": "string" }, <3>
-           "last_name": { "type": "string", "include_in_all": true } <3>
+           "first_name": { "type": "text" }, <3>
+           "last_name": { "type": "text", "include_in_all": true } <3>
          }
        }
      }
@@ -39,7 +39,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "text": {
-         "type": "string",
+         "type": "text",
          "index_options": "offsets"
        }
      }
@@ -1,48 +1,6 @@
[[mapping-index]]
=== `index`

-The `index` option controls how field values are indexed and, thus, how they
-are searchable. It accepts three values:
+The `index` option controls whether field values are indexed. It accepts `true`
+or `false`. Fields that are not indexed are not queryable.

-[horizontal]
-`no`::
-
-    Do not add this field value to the index. With this setting, the field
-    will not be queryable.
-
-`not_analyzed`::
-
-    Add the field value to the index unchanged, as a single term. This is the
-    default for all fields that support this option except for
-    <<string,`string`>> fields. `not_analyzed` fields are usually used with
-    <<term-level-queries,term-level queries>> for structured search.
-
-`analyzed`::
-
-    This option applies only to `string` fields, for which it is the default.
-    The string field value is first <<analysis,analyzed>> to convert the
-    string into terms (e.g. a list of individual words), which are then
-    indexed. At search time, the query string is passed through
-    (<<search-analyzer,usually>>) the same analyzer to generate terms
-    in the same format as those in the index. It is this process that enables
-    <<full-text-queries,full text search>>.
-
-For example, you can create a `not_analyzed` string field with the following:
-
-[source,js]
---------------------------------------------------
-PUT /my_index
-{
-  "mappings": {
-    "my_type": {
-      "properties": {
-        "status_code": {
-          "type": "string",
-          "index": "not_analyzed"
-        }
-      }
-    }
-  }
-}
---------------------------------------------------
-// AUTOSENSE
@@ -3,8 +3,8 @@

It is often useful to index the same field in different ways for different
purposes. This is the purpose of _multi-fields_. For instance, a `string`
-field could be <<mapping-index,indexed>> as an `analyzed` field for full-text
-search, and as a `not_analyzed` field for sorting or aggregations:
+field could be mapped as a `text` field for full-text
+search, and as a `keyword` field for sorting or aggregations:

[source,js]
--------------------------------------------------
@@ -14,11 +14,10 @@ PUT /my_index
    "my_type": {
      "properties": {
        "city": {
-         "type": "string",
+         "type": "text",
          "fields": {
            "raw": { <1>
-             "type": "string",
-             "index": "not_analyzed"
+             "type": "keyword"
            }
          }
        }
@@ -57,8 +56,8 @@ GET /my_index/_search
}
--------------------------------------------------
// AUTOSENSE
-<1> The `city.raw` field is a `not_analyzed` version of the `city` field.
-<2> The analyzed `city` field can be used for full text search.
+<1> The `city.raw` field is a `keyword` version of the `city` field.
+<2> The `city` field can be used for full text search.
<3> The `city.raw` field can be used for sorting and aggregations

NOTE: Multi-fields do not change the original `_source` field.
@@ -83,10 +82,10 @@ PUT my_index
    "my_type": {
      "properties": {
        "text": { <1>
-         "type": "string",
+         "type": "text",
          "fields": {
            "english": { <2>
-             "type": "string",
+             "type": "text",
              "analyzer": "english"
            }
          }
@@ -4,14 +4,14 @@
Norms store various normalization factors that are later used at query time
in order to compute the score of a document relatively to a query.

-Although useful for scoring, norms also require quite a lot of memory
+Although useful for scoring, norms also require quite a lot of disk
(typically in the order of one byte per document per field in your index, even
for documents that don't have this specific field). As a consequence, if you
don't need scoring on a specific field, you should disable norms on that
field. In particular, this is the case for fields that are used solely for
filtering or aggregations.

-TIP: The `norms.enabled` setting must have the same setting for fields of the
+TIP: The `norms` setting must have the same setting for fields of the
same name in the same index. Norms can be disabled on existing fields using
the <<indices-put-mapping,PUT mapping API>>.
@@ -24,10 +24,8 @@ PUT my_index/_mapping/my_type
{
  "properties": {
    "title": {
-     "type": "string",
-     "norms": {
-       "enabled": false
-     }
+     "type": "text",
+     "norms": false
    }
  }
}
@@ -41,31 +39,3 @@ results since some documents won't have norms anymore while other documents
might still have norms.


-==== Lazy loading of norms
-
-Norms can be loaded into memory eagerly (`eager`), whenever a new segment
-comes online, or they can loaded lazily (`lazy`, default), only when the field
-is queried.
-
-Eager loading can be configured as follows:
-
-[source,js]
-------------
-PUT my_index/_mapping/my_type
-{
-  "properties": {
-    "title": {
-      "type": "string",
-      "norms": {
-        "loading": "eager"
-      }
-    }
-  }
-}
-------------
-// AUTOSENSE
-
-TIP: The `norms.loading` setting must have the same setting for fields of the
-same name in the same index. Its value can be updated on existing fields
-using the <<indices-put-mapping,PUT mapping API>>.
@@ -16,8 +16,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "status_code": {
-         "type": "string",
-         "index": "not_analyzed",
+         "type": "keyword",
          "null_value": "NULL" <1>
        }
      }
@@ -50,6 +49,4 @@ GET my_index/_search
<3> A query for `NULL` returns document 1, but not document 2.

IMPORTANT: The `null_value` needs to be the same datatype as the field. For
-instance, a `long` field cannot have a string `null_value`. String fields
-which are `analyzed` will also pass the `null_value` through the configured
-analyzer.
+instance, a `long` field cannot have a string `null_value`.
@@ -57,7 +57,7 @@ PUT my_index
    "groups": {
      "properties": {
        "names": {
-         "type": "string",
+         "type": "text",
          "position_increment_gap": 0 <1>
        }
      }
@@ -23,14 +23,14 @@ PUT my_index
      "manager": { <2>
        "properties": {
          "age": { "type": "integer" },
-         "name": { "type": "string" }
+         "name": { "type": "text" }
        }
      },
      "employees": { <3>
        "type": "nested",
        "properties": {
          "age": { "type": "integer" },
-         "name": { "type": "string" }
+         "name": { "type": "text" }
        }
      }
    }
@@ -41,7 +41,7 @@ PUT /my_index
    "my_type": {
      "properties": {
        "text": {
-         "type": "string",
+         "type": "text",
          "analyzer": "autocomplete", <2>
          "search_analyzer": "standard" <2>
        }
@@ -5,8 +5,8 @@ Elasticsearch allows you to configure a scoring algorithm or _similarity_ per
field. The `similarity` setting provides a simple way of choosing a similarity
algorithm other than the default TF/IDF, such as `BM25`.

-Similarities are mostly useful for <<string,`string`>> fields, especially
-`analyzed` string fields, but can also apply to other field types.
+Similarities are mostly useful for <<text,`text`>> fields, but can also apply
+to other field types.

Custom similarities can be configured by tuning the parameters of the built-in
similarities. For more details about this expert options, see the
@@ -37,10 +37,10 @@ PUT my_index
    "my_type": {
      "properties": {
        "default_field": { <1>
-         "type": "string"
+         "type": "text"
        },
        "bm25_field": {
-         "type": "string",
+         "type": "text",
          "similarity": "BM25" <2>
        }
      }
@@ -24,7 +24,7 @@ PUT /my_index
    "my_type": {
      "properties": {
        "title": {
-         "type": "string",
+         "type": "text",
          "store": true <1>
        },
        "date": {
@@ -32,7 +32,7 @@ PUT /my_index
          "store": true <1>
        },
        "content": {
-         "type": "string"
+         "type": "text"
        }
      }
    }
@@ -35,7 +35,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "text": {
-         "type": "string",
+         "type": "text",
          "term_vector": "with_positions_offsets"
        }
      }
@@ -7,7 +7,7 @@ document:
[float]
=== Core datatypes

-<<string>>::    `string`
+string::        <<text,`text`>> and <<keyword,`keyword`>>
<<number>>::    `long`, `integer`, `short`, `byte`, `double`, `float`
<<date>>::      `date`
<<boolean>>::   `boolean`
@@ -45,9 +45,9 @@ Attachment datatype::
=== Multi-fields

It is often useful to index the same field in different ways for different
-purposes. For instance, a `string` field could be <<mapping-index,indexed>> as
-an `analyzed` field for full-text search, and as a `not_analyzed` field for
-sorting or aggregations. Alternatively, you could index a string field with
+purposes. For instance, a `string` field could be mapped as
+a `text` field for full-text search, and as a `keyword` field for
+sorting or aggregations. Alternatively, you could index a text field with
the <<analysis-standard-analyzer,`standard` analyzer>>, the
<<english-analyzer,`english`>> analyzer, and the
<<french-analyzer,`french` analyzer>>.
@@ -69,6 +69,8 @@ include::types/geo-shape.asciidoc[]

include::types/ip.asciidoc[]

+include::types/keyword.asciidoc[]
+
include::types/nested.asciidoc[]

include::types/numeric.asciidoc[]
@@ -77,6 +79,8 @@ include::types/object.asciidoc[]

include::types/string.asciidoc[]

+include::types/text.asciidoc[]
+
include::types/token-count.asciidoc[]

@@ -13,7 +13,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "name": {
-         "type": "string"
+         "type": "text"
        },
        "blob": {
          "type": "binary"
@@ -0,0 +1,111 @@
[[keyword]]
=== Keyword datatype

A field to index structured content such as email addresses, hostnames, status
codes, zip codes or tags.

They are typically used for filtering (_Find me all blog posts where
++status++ is ++published++_), for sorting, and for aggregations. Keyword
fields are only searchable by their exact value.

If you need to index full text content such as email bodies or product
descriptions, it is likely that you should rather use a <<text,`text`>> field.

Below is an example of a mapping for a keyword field:

[source,js]
--------------------------------
PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "tags": {
          "type": "keyword"
        }
      }
    }
  }
}
--------------------------------
// AUTOSENSE

[[keyword-params]]
==== Parameters for keyword fields

The following parameters are accepted by `keyword` fields:

[horizontal]

<<mapping-boost,`boost`>>::

    Mapping field-level query time boosting. Accepts a floating point number, defaults
    to `1.0`.

<<doc-values,`doc_values`>>::

    Should the field be stored on disk in a column-stride fashion, so that it
    can later be used for sorting, aggregations, or scripting? Accepts `true`
    (default) or `false`.

<<global-ordinals,`eager_global_ordinals`>>::

    Should global ordinals be loaded eagerly on refresh? Accepts `true` or `false`
    (default). Enabling this is a good idea on fields that are frequently used for
    terms aggregations.

<<multi-fields,`fields`>>::

    Multi-fields allow the same string value to be indexed in multiple ways for
    different purposes, such as one field for search and a multi-field for
    sorting and aggregations.

<<ignore-above,`ignore_above`>>::

    Do not index or analyze any string longer than this value. Defaults to
    `2147483647` so that all values would be accepted.

<<include-in-all,`include_in_all`>>::

    Whether or not the field value should be included in the
    <<mapping-all-field,`_all`>> field? Accepts `true` or `false`. Defaults
    to `false` if <<mapping-index,`index`>> is set to `no`, or if a parent
    <<object,`object`>> field sets `include_in_all` to `false`.
    Otherwise defaults to `true`.

<<mapping-index,`index`>>::

    Should the field be searchable? Accepts `true` (default) or `false`.

<<index-options,`index_options`>>::

    What information should be stored in the index, for scoring purposes.
    Defaults to `docs` but can also be set to `freqs` to take term frequency into account
    when computing scores.

<<norms,`norms`>>::

    Whether field-length should be taken into account when scoring queries.
    Accepts `true` or `false` (default).

<<null-value,`null_value`>>::

    Accepts a string value which is substituted for any explicit `null`
    values. Defaults to `null`, which means the field is treated as missing.

<<mapping-store,`store`>>::

    Whether the field value should be stored and retrievable separately from
    the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
    (default).

<<search-analyzer,`search_analyzer`>>::

    The <<analyzer,`analyzer`>> that should be used at search time on
    <<mapping-index,`analyzed`>> fields. Defaults to the `analyzer` setting.

<<similarity,`similarity`>>::

    Which scoring algorithm or _similarity_ should be used. Defaults
    to `classic`, which uses TF/IDF.
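For illustration, a minimal sketch (not part of the diff above; the aggregation name is hypothetical) of how a `keyword` field such as the `tags` field mapped earlier is typically used in a terms aggregation:

[source,js]
--------------------------------
GET my_index/_search
{
  "size": 0,
  "aggs": {
    "popular_tags": {
      "terms": { "field": "tags" } <1>
    }
  }
}
--------------------------------
// AUTOSENSE
<1> Aggregating directly on the `keyword` field uses doc values; no fielddata is required.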
@@ -46,16 +46,15 @@ PUT my_index
    "my_type": { <1>
      "properties": {
        "region": {
-         "type": "string",
-         "index": "not_analyzed"
+         "type": "keyword"
        },
        "manager": { <2>
          "properties": {
            "age": { "type": "integer" },
            "name": { <3>
              "properties": {
-               "first": { "type": "string" },
-               "last": { "type": "string" }
+               "first": { "type": "text" },
+               "last": { "type": "text" }
              }
            }
          }
@@ -1,179 +1,4 @@
[[string]]
=== String datatype

Fields of type `string` accept text values. Strings may be sub-divided into:

Full text::
+
--

Full text values, like the body of an email, are typically used for text based
relevance searches, such as: _Find the most relevant documents that match a
query for "quick brown fox"_.

These fields are `analyzed`, that is they are passed through an
<<analysis,analyzer>> to convert the string into a list of individual terms
before being indexed. The analysis process allows Elasticsearch to search for
individual words _within_ each full text field. Full text fields are not
used for sorting and seldom used for aggregations (although the
<<search-aggregations-bucket-significantterms-aggregation,significant terms aggregation>> is a notable exception).

--

Keywords::

Keywords are exact values like email addresses, hostnames, status codes, or
tags. They are typically used for filtering (_Find me all blog posts where
++status++ is ++published++_), for sorting, and for aggregations. Keyword
fields are `not_analyzed`. Instead, the exact string value is added to the
index as a single term.

Below is an example of a mapping for a full text (`analyzed`) and a keyword
(`not_analyzed`) string field:

[source,js]
--------------------------------
PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "full_name": { <1>
          "type": "string"
        },
        "status": {
          "type": "string", <2>
          "index": "not_analyzed"
        }
      }
    }
  }
}
--------------------------------
// AUTOSENSE
<1> The `full_name` field is an `analyzed` full text field -- `index:analyzed` is the default.
<2> The `status` field is a `not_analyzed` keyword field.

Sometimes it is useful to have both a full text (`analyzed`) and a keyword
(`not_analyzed`) version of the same field: one for full text search and the
other for aggregations and sorting. This can be achieved with
<<multi-fields,multi-fields>>.


[[string-params]]
==== Parameters for string fields

The following parameters are accepted by `string` fields:

[horizontal]

<<analyzer,`analyzer`>>::

    The <<analysis,analyzer>> which should be used for
    <<mapping-index,`analyzed`>> string fields, both at index-time and at
    search-time (unless overridden by the <<search-analyzer,`search_analyzer`>>).
    Defaults to the default index analyzer, or the
    <<analysis-standard-analyzer,`standard` analyzer>>.

<<mapping-boost,`boost`>>::

    Mapping field-level query time boosting. Accepts a floating point number, defaults
    to `1.0`.

<<doc-values,`doc_values`>>::

    Should the field be stored on disk in a column-stride fashion, so that it
    can later be used for sorting, aggregations, or scripting? Accepts `true`
    or `false`. Defaults to `true` for `not_analyzed` fields. Analyzed fields
    do not support doc values.

<<fielddata,`fielddata`>>::

    Can the field use in-memory fielddata for sorting, aggregations,
    or scripting? Accepts `disabled` or `paged_bytes` (default).
    Not analyzed fields will use <<doc-values,doc values>> in preference
    to fielddata.

<<multi-fields,`fields`>>::

    Multi-fields allow the same string value to be indexed in multiple ways for
    different purposes, such as one field for search and a multi-field for
    sorting and aggregations, or the same string value analyzed by different
    analyzers.

<<ignore-above,`ignore_above`>>::

    Do not index or analyze any string longer than this value. Defaults to `0` (disabled).

<<include-in-all,`include_in_all`>>::

    Whether or not the field value should be included in the
    <<mapping-all-field,`_all`>> field? Accepts `true` or `false`. Defaults
    to `false` if <<mapping-index,`index`>> is set to `no`, or if a parent
    <<object,`object`>> field sets `include_in_all` to `false`.
    Otherwise defaults to `true`.

<<mapping-index,`index`>>::

    Should the field be searchable? Accepts `analyzed` (default, treat as full-text field),
    `not_analyzed` (treat as keyword field) and `no`.

<<index-options,`index_options`>>::

    What information should be stored in the index, for search and highlighting purposes.
    Defaults to `positions` for <<mapping-index,`analyzed`>> fields, and to `docs` for
    `not_analyzed` fields.


<<norms,`norms`>>::
+
--

Whether field-length should be taken into account when scoring queries.
Defaults depend on the <<mapping-index,`index`>> setting:

* `analyzed` fields default to `{ "enabled": true, "loading": "lazy" }`.
* `not_analyzed` fields default to `{ "enabled": false }`.
--

<<null-value,`null_value`>>::

    Accepts a string value which is substituted for any explicit `null`
    values. Defaults to `null`, which means the field is treated as missing.
    If the field is `analyzed`, the `null_value` will also be analyzed.

<<position-increment-gap,`position_increment_gap`>>::

    The number of fake term positions which should be inserted between
    each element of an array of strings. Defaults to 0.
    The number of fake term position which should be inserted between each
    element of an array of strings. Defaults to the position_increment_gap
    configured on the analyzer which defaults to 100. 100 was chosen because it
    prevents phrase queries with reasonably large slops (less than 100) from
    matching terms across field values.

<<mapping-store,`store`>>::

    Whether the field value should be stored and retrievable separately from
    the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
    (default).

<<search-analyzer,`search_analyzer`>>::

    The <<analyzer,`analyzer`>> that should be used at search time on
    <<mapping-index,`analyzed`>> fields. Defaults to the `analyzer` setting.

<<search-quote-analyzer,`search_quote_analyzer`>>::

    The <<analyzer,`analyzer`>> that should be used at search time when a
    phrase is encountered. Defaults to the `search_analyzer` setting.

<<similarity,`similarity`>>::

    Which scoring algorithm or _similarity_ should be used. Defaults
    to `classic`, which uses TF/IDF.

<<term-vector,`term_vector`>>::

    Whether term vectors should be stored for an <<mapping-index,`analyzed`>>
    field. Defaults to `no`.
+NOTE: The `string` field has been removed in favor of the `text` and `keyword` fields.
@@ -0,0 +1,139 @@
[[text]]
=== Text datatype

A field to index full-text values, such as the body of an email or the
description of a product. These fields are `analyzed`, that is they are passed through an
<<analysis,analyzer>> to convert the string into a list of individual terms
before being indexed. The analysis process allows Elasticsearch to search for
individual words _within_ each full text field. Text fields are not
used for sorting and seldom used for aggregations (although the
<<search-aggregations-bucket-significantterms-aggregation,significant terms aggregation>>
is a notable exception).

If you need to index structured content such as email addresses, hostnames, status
codes, or tags, it is likely that you should rather use a <<keyword,`keyword`>> field.

Below is an example of a mapping for a text field:

[source,js]
--------------------------------
PUT my_index
{
  "mappings": {
    "my_type": {
      "properties": {
        "full_name": {
          "type": "text"
        }
      }
    }
  }
}
--------------------------------
// AUTOSENSE

Sometimes it is useful to have both a full text (`text`) and a keyword
(`keyword`) version of the same field: one for full text search and the
other for aggregations and sorting. This can be achieved with
<<multi-fields,multi-fields>>.

[[text-params]]
==== Parameters for text fields

The following parameters are accepted by `text` fields:

[horizontal]

<<analyzer,`analyzer`>>::

    The <<analysis,analyzer>> which should be used for
    <<mapping-index,`analyzed`>> string fields, both at index-time and at
    search-time (unless overridden by the <<search-analyzer,`search_analyzer`>>).
    Defaults to the default index analyzer, or the
    <<analysis-standard-analyzer,`standard` analyzer>>.

<<mapping-boost,`boost`>>::

    Mapping field-level query time boosting. Accepts a floating point number, defaults
    to `1.0`.

<<global-ordinals,`eager_global_ordinals`>>::

    Should global ordinals be loaded eagerly on refresh? Accepts `true` or `false`
    (default). Enabling this is a good idea on fields that are frequently used for
    (significant) terms aggregations.

<<fielddata,`fielddata`>>::

    Can the field use in-memory fielddata for sorting, aggregations,
    or scripting? Accepts `true` or `false` (default).

<<field-data-filtering,`fielddata_frequency_filter`>>::

    Expert settings which allow to decide which values to load in memory when `fielddata`
    is enabled. By default all values are loaded.

<<multi-fields,`fields`>>::

    Multi-fields allow the same string value to be indexed in multiple ways for
    different purposes, such as one field for search and a multi-field for
    sorting and aggregations, or the same string value analyzed by different
    analyzers.

<<include-in-all,`include_in_all`>>::

    Whether or not the field value should be included in the
    <<mapping-all-field,`_all`>> field? Accepts `true` or `false`. Defaults
    to `false` if <<mapping-index,`index`>> is set to `no`, or if a parent
    <<object,`object`>> field sets `include_in_all` to `false`.
    Otherwise defaults to `true`.

<<mapping-index,`index`>>::

    Should the field be searchable? Accepts `true` (default) or `false`.

<<index-options,`index_options`>>::

    What information should be stored in the index, for search and highlighting purposes.
    Defaults to `positions`.

<<norms,`norms`>>::

    Whether field-length should be taken into account when scoring queries.
    Accepts `true` (default) or `false`.

<<position-increment-gap,`position_increment_gap`>>::

    The number of fake term positions which should be inserted between
    each element of an array of strings. Defaults to 0.
    The number of fake term position which should be inserted between each
    element of an array of strings. Defaults to the position_increment_gap
    configured on the analyzer which defaults to 100. 100 was chosen because it
    prevents phrase queries with reasonably large slops (less than 100) from
    matching terms across field values.

<<mapping-store,`store`>>::

    Whether the field value should be stored and retrievable separately from
    the <<mapping-source-field,`_source`>> field. Accepts `true` or `false`
    (default).

<<search-analyzer,`search_analyzer`>>::

    The <<analyzer,`analyzer`>> that should be used at search time on
    <<mapping-index,`analyzed`>> fields. Defaults to the `analyzer` setting.

<<search-quote-analyzer,`search_quote_analyzer`>>::

    The <<analyzer,`analyzer`>> that should be used at search time when a
    phrase is encountered. Defaults to the `search_analyzer` setting.

<<similarity,`similarity`>>::

    Which scoring algorithm or _similarity_ should be used. Defaults
    to `classic`, which uses TF/IDF.

<<term-vector,`term_vector`>>::

    Whether term vectors should be stored for an <<mapping-index,`analyzed`>>
    field. Defaults to `no`.
@@ -15,7 +15,7 @@ PUT my_index
    "my_type": {
      "properties": {
        "name": { <1>
-         "type": "string",
+         "type": "text",
          "fields": {
            "length": { <2>
              "type": "token_count",
@@ -16,6 +16,26 @@ values. For backwards compatibility purposes, during the 5.x series:
  with `string` fields are no longer possible with `text`/`keyword` fields
  such as enabling `term_vectors` on a not-analyzed `keyword` field.

+==== Default string mappings
+
+String mappings now have the following default mappings:
+
+[source,json]
+---------------
+{
+  "type": "text",
+  "fields": {
+    "keyword": {
+      "type": "keyword",
+      "ignore_above": 256
+    }
+  }
+}
+---------------
+
+This allows to perform full-text search on the original field name and to sort
+and run aggregations on the sub keyword field.
+
==== `index` property

On all field datatypes (except for the deprecated `string` field), the `index`
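For illustration, a minimal sketch (not part of the migration notes; index and field names are hypothetical) of what this default means in practice: a dynamically mapped string can be searched as full text and aggregated through its `keyword` sub-field:

[source,js]
--------------------------------------------------
PUT my_index/my_type/1
{
  "city": "New York" <1>
}

GET my_index/_search
{
  "query": {
    "match": { "city": "york" } <2>
  },
  "aggs": {
    "cities": {
      "terms": { "field": "city.keyword" } <3>
    }
  }
}
--------------------------------------------------
// AUTOSENSE
<1> The unmapped string value is dynamically mapped as `text` with a `city.keyword` sub-field.
<2> Full-text search runs against the `text` field.
<3> Sorting and aggregations use the `keyword` sub-field.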
@@ -35,12 +55,22 @@ now defaults to using `float` instead of `double`. The reasoning is that
floats should be more than enough for most cases but would decrease storage
requirements significantly.

+==== `norms`
+
+`norms` now take a boolean instead of an object. This boolean is the replacement
+for `norms.enabled`. There is no replacement for `norms.loading` since eager
+loading of norms is not useful anymore now that norms are disk-based.
+
+==== `fielddata.format`
+
+Setting `fielddata.format: doc_values` in the mappings used to implicitly
+enable doc-values on a field. This no longer works: the only way to enable or
+disable doc-values is by using the `doc_values` property of mappings.
+
+==== `fielddata.frequency.regex`
+
+Regex filters are not supported anymore and will be dropped on upgrade.
+
==== Source-transform removed

The source `transform` feature has been removed. Instead, use an ingest pipeline
@@ -47,7 +47,7 @@ instance, if the `user` field were mapped as follows:
[source,js]
--------------------------------------------------
  "user": {
-   "type": "string",
+   "type": "text",
    "null_value": "_null_"
  }
--------------------------------------------------
@@ -116,18 +116,18 @@ curl -s -XPUT 'http://localhost:9200/imdb/' -d '{
    "movies": {
      "properties": {
        "title": {
-         "type": "string",
+         "type": "text",
          "term_vector": "yes"
        },
        "description": {
-         "type": "string"
+         "type": "text"
        },
        "tags": {
-         "type": "string",
+         "type": "text",
          "fields" : {
            "raw": {
-             "type" : "string",
-             "index" : "not_analyzed",
+             "type" : "text",
+             "analyzer": "keyword",
              "term_vector" : "yes"
            }
          }
@@ -49,13 +49,13 @@ GET /_search
.Why doesn't the `term` query match my document?
**************************************************

-String fields can be `analyzed` (treated as full text, like the body of an
-email), or `not_analyzed` (treated as exact values, like an email address or a
-zip code). Exact values (like numbers, dates, and `not_analyzed` strings) have
+String fields can be of type `text` (treated as full text, like the body of an
+email), or `keyword` (treated as exact values, like an email address or a
+zip code). Exact values (like numbers, dates, and keywords) have
the exact value specified in the field added to the inverted index in order
to make them searchable.

-By default, however, `string` fields are `analyzed`. This means that their
+However, `text` fields are `analyzed`. This means that their
values are first passed through an <<analysis,analyzer>> to produce a list of
terms, which are then added to the inverted index.

@@ -70,7 +70,7 @@ within a big block of full text.

The `term` query looks for the *exact* term in the field's inverted index --
it doesn't know anything about the field's analyzer. This makes it useful for
-looking up values in `not_analyzed` string fields, or in numeric or date
+looking up values in keyword fields, or in numeric or date
fields. When querying full text fields, use the
<<query-dsl-match-query,`match` query>> instead, which understands how the field
has been analyzed.
@@ -86,11 +86,10 @@ PUT my_index
    "my_type": {
      "properties": {
        "full_text": {
-         "type": "string" <1>
+         "type": "text" <1>
        },
        "exact_value": {
-         "type": "string",
-         "index": "not_analyzed" <2>
+         "type": "keyword" <2>
        }
      }
    }
@@ -105,8 +104,8 @@ PUT my_index/my_type/1
--------------------------------------------------
// AUTOSENSE

-<1> The `full_text` field is `analyzed` by default.
-<2> The `exact_value` field is set to be `not_analyzed`.
+<1> The `full_text` field is of type `text` and will be analyzed.
+<2> The `exact_value` field is of type `keyword` and will NOT be analyzed.
<3> The `full_text` inverted index will contain the terms: [`quick`, `foxes`].
<4> The `exact_value` inverted index will contain the exact term: [`Quick Foxes!`].

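To round off the example, a hedged sketch (these queries are assumed, not part of the original page) of how the two fields behave at search time:

[source,js]
--------------------------------------------------
GET my_index/_search
{
  "query": {
    "term": { "exact_value": "Quick Foxes!" } <1>
  }
}

GET my_index/_search
{
  "query": {
    "match": { "full_text": "Quick Foxes!" } <2>
  }
}
--------------------------------------------------
// AUTOSENSE
<1> The `term` query matches the single exact term stored in the `exact_value` keyword field.
<2> The `match` query analyzes the input and matches the `quick` and `foxes` terms in the `full_text` field.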