From fbad3af3524ddb5a1e45f085d3723e7b247fc0d7 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Tue, 21 Jun 2016 16:31:44 +0200
Subject: [PATCH] Add a how-to section to the docs. #18998

This moves the "Performance Considerations for Elasticsearch Indexing" blog
post to the reference guide and adds similar recommendations for tuning disk
usage and search speed.
---
 docs/reference/how-to.asciidoc                |  22 ++
 docs/reference/how-to/disk-usage.asciidoc     | 159 ++++++++++++++
 docs/reference/how-to/indexing-speed.asciidoc | 106 ++++++++++
 docs/reference/how-to/search-speed.asciidoc   | 194 ++++++++++++++++++
 docs/reference/index.asciidoc                 |   2 +
 5 files changed, 483 insertions(+)
 create mode 100644 docs/reference/how-to.asciidoc
 create mode 100644 docs/reference/how-to/disk-usage.asciidoc
 create mode 100644 docs/reference/how-to/indexing-speed.asciidoc
 create mode 100644 docs/reference/how-to/search-speed.asciidoc

diff --git a/docs/reference/how-to.asciidoc b/docs/reference/how-to.asciidoc
new file mode 100644
index 00000000000..ee954553617
--- /dev/null
+++ b/docs/reference/how-to.asciidoc
@@ -0,0 +1,22 @@
+[[how-to]]
+= How To
+
+[partintro]
+--
+Elasticsearch ships with defaults which are intended to give a good
+out-of-the-box experience. Full text search, highlighting, aggregations, and
+indexing should all just work without the user having to change anything.
+
+Once you better understand how you want to use Elasticsearch, however,
+there are a number of optimizations you can make to improve performance
+for your use case.
+
+This section provides guidance about which changes should and shouldn't be
+made.
+--
+
+include::how-to/indexing-speed.asciidoc[]
+
+include::how-to/search-speed.asciidoc[]
+
+include::how-to/disk-usage.asciidoc[]
diff --git a/docs/reference/how-to/disk-usage.asciidoc b/docs/reference/how-to/disk-usage.asciidoc
new file mode 100644
index 00000000000..6465690ec96
--- /dev/null
+++ b/docs/reference/how-to/disk-usage.asciidoc
@@ -0,0 +1,159 @@
+[[tune-for-disk-usage]]
+== Tune for disk usage
+
+[float]
+=== Disable the features you do not need
+
+By default, Elasticsearch indexes and adds doc values to most fields so that
+they can be searched and aggregated out of the box. For instance, if you have a
+numeric field called `foo` that you need to run histograms on but that you
+never need to filter on, you can safely disable indexing on this field in your
+<>:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "integer",
+          "index": false
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+<> fields store normalization factors in the index in order to be
+able to score documents. If you only need matching capabilities on a `text`
+field but do not care about the produced scores, you can configure
+Elasticsearch to not write norms to the index:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "text",
+          "norms": false
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+<> fields also store frequencies and positions in the index by
+default. Frequencies are used to compute scores and positions are used to run
+phrase queries. If you do not need to run phrase queries, you can tell
+Elasticsearch to not index positions:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "text",
+          "index_options": "freqs"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+Furthermore, if you do not care about scoring either, you can configure
+Elasticsearch to just index matching documents for every term. You will
+still be able to search on this field, but phrase queries will raise errors
+and scoring will assume that terms appear only once in every document.
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "text",
+          "norms": false,
+          "index_options": "freqs"
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[float]
+=== Don't use default dynamic string mappings
+
+The default <> will index string fields
+both as <> and <>. This is wasteful if you only
+need one of them. Typically, an `id` field will only need to be indexed as a
+`keyword` while a `body` field will only need to be indexed as a `text` field.
+
+This can be avoided by either configuring explicit mappings on string fields
+or setting up dynamic templates that will map string fields as either `text`
+or `keyword`.
+
+For instance, here is a template that can be used in order to only map string
+fields as `keyword`:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "dynamic_templates": [
+        {
+          "strings": {
+            "match_mapping_type": "string",
+            "mapping": {
+              "type": "keyword"
+            }
+          }
+        }
+      ]
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[float]
+=== Disable `_all`
+
+The <> field indexes the value of all fields of a
+document and can use significant space. If you never need to search against all
+fields at the same time, it can be disabled.
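+
+For instance, a mapping along the following lines could be used to disable
+`_all` at index creation time. This is only a sketch that reuses the
+hypothetical `index` and `type` names from the examples above:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "_all": {
+        "enabled": false
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE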
+
+[float]
+=== Use `best_compression`
+
+The `_source` and stored fields can easily take a non-negligible amount of disk
+space. They can be compressed more aggressively by using the `best_compression`
+<>.
+
+[float]
+=== Use the smallest numeric type that is sufficient
+
+When storing <>, using `float` over `double`, or `half_float` over
+`float`, can help save storage. This is also true for integer types, but to a
+lesser extent, since Elasticsearch can more easily compress them based on the
+number of bits that they actually need.
+
diff --git a/docs/reference/how-to/indexing-speed.asciidoc b/docs/reference/how-to/indexing-speed.asciidoc
new file mode 100644
index 00000000000..8175c21ee8a
--- /dev/null
+++ b/docs/reference/how-to/indexing-speed.asciidoc
@@ -0,0 +1,106 @@
+[[tune-for-indexing-speed]]
+== Tune for indexing speed
+
+[float]
+=== Use bulk requests
+
+Bulk requests will yield much better performance than single-document index
+requests. In order to know the optimal size of a bulk request, you should run
+a benchmark on a single node with a single shard. First try to index 100
+documents at once, then 200, then 400, and so on, doubling the number of
+documents in a bulk request in every benchmark run. When the indexing speed
+starts to plateau, you know you have reached the optimal size of a bulk request
+for your data. In case of a tie, it is better to err in the direction of too
+few rather than too many documents. Beware that too large bulk requests might
+put the cluster under memory pressure when many of them are sent concurrently,
+so it is advisable to avoid going beyond a couple of tens of megabytes per
+request even if larger requests seem to perform better.
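+
+For reference, a small bulk request could look like the sketch below, which
+reuses the hypothetical `index` and `type` names used throughout this guide;
+each action line is followed by the corresponding document source on its own
+line:
+
+[source,js]
+--------------------------------------------------
+POST _bulk
+{ "index": { "_index": "index", "_type": "type", "_id": "1" } }
+{ "designation": "spoon", "price": 13 }
+{ "index": { "_index": "index", "_type": "type", "_id": "2" } }
+{ "designation": "fork", "price": 14 }
+--------------------------------------------------
+// CONSOLE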
+
+[float]
+=== Use multiple workers/threads to send data to Elasticsearch
+
+A single thread sending bulk requests is unlikely to be able to max out the
+indexing capacity of an Elasticsearch cluster. In order to use all resources
+of the cluster, you should send data from multiple threads or processes. In
+addition to making better use of the resources of the cluster, this should
+help reduce the cost of each fsync.
+
+Make sure to watch for `TOO_MANY_REQUESTS (429)` response codes
+(`EsRejectedExecutionException` with the Java client), which is the way that
+Elasticsearch tells you that it cannot keep up with the current indexing rate.
+When it happens, you should pause indexing for a bit before trying again,
+ideally with randomized exponential backoff.
+
+Similarly to sizing bulk requests, only testing can tell what the optimal
+number of workers is. This can be tested by progressively increasing the
+number of workers until either I/O or CPU is saturated on the cluster.
+
+[float]
+=== Increase the refresh interval
+
+The default <> is `1s`, which
+forces Elasticsearch to create a new segment every second.
+Increasing this value (to, say, `30s`) will allow larger segments to flush and
+will decrease future merge pressure.
+
+[float]
+=== Disable refresh and replicas for initial loads
+
+If you need to load a large amount of data at once, you should disable refresh
+by setting `index.refresh_interval` to `-1` and set `index.number_of_replicas`
+to `0`. This will temporarily put your index at risk since the loss of any
+shard will cause data loss, but at the same time indexing will be faster since
+documents will be indexed only once. Once the initial loading is finished, you
+can set `index.refresh_interval` and `index.number_of_replicas` back to their
+original values.
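+
+For instance, assuming an index named `index` as in the other examples of this
+guide, a settings update along the following lines could be used to disable
+both before the load; a similar request with your original values restores them
+once the load is done:
+
+[source,js]
+--------------------------------------------------
+PUT index/_settings
+{
+  "index": {
+    "refresh_interval": "-1",
+    "number_of_replicas": 0
+  }
+}
+--------------------------------------------------
+// CONSOLE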
+
+[float]
+=== Disable swapping
+
+You should make sure that the operating system is not swapping out the Java
+process by <>.
+
+[float]
+=== Give memory to the filesystem cache
+
+The filesystem cache will be used in order to buffer I/O operations. You should
+make sure to give at least half the memory of the machine running Elasticsearch
+to the filesystem cache.
+
+[float]
+=== Use faster hardware
+
+If indexing is I/O-bound, you should investigate giving more memory to the
+filesystem cache (see above) or buying faster drives. In particular, SSD drives
+are known to perform better than spinning disks. Always use local storage;
+remote filesystems such as `NFS` or `SMB` should be avoided. Also beware of
+virtualized storage such as Amazon's `Elastic Block Storage`. Virtualized
+storage works very well with Elasticsearch, and it is appealing since it is so
+fast and simple to set up, but it is also unfortunately inherently slower on an
+ongoing basis when compared to dedicated local storage. If you put an index on
+`EBS`, be sure to use provisioned IOPS, otherwise operations could be quickly
+throttled.
+
+Stripe your index across multiple SSDs by configuring a RAID 0 array. Remember
+that this will increase the risk of failure, since the failure of any one SSD
+destroys the index. However, this is typically the right tradeoff to make:
+optimize single shards for maximum performance, and then add replicas across
+different nodes so there's redundancy for any node failures. You can also use
+<> to back up the index for further
+insurance.
+
+[float]
+=== Indexing buffer size
+
+If your node is doing only heavy indexing, be sure
+<> is large enough to give
+at most 512 MB of indexing buffer per shard doing heavy indexing (beyond that
+indexing performance does not typically improve). Elasticsearch takes that
+setting (a percentage of the Java heap or an absolute byte-size), and
+uses it as a shared buffer across all active shards. Very active shards will
+naturally use this buffer more than shards that are performing lightweight
+indexing.
+
+The default is `10%`, which is often plenty: for example, if you give the JVM
+10GB of memory, it will give 1GB to the index buffer, which is enough to host
+two shards that are heavily indexing.
diff --git a/docs/reference/how-to/search-speed.asciidoc b/docs/reference/how-to/search-speed.asciidoc
new file mode 100644
index 00000000000..b6e9c4e927d
--- /dev/null
+++ b/docs/reference/how-to/search-speed.asciidoc
@@ -0,0 +1,194 @@
+[[tune-for-search-speed]]
+== Tune for search speed
+
+[float]
+=== Give memory to the filesystem cache
+
+Elasticsearch heavily relies on the filesystem cache in order to make search
+fast. In general, you should make sure that at least half the available memory
+goes to the filesystem cache so that Elasticsearch can keep hot regions of the
+index in physical memory.
+
+[float]
+=== Use faster hardware
+
+If your search is I/O-bound, you should investigate giving more memory to the
+filesystem cache (see above) or buying faster drives. In particular, SSD drives
+are known to perform better than spinning disks. Always use local storage;
+remote filesystems such as `NFS` or `SMB` should be avoided. Also beware of
+virtualized storage such as Amazon's `Elastic Block Storage`. Virtualized
+storage works very well with Elasticsearch, and it is appealing since it is so
+fast and simple to set up, but it is also unfortunately inherently slower on an
+ongoing basis when compared to dedicated local storage. If you put an index on
+`EBS`, be sure to use provisioned IOPS, otherwise operations could be quickly
+throttled.
+
+If your search is CPU-bound, you should investigate buying faster CPUs.
+
+[float]
+=== Document modeling
+
+Documents should be modeled so that search-time operations are as cheap as
+possible.
+
+In particular, joins should be avoided. <> can make queries
+several times slower and <> relations can make
+queries hundreds of times slower. So if the same questions can be answered
+without joins by denormalizing documents, significant speedups can be expected.
+
+[float]
+=== Pre-index data
+
+You should leverage patterns in your queries to optimize the way data is
+indexed. For instance, if all your documents have a `price` field and most
+queries run <> aggregations on a fixed list of ranges, you could make
+this aggregation faster by pre-indexing the ranges into the index and using a
+<> aggregation.
+
+For instance, if documents look like:
+
+[source,js]
+--------------------------------------------------
+PUT index/type/1
+{
+  "designation": "spoon",
+  "price": 13
+}
+--------------------------------------------------
+// CONSOLE
+
+and search requests look like:
+
+[source,js]
+--------------------------------------------------
+GET index/_search
+{
+  "aggs": {
+    "price_ranges": {
+      "range": {
+        "field": "price",
+        "ranges": [
+          { "to": 10 },
+          { "from": 10, "to": 100 },
+          { "from": 100 }
+        ]
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+Then documents could be enriched with a `price_range` field at index time,
+which should be mapped as a <>:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "price_range": {
+          "type": "keyword"
+        }
+      }
+    }
+  }
+}
+
+PUT index/type/1
+{
+  "designation": "spoon",
+  "price": 13,
+  "price_range": "10-100"
+}
+--------------------------------------------------
+// CONSOLE
+
+And then search requests could aggregate this new field rather than running a
+`range` aggregation on the `price` field:
+
+[source,js]
+--------------------------------------------------
+GET index/_search
+{
+  "aggs": {
+    "price_ranges": {
+      "terms": {
+        "field": "price_range"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+// TEST[continued]
+
+[float]
+=== Mappings
+
+The fact that some data is numeric does not mean it should always be mapped as
+a <>. Typically, fields storing identifiers such as an `ISBN`
+or any number identifying a record from another database might benefit from
+being mapped as <> rather than `integer` or `long`.
+
+[float]
+=== Avoid scripts
+
+In general, scripts should be avoided. If they are absolutely needed, you
+should prefer the `painless` and `expressions` engines.
+
+[float]
+=== Force-merge read-only indices
+
+Indices that are read-only would benefit from being
+<>. This is typically the
+case with time-based indices: only the index for the current time frame is
+getting new documents while older indices are read-only.
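+
+As a sketch, assuming a time-based index named `index-2016.05.31` that no
+longer receives writes (the name is only an example), it could be merged down
+to a single segment as follows:
+
+[source,js]
+--------------------------------------------------
+POST index-2016.05.31/_forcemerge?max_num_segments=1
+--------------------------------------------------
+// CONSOLE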
+
+IMPORTANT: Don't force-merge indices that are still being written to -- leave
+merging to the background merge process.
+
+[float]
+=== Warm up global ordinals
+
+Global ordinals are a data structure that is used in order to run
+<> aggregations on
+<> fields. They are loaded lazily in memory because
+Elasticsearch does not know which fields will be used in `terms` aggregations
+and which fields won't. You can tell Elasticsearch to load global ordinals
+eagerly at refresh time by configuring mappings as described below:
+
+[source,js]
+--------------------------------------------------
+PUT index
+{
+  "mappings": {
+    "type": {
+      "properties": {
+        "foo": {
+          "type": "keyword",
+          "eager_global_ordinals": true
+        }
+      }
+    }
+  }
+}
+--------------------------------------------------
+// CONSOLE
+
+[float]
+=== Warm up the filesystem cache
+
+If the machine running Elasticsearch is restarted, the filesystem cache will be
+empty, so it will take some time before the operating system loads hot regions
+of the index into memory so that search operations are fast. You can explicitly
+tell the operating system which files should be loaded into memory eagerly
+depending on the file extension using the <>
+setting.
+
+WARNING: Loading data into the filesystem cache eagerly on too many indices or
+too many files will make search _slower_ if the filesystem cache is not large
+enough to hold all the data. Use with caution.
diff --git a/docs/reference/index.asciidoc b/docs/reference/index.asciidoc
index 132d763f714..3a625ac1a2c 100644
--- a/docs/reference/index.asciidoc
+++ b/docs/reference/index.asciidoc
@@ -43,6 +43,8 @@ include::index-modules.asciidoc[]
 
 include::ingest.asciidoc[]
 
+include::how-to.asciidoc[]
+
 include::testing.asciidoc[]
 
 include::glossary.asciidoc[]