Update remote store stats api documentation (#5107)

* Update remote store stats api documentation 

This closes #4904

Signed-off-by: Ashish <ashking94@gmail.com>

* Apply suggestions from code review

Signed-off-by: Naarcha-AWS <97990722+Naarcha-AWS@users.noreply.github.com>

* Update remote-store-stats-api.md

* Apply suggestions from code review

Co-authored-by: Chris Moore <107723039+cwillum@users.noreply.github.com>
Signed-off-by: Naarcha-AWS <97990722+Naarcha-AWS@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Nathan Bower <nbower@amazon.com>
Signed-off-by: Naarcha-AWS <97990722+Naarcha-AWS@users.noreply.github.com>

---------

Signed-off-by: Ashish <ashking94@gmail.com>
Signed-off-by: Naarcha-AWS <97990722+Naarcha-AWS@users.noreply.github.com>
Co-authored-by: Naarcha-AWS <97990722+Naarcha-AWS@users.noreply.github.com>
Co-authored-by: Chris Moore <107723039+cwillum@users.noreply.github.com>
Co-authored-by: Nathan Bower <nbower@amazon.com>
This commit is contained in:
Ashish 2023-09-28 22:30:50 +05:30 committed by GitHub
parent 8ee9554998
commit 9242b88810
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 384 additions and 109 deletions

View File

@ -7,10 +7,13 @@ grand_parent: Availability and recovery
---
# Remote Store Stats API
Introduced 2.8
{: .label .label-purple }
Use the Remote Store Stats API to monitor shard-level remote store performance.
Use the Remote Store Stats API to monitor shard-level remote-backed storage performance.
Metrics returned from this API only relate to indexes stored on remote-backed nodes. For an aggregated output on an index at the node or cluster level, use the [Index Stats]({{site.url}}{{site.baseurl}}/api-reference/index-apis/stats/), [Nodes Stats]({{site.url}}{{site.baseurl}}/api-reference/nodes-apis/nodes-stats/), or [Cluster Stats]({{site.url}}{{site.baseurl}}/api-reference/cluster-api/cluster-stats/) API.
## Path and HTTP methods
@ -30,7 +33,7 @@ Parameter | Type | Description
## Remote store stats for an index
Use the following API to get remote store statistics for all shards of an index.
Use the following API to get remote store statistics for all index shards.
#### Example request
@ -49,90 +52,289 @@ GET _remotestore/stats/<index_name>
```json
{
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"stats": [
{
"shard_id": "[so][1]",
"refresh_time_lag_in_millis": 0,
"refresh_lag": 0,
"bytes_lag": 0,
"backpressure_rejection_count": 0,
"consecutive_failure_count": 0,
"total_remote_refresh": {
"started": 56,
"succeeded": 56,
"_shards": {
"total": 4,
"successful": 4,
"failed": 0
},
"total_uploads_in_bytes": {
"started": 1524670599,
"succeeded": 1524670599,
"failed": 0
},
"remote_refresh_size_in_bytes": {
"last_successful": 12711179,
"moving_avg": 30726409
},
"upload_latency_in_bytes_per_sec": {
"moving_avg": 25276841.3
},
"remote_refresh_latency_in_millis": {
"moving_avg": 964.7
}
},
{
"shard_id": "[so][0]",
"refresh_time_lag_in_millis": 5727,
"refresh_lag": 1,
"bytes_lag": 0,
"backpressure_rejection_count": 0,
"consecutive_failure_count": 0,
"total_remote_refresh": {
"started": 57,
"succeeded": 56,
"failed": 0
},
"total_uploads_in_bytes": {
"started": 1568138701,
"succeeded": 1568138701,
"failed": 0
},
"remote_refresh_size_in_bytes": {
"last_successful": 12705142,
"moving_avg": 32766119.75
},
"upload_latency_in_bytes_per_sec": {
"moving_avg": 25523682.95
},
"remote_refresh_latency_in_millis": {
"moving_avg": 990.55
}
"indices": {
"remote-index": {
"shards": {
"0": [{
"routing": {
"state": "STARTED",
"primary": true,
"node": "q1VxWZnCTICrfRc2bRW3nw"
},
"segment": {
"download": {},
"upload": {
"local_refresh_timestamp_in_millis": 1694171634102,
"remote_refresh_timestamp_in_millis": 1694171634102,
"refresh_time_lag_in_millis": 0,
"refresh_lag": 0,
"bytes_lag": 0,
"backpressure_rejection_count": 0,
"consecutive_failure_count": 0,
"total_uploads": {
"started": 5,
"succeeded": 5,
"failed": 0
},
"total_upload_size": {
"started_bytes": 15342,
"succeeded_bytes": 15342,
"failed_bytes": 0
},
"remote_refresh_size_in_bytes": {
"last_successful": 0,
"moving_avg": 3068.4
},
"upload_speed_in_bytes_per_sec": {
"moving_avg": 99988.2
},
"remote_refresh_latency_in_millis": {
"moving_avg": 44.0
}
}
},
"translog": {
"upload": {
"last_successful_upload_timestamp": 1694171633644,
"total_uploads": {
"started": 6,
"failed": 0,
"succeeded": 6
},
"total_upload_size": {
"started_bytes": 1932,
"failed_bytes": 0,
"succeeded_bytes": 1932
},
"total_upload_time_in_millis": 21478,
"upload_size_in_bytes": {
"moving_avg": 322.0
},
"upload_speed_in_bytes_per_sec": {
"moving_avg": 2073.8333333333335
},
"upload_time_in_millis": {
"moving_avg": 3579.6666666666665
}
},
"download": {}
}
},
{
"routing": {
"state": "STARTED",
"primary": false,
"node": "EZuen5Y5Sv-eDCLwh9gv-Q"
},
"segment": {
"download": {
"last_sync_timestamp": 1694171634148,
"total_download_size": {
"started_bytes": 15112,
"succeeded_bytes": 15112,
"failed_bytes": 0
},
"download_size_in_bytes": {
"last_successful": 2910,
"moving_avg": 1259.3333333333333
},
"download_speed_in_bytes_per_sec": {
"moving_avg": 382387.3333333333
}
},
"upload": {}
},
"translog": {
"upload": {},
"download": {}
}
}
],
"1": [{
"routing": {
"state": "STARTED",
"primary": false,
"node": "q1VxWZnCTICrfRc2bRW3nw"
},
"segment": {
"download": {
"last_sync_timestamp": 1694171633181,
"total_download_size": {
"started_bytes": 18978,
"succeeded_bytes": 18978,
"failed_bytes": 0
},
"download_size_in_bytes": {
"last_successful": 325,
"moving_avg": 1265.2
},
"download_speed_in_bytes_per_sec": {
"moving_avg": 456047.6666666667
}
},
"upload": {}
},
"translog": {
"upload": {},
"download": {}
}
},
{
"routing": {
"state": "STARTED",
"primary": true,
"node": "EZuen5Y5Sv-eDCLwh9gv-Q"
},
"segment": {
"download": {},
"upload": {
"local_refresh_timestamp_in_millis": 1694171633122,
"remote_refresh_timestamp_in_millis": 1694171633122,
"refresh_time_lag_in_millis": 0,
"refresh_lag": 0,
"bytes_lag": 0,
"backpressure_rejection_count": 0,
"consecutive_failure_count": 0,
"total_uploads": {
"started": 6,
"succeeded": 6,
"failed": 0
},
"total_upload_size": {
"started_bytes": 19208,
"succeeded_bytes": 19208,
"failed_bytes": 0
},
"remote_refresh_size_in_bytes": {
"last_successful": 0,
"moving_avg": 3201.3333333333335
},
"upload_speed_in_bytes_per_sec": {
"moving_avg": 109612.0
},
"remote_refresh_latency_in_millis": {
"moving_avg": 25.333333333333332
}
}
},
"translog": {
"upload": {
"last_successful_upload_timestamp": 1694171633106,
"total_uploads": {
"started": 7,
"failed": 0,
"succeeded": 7
},
"total_upload_size": {
"started_bytes": 2405,
"failed_bytes": 0,
"succeeded_bytes": 2405
},
"total_upload_time_in_millis": 27748,
"upload_size_in_bytes": {
"moving_avg": 343.57142857142856
},
"upload_speed_in_bytes_per_sec": {
"moving_avg": 1445.857142857143
},
"upload_time_in_millis": {
"moving_avg": 3964.0
}
},
"download": {}
}
}
]
}
}
}
]
}
```
</details>
### Response fields
The following table lists the available response fields.
The response body of the Remote Store Stats API is split into three categories:
* `routing` : Contains information related to the shards routing
* `segment` : Contains statistics related to segment transfers from remote-backed storage
* `translog` : Contains statistics related to translog transfers from remote-backed storage
#### routing
The `routing` object contains the following fields.
|Field |Description |
|:--- |:--- |
|`refresh_time_lag_in_millis` |The time (in milliseconds) the remote refresh is behind the local refresh. |
|`refresh_lag` | The number of refreshes by which the remote store is lagging behind the local store. |
|`bytes_lag` | The bytes lag between the remote and local store. |
|`backpressure_rejection_count` | The total number of write rejections made because of remote store backpressure. |
|`consecutive_failure_count` | The number of consecutive remote refresh failures since the last success. |
|`total_remote_refresh` |The total number of remote refreshes. |
|`total_uploads_in_bytes` | The total number of bytes in all uploads to the remote store. |
|`remote_refresh_size_in_bytes.last_successful` |The size of data uploaded in the last successful refresh. |
|`remote_refresh_size_in_bytes.moving_avg` |The average size of data (in bytes) uploaded in the last _N_ refreshes. _N_ is defined in `remote_store.segment.pressure.upload_bytes_moving_average_window_size`. For details, see [Remote segment backpressure]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). |
|`upload_latency_in_bytes_per_sec.moving_avg` |The average speed of remote store uploads (in bytes per second) for the last _N_ uploads. _N_ is defined in `remote_store.segment.pressure.upload_bytes_per_sec_moving_average_window_size`. For details, see [Remote segment backpressure]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). |
|`remote_refresh_latency_in_millis.moving_avg` |The average time taken by a single remote refresh during the last _N_ remote refreshes. _N_ is defined in `remote_store.segment.pressure.upload_time_moving_average_window_size`. For details, see [Remote segment backpressure]({{site.url}}{{site.baseurl}}/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). |
| `primary` | Denotes whether the shard copy is a primary shard. |
| `node` | The name of the node to which the shard is assigned. |
#### segment
The `segment.upload` object contains the following fields.
|Field |Description |
|:--- |:--- |
| `local_refresh_timestamp_in_millis` | The last successful local refresh timestamp, in milliseconds. |
| `remote_refresh_timestamp_in_millis` | The last successful remote refresh timestamp, in milliseconds. |
| `refresh_time_lag_in_millis` | The amount of time, in milliseconds, that the remote refresh is behind the local refresh. |
| `refresh_lag` | The number of refreshes by which the remote store is lagging behind the local store. |
| `bytes_lag` | The lag, in bytes, between the remote and local stores. |
| `backpressure_rejection_count` | The total number of write rejections issued due to backpressure in the remote store. |
| `consecutive_failure_count` | The number of consecutive remote refresh failures since the last successful refresh. |
| `total_remote_refresh` | The total number of remote refreshes. |
| `total_uploads_in_bytes` | The total number of bytes in all uploads to the remote store. |
| `remote_refresh_size_in_bytes.last_successful` | The size of the data uploaded during the last successful refresh. |
| `remote_refresh_size_in_bytes.moving_avg` | The average size of the data, in bytes, uploaded in the last *N* refreshes. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). |
| `upload_latency_in_bytes_per_sec.moving_avg` | The average speed of remote segment uploads, in bytes per second, for the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). |
| `remote_refresh_latency_in_millis.moving_avg` | The average amount of time, in milliseconds, taken by a single remote refresh during the last *N* remote refreshes. *N* is defined in the `remote_store.moving_average_window_size` setting. For more information, see [Remote segment backpressure](https://opensearch.org/docs/latest/tuning-your-cluster/availability-and-recovery/remote-store/remote-segment-backpressure/). |
The `segment.download` object contains the following fields.
|Field |Description |
|:--- |:--- |
| `last_sync_timestamp`| The timestamp, in milliseconds, since the last successful segment file download from remote-backed storage. |
| `total_download_size.started_bytes` | The total number of bytes of segment files actively being downloaded from remote-backed storage. |
| `total_download_size.succeeded_bytes` | The total number of bytes of segment files successfully downloaded from remote-backed storage. |
| `total_download_size.failed_bytes` | The total number of bytes of segment files that failed to download from remote-back storage. |
| `download_size_in_bytes.last_successful` | The size, in bytes, of the last segment file successfully downloaded from remote-backed storage. |
| `download_size_in_bytes.moving_avg` | The average size of segment data, in bytes, downloaded in the last 20 downloads. |
| `download_speed_in_bytes_per_sec.moving_avg` | The average download speed, in bytes per second, of the last 20 downloads. |
#### translog
The `translog.upload` object contains the following fields.
|Field |Description |
|:--- |:--- |
| `last_successful_upload_timestamp`| The timestamp, in milliseconds, since the last translog file successfully uploaded to remote-backed storage. |
| `total_uploads.started` | The total number of attempted translog upload syncs to remote-backed storage. |
| `total_uploads.failed` | The total number of failed translog upload syncs to remote-backed storage. |
| `total_uploads.succeeded` | The total number of successful translog upload syncs to remote-backed storage. |
| `total_upload_size.started_bytes` | The total number of bytes of translog files actively being downloaded from remote-backed storage. |
| `total_upload_size.succeeded_bytes` | The total number of bytes of translog files successfully uploaded to remote-backed storage. |
|`total_upload_size.failed_bytes` | The total number of bytes of translog files that failed to upload to remote-backed storage. |
| `total_upload_time_in_millis` | The total amount of time spent, in milliseconds, uploading translog files to remote-backed storage. |
| `upload_size_in_bytes.moving_avg` | The average size of translog data, in bytes, uploaded in the last *N* downloads. *N* is defined in the `remote_store.moving_average_window_size` setting. |
| `upload_speed_in_bytes_per_sec.moving_avg` | The average speed of translog uploads, in bytes per second, for the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. |
| `upload_time_in_millis.moving_avg` | The average amount of time taken by a single translog upload, in milliseconds, since the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. |
The `translog.download` object contains the following fields.
|Field |Description |
|:--- |:--- |
| `last_successful_download_timestamp` | The timestamp, in milliseconds, since the last translog file successfully uploaded to remote-backed storage. |
| `total_downloads.succeeded` | The total number of successful translog download syncs from remote-backed storage. |
| `total_download_size.succeeded_bytes` | The total number of bytes of translog files successfully uploaded from remote-backed storage. |
| `total_download_time_in_millis` | The total amount of time spent, in milliseconds, downloading translog files from remote-backed storage. |
| `download_size_in_bytes.moving_avg` | The average size of translog data, in bytes, downloaded in the last *N* downloads. *N* is defined in the `remote_store.moving_average_window_size` setting. |
| `download_speed_in_bytes_per_sec.moving_avg` | The average speed of translog downloads, in bytes per second, for the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. |
| `download_time_in_millis.moving_avg` | The average amount of time taken by a single translog download, in milliseconds, since the last *N* uploads. *N* is defined in the `remote_store.moving_average_window_size` setting. |
## Remote store stats for a single shard
@ -155,50 +357,123 @@ GET _remotestore/stats/<index_name>/<shard_id>
```json
{
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"stats": [
{
"shard_id": "[so][0]",
"refresh_time_lag_in_millis": 5727,
"refresh_lag": 1,
"bytes_lag": 0,
"backpressure_rejection_count": 0,
"consecutive_failure_count": 0,
"total_remote_refresh": {
"started": 57,
"succeeded": 56,
"_shards": {
"total": 2,
"successful": 2,
"failed": 0
},
"total_uploads_in_bytes": {
"started": 1568138701,
"succeeded": 1568138701,
"failed": 0
},
"remote_refresh_size_in_bytes": {
"last_successful": 12705142,
"moving_avg": 32766119.75
},
"upload_latency_in_bytes_per_sec": {
"moving_avg": 25523682.95
},
"remote_refresh_latency_in_millis": {
"moving_avg": 990.55
}
},
"indices": {
"remote-index": {
"shards": {
"0": [
{
"routing": {
"state": "STARTED",
"primary": true,
"node": "q1VxWZnCTICrfRc2bRW3nw"
},
"segment": {
"download": {},
"upload": {
"local_refresh_timestamp_in_millis": 1694171634102,
"remote_refresh_timestamp_in_millis": 1694171634102,
"refresh_time_lag_in_millis": 0,
"refresh_lag": 0,
"bytes_lag": 0,
"backpressure_rejection_count": 0,
"consecutive_failure_count": 0,
"total_uploads": {
"started": 5,
"succeeded": 5,
"failed": 0
},
"total_upload_size": {
"started_bytes": 15342,
"succeeded_bytes": 15342,
"failed_bytes": 0
},
"remote_refresh_size_in_bytes": {
"last_successful": 0,
"moving_avg": 3068.4
},
"upload_speed_in_bytes_per_sec": {
"moving_avg": 99988.2
},
"remote_refresh_latency_in_millis": {
"moving_avg": 44.0
}
}
},
"translog": {
"upload": {
"last_successful_upload_timestamp": 1694171633644,
"total_uploads": {
"started": 6,
"failed": 0,
"succeeded": 6
},
"total_upload_size": {
"started_bytes": 1932,
"failed_bytes": 0,
"succeeded_bytes": 1932
},
"total_upload_time_in_millis": 21478,
"upload_size_in_bytes": {
"moving_avg": 322.0
},
"upload_speed_in_bytes_per_sec": {
"moving_avg": 2073.8333333333335
},
"upload_time_in_millis": {
"moving_avg": 3579.6666666666665
}
},
"download": {}
}
},
{
"routing": {
"state": "STARTED",
"primary": false,
"node": "EZuen5Y5Sv-eDCLwh9gv-Q"
},
"segment": {
"download": {
"last_sync_timestamp": 1694171634148,
"total_download_size": {
"started_bytes": 15112,
"succeeded_bytes": 15112,
"failed_bytes": 0
},
"download_size_in_bytes": {
"last_successful": 2910,
"moving_avg": 1259.3333333333333
},
"download_speed_in_bytes_per_sec": {
"moving_avg": 382387.3333333333
}
},
"upload": {}
},
"translog": {
"upload": {},
"download": {}
}
}
]
}
}
}
]
}
```
</details>
### Remote store stats for a local shard
Provide the `local` query parameter set to `true` to only fetch the shards present on the node that is serving the request:
If you want to fetch only shards present on the node serving a Remote Store Stats API request, set the `local` query parameter to `true`, as shown in the following example request:
```json
GET _remotestore/stats/<index_name>?local=true
```
{% include copy-curl.html %}
{% include copy-curl.html %}