From 6cb2aef94374af1ddff0575e89a53656565da0f7 Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Tue, 13 Nov 2018 12:19:22 -0700 Subject: [PATCH] [ILM] Add documentation for error handling in ILM (#35426) * [ILM] Add documentation for error handling in ILM This adds some initial documentation for error handling and retrying failed steps for index lifecycle management --- docs/reference/ilm/error-handling.asciidoc | 161 ++++++++++++++++++ .../get-index-lifecycle-information.asciidoc | 4 - docs/reference/ilm/index.asciidoc | 2 + 3 files changed, 163 insertions(+), 4 deletions(-) create mode 100644 docs/reference/ilm/error-handling.asciidoc diff --git a/docs/reference/ilm/error-handling.asciidoc b/docs/reference/ilm/error-handling.asciidoc new file mode 100644 index 00000000000..8f0e4d6e4c0 --- /dev/null +++ b/docs/reference/ilm/error-handling.asciidoc @@ -0,0 +1,161 @@ +[role="xpack"] +[testenv="basic"] +[[index-lifecycle-error-handling]] +== Index Lifecycle Error Handling + +During Index Lifecycle Management's execution of the policy for an index, it's +possible for a step to encounter an error during its execution. When this +happens, ILM will move the management state into an "error" step. This halts +further execution of the policy and gives an administrator the chance to address +any issues with the policy, index, or cluster. + +An example will be helpful in illustrating this. Imagine the following policy +has been created by a user: + +[source,js] +-------------------------------------------------- +PUT _ilm/policy/shrink-the-index +{ + "policy": { + "phases": { + "warm": { + "min_age": "5d", + "actions": { + "shrink": { + "number_of_shards": 4 + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST + +This policy waits until the index is at least 5 days old, and then shrinks +the index to 4 shards. 
+ +Now imagine that a user creates a new index "myindex" with two primary shards, +telling it to use the policy they have created: + +[source,js] +-------------------------------------------------- +PUT /myindex +{ + "settings": { + "index.number_of_shards": 2, + "index.lifecycle.name": "shrink-the-index" + } +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +After five days have passed, ILM will attempt to shrink this index from 2 +shards to 4, which is invalid since the shrink action cannot increase the +number of shards. When this occurs, ILM will move this +index to the "error" step. Once an index is in this step, information about the +reason for the error can be retrieved from the <<ilm-explain,ILM Explain API>>: + +[source,js] +-------------------------------------------------- +GET /myindex/_ilm/explain +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +Which returns the following information: + +[source,js] +-------------------------------------------------- +{ + "indices" : { + "myindex" : { + "index" : "myindex", + "managed" : true, <1> + "policy" : "shrink-the-index", <2> + "lifecycle_date_millis" : 1541717265865, + "phase" : "warm", <3> + "phase_time_millis" : 1541717272601, + "action" : "shrink", <4> + "action_time_millis" : 1541717272601, + "step" : "ERROR", <5> + "step_time_millis" : 1541717272688, + "failed_step" : "shrink", <6> + "step_info" : { + "type" : "illegal_argument_exception", <7> + "reason" : "the number of target shards [4] must be less that the number of source shards [2]" <8> + }, + "phase_execution" : { + "policy" : "shrink-the-index", + "phase_definition" : { <9> + "min_age" : "5d", + "actions" : { + "shrink" : { + "number_of_shards" : 4 + } + } + }, + "version" : 1, + "modified_date_in_millis" : 1541717264230 + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TESTRESPONSE[skip:no way to know if we will get this response immediately] +<1> this index is 
managed by ILM +<2> the policy in question, in this case, "shrink-the-index" +<3> what phase the index is currently in +<4> what action the index is currently on +<5> what step the index is currently on, in this case, because there is an error, the index is in the "ERROR" step +<6> the name of the step that failed to execute, in this case "shrink" +<7> the error class that occurred during this step +<8> the error message that occurred during the execution failure +<9> the definition of the phase (in this case, the "warm" phase) that the index is currently on + +The index here has been moved to the error step because the shrink definition in +the policy is using an incorrect number of shards. So rectifying that in the +policy entails updating the existing policy to use one instead of four for +the targeted number of shards. + +[source,js] +-------------------------------------------------- +PUT _ilm/policy/shrink-the-index +{ + "policy": { + "phases": { + "warm": { + "min_age": "5d", + "actions": { + "shrink": { + "number_of_shards": 1 + } + } + } + } + } +} +-------------------------------------------------- +// CONSOLE +// TEST[continued] + +=== Retrying failed index lifecycle management steps + +Once the underlying issue that caused an index to move to the error step has +been corrected, index lifecycle management must be told to retry the step to see +if it can progress further. This is accomplished by invoking the retry API: + +[source,js] +-------------------------------------------------- +POST /myindex/_ilm/retry +-------------------------------------------------- +// CONSOLE +// TEST[skip:we can't be sure the index is ready to be retried at this point] + +Once this has been issued, index lifecycle management will asynchronously pick up +on the step that is in a failed state, attempting to re-run it. The +<<ilm-explain,ILM Explain API>> can again be used to monitor the status of +re-running the step. 
diff --git a/docs/reference/ilm/get-index-lifecycle-information.asciidoc b/docs/reference/ilm/get-index-lifecycle-information.asciidoc index 3d5dc8a1720..e30a0beb872 100644 --- a/docs/reference/ilm/get-index-lifecycle-information.asciidoc +++ b/docs/reference/ilm/get-index-lifecycle-information.asciidoc @@ -5,7 +5,3 @@ Execution Model Discuss how actions are actually split up into discrete steps and how you can see more information about where an index is within a policy (info and all) Talk about the jump-to-step API -Error Handling -Show error in explain api -Demonstrate the retry API -Show how to get a sense of progress for things like the allocate step diff --git a/docs/reference/ilm/index.asciidoc b/docs/reference/ilm/index.asciidoc index be966895556..a542aa61094 100644 --- a/docs/reference/ilm/index.asciidoc +++ b/docs/reference/ilm/index.asciidoc @@ -60,4 +60,6 @@ include::update-lifecycle-policy.asciidoc[] include::get-index-lifecycle-information.asciidoc[] +include::error-handling.asciidoc[] + include::start-stop-ilm.asciidoc[]