From b5131821a6c8f3a32a0b688b111344433c525efa Mon Sep 17 00:00:00 2001 From: Andrei Dan Date: Thu, 13 Feb 2020 20:21:41 +0000 Subject: [PATCH] ILM fix the init step to actually be retryable (#52076) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We marked the `init` ILM step as retryable but our test used `waitUntil` without an assert so we didn’t catch the fact that we were not actually able to retry this step as our ILM state didn’t contain any information about the policy execution (as we were in the process of initialising it). This commit manually sets the current step to `init` when we’re moving the ilm policy into the ERROR step (this enables us to successfully move to the error step and later retry the step) * ShrunkenIndexCheckStep: Use correct logger (cherry picked from commit f78d4b3d91345a2a8fc0f48b90dd66c9959bd7ff) Signed-off-by: Andrei Dan --- .../core/ilm/InitializePolicyContextStep.java | 34 +++++++++++-------- .../core/ilm/InitializePolicyException.java | 20 +++++++++++ .../core/ilm/ShrunkenIndexCheckStep.java | 2 +- .../ilm/InitializePolicyContextStepTests.java | 16 ++++----- .../ilm/TimeSeriesLifecycleActionsIT.java | 30 ++++++++-------- .../xpack/ilm/IndexLifecycleTransition.java | 13 +++++-- 6 files changed, 75 insertions(+), 40 deletions(-) create mode 100644 x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyException.java diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java index 194dcaa2e1eb8..536d72613cccc 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStep.java @@ -38,22 +38,26 @@ public ClusterState performAction(Index index, ClusterState clusterState) { return clusterState; } - LifecycleExecutionState lifecycleState = LifecycleExecutionState - .fromIndexMetadata(indexMetaData); - - if (lifecycleState.getLifecycleDate() != null) { - return clusterState; - } - IndexMetaData.Builder indexMetadataBuilder = IndexMetaData.builder(indexMetaData); - if (shouldParseIndexName(indexMetaData.getSettings())) { - long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName()); - indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1) - .settings(Settings.builder() - .put(indexMetaData.getSettings()) - .put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate) - .build() - ); + LifecycleExecutionState lifecycleState; + try { + lifecycleState = LifecycleExecutionState.fromIndexMetadata(indexMetaData); + if (lifecycleState.getLifecycleDate() != null) { + return clusterState; + } + + if (shouldParseIndexName(indexMetaData.getSettings())) { + long parsedOriginationDate = parseIndexNameAndExtractDate(index.getName()); + indexMetadataBuilder.settingsVersion(indexMetaData.getSettingsVersion() + 1) + .settings(Settings.builder() + .put(indexMetaData.getSettings()) + .put(LifecycleSettings.LIFECYCLE_ORIGINATION_DATE, parsedOriginationDate) + .build() + ); + } + } catch (Exception e) { + String policy = indexMetaData.getSettings().get(LifecycleSettings.LIFECYCLE_NAME); + throw new InitializePolicyException(policy, index.getName(), e); } ClusterState.Builder newClusterStateBuilder = ClusterState.builder(clusterState); diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyException.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyException.java new file mode 100644 index 0000000000000..8feda10abdbc2 --- /dev/null +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/InitializePolicyException.java @@ -0,0 +1,20 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.core.ilm; + +import org.elasticsearch.ElasticsearchException; + +import java.util.Locale; + +/** + * Exception thrown when a problem is encountered while initialising an ILM policy for an index. + */ +public class InitializePolicyException extends ElasticsearchException { + + public InitializePolicyException(String policy, String index, Throwable cause) { + super(String.format(Locale.ROOT, "unable to initialize policy [%s] for index [%s]", policy, index), cause); + } +} diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java index 6b6a70ca50172..715352dce2b43 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ilm/ShrunkenIndexCheckStep.java @@ -25,7 +25,7 @@ */ public class ShrunkenIndexCheckStep extends ClusterStateWaitStep { public static final String NAME = "is-shrunken-index"; - private static final Logger logger = LogManager.getLogger(InitializePolicyContextStep.class); + private static final Logger logger = LogManager.getLogger(ShrunkenIndexCheckStep.class); private String shrunkIndexPrefix; public ShrunkenIndexCheckStep(StepKey key, StepKey nextStepKey, String shrunkIndexPrefix) { diff --git a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java index 404240810f2e6..482f3e4183778 100644 --- a/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java +++ b/x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ilm/InitializePolicyContextStepTests.java @@ -33,14 +33,14 @@ public InitializePolicyContextStep mutateInstance(InitializePolicyContextStep in StepKey nextKey = instance.getNextStepKey(); switch (between(0, 1)) { - case 0: - key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5)); - break; - case 1: - nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5)); - break; - default: - throw new AssertionError("Illegal randomisation branch"); + case 0: + key = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5)); + break; + case 1: + nextKey = new StepKey(key.getPhase(), key.getAction(), key.getName() + randomAlphaOfLength(5)); + break; + default: + throw new AssertionError("Illegal randomisation branch"); } return new InitializePolicyContextStep(key, nextKey); diff --git a/x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java b/x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java index 731481b13ea04..5c97f1cf9cfa5 100644 --- a/x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java +++ b/x-pack/plugin/ilm/qa/multi-node/src/test/java/org/elasticsearch/xpack/ilm/TimeSeriesLifecycleActionsIT.java @@ -1185,26 +1185,26 @@ public void testRolloverStepRetriesUntilRolledOverIndexIsDeleted() throws Except // {@link org.elasticsearch.xpack.core.ilm.ErrorStep} in order to retry the failing step. As {@link #assertBusy} // increases the wait time between calls exponentially, we might miss the window where the policy is on // {@link WaitForRolloverReadyStep} and the move to `attempt-rollover` request will not be successful. - waitUntil(() -> { + assertTrue(waitUntil(() -> { try { return client().performRequest(moveToStepRequest).getStatusLine().getStatusCode() == 200; } catch (IOException e) { return false; } - }, 30, TimeUnit.SECONDS); + }, 30, TimeUnit.SECONDS)); // Similar to above, using {@link #waitUntil} as we want to make sure the `attempt-rollover` step started failing and is being // retried (which means ILM moves back and forth between the `attempt-rollover` step and the `error` step) - waitUntil(() -> { + assertTrue("ILM did not start retrying the attempt-rollover step", waitUntil(() -> { try { Map explainIndexResponse = explainIndex(index); - String step = (String) explainIndexResponse.get("step"); + String failedStep = (String) explainIndexResponse.get("failed_step"); Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD); - return step != null && step.equals("attempt-rollover") && retryCount != null && retryCount >= 1; + return failedStep != null && failedStep.equals("attempt-rollover") && retryCount != null && retryCount >= 1; } catch (IOException e) { return false; } - }, 30, TimeUnit.SECONDS); + }, 30, TimeUnit.SECONDS)); deleteIndex(rolledIndex); @@ -1246,16 +1246,17 @@ public void testUpdateRolloverLifecycleDateStepRetriesWhenRolloverInfoIsMissing( "}"); client().performRequest(moveToStepRequest); - waitUntil(() -> { + assertTrue("ILM did not start retrying the update-rollover-lifecycle-date step", waitUntil(() -> { try { Map explainIndexResponse = explainIndex(index); - String step = (String) explainIndexResponse.get("step"); + String failedStep = (String) explainIndexResponse.get("failed_step"); Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD); - return step != null && step.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null && retryCount >= 1; + return failedStep != null && failedStep.equals(UpdateRolloverLifecycleDateStep.NAME) && retryCount != null + && retryCount >= 1; } catch (IOException e) { return false; } - }); + }, 30, TimeUnit.SECONDS)); index(client(), index, "1", "foo", "bar"); Request refreshIndex = new Request("POST", "/" + index + "/_refresh"); @@ -1441,16 +1442,17 @@ public void testRetryableInitializationStep() throws Exception { assertOK(client().performRequest(startReq)); // Wait until an error has occurred. - waitUntil(() -> { + assertTrue("ILM did not start retrying the init step", waitUntil(() -> { try { Map explainIndexResponse = explainIndex(index); - String step = (String) explainIndexResponse.get("step"); + String failedStep = (String) explainIndexResponse.get("failed_step"); Integer retryCount = (Integer) explainIndexResponse.get(FAILED_STEP_RETRY_COUNT_FIELD); - return step != null && step.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null && retryCount >= 1; + return failedStep != null && failedStep.equals(InitializePolicyContextStep.KEY.getAction()) && retryCount != null + && retryCount >= 1; } catch (IOException e) { return false; } - }, 30, TimeUnit.SECONDS); + }, 30, TimeUnit.SECONDS)); // Turn origination date parsing back off updateIndexSettings(index, Settings.builder() diff --git a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java index 6b38515b10e0a..95b441861bfcd 100644 --- a/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java +++ b/x-pack/plugin/ilm/src/main/java/org/elasticsearch/xpack/ilm/IndexLifecycleTransition.java @@ -24,6 +24,7 @@ import org.elasticsearch.xpack.core.ilm.ErrorStep; import org.elasticsearch.xpack.core.ilm.IndexLifecycleMetadata; import org.elasticsearch.xpack.core.ilm.InitializePolicyContextStep; +import org.elasticsearch.xpack.core.ilm.InitializePolicyException; import org.elasticsearch.xpack.core.ilm.LifecycleExecutionState; import org.elasticsearch.xpack.core.ilm.LifecyclePolicyMetadata; import org.elasticsearch.xpack.core.ilm.LifecycleSettings; @@ -133,8 +134,16 @@ static ClusterState moveClusterStateToErrorStep(Index index, ClusterState cluste ElasticsearchException.generateThrowableXContent(causeXContentBuilder, STACKTRACE_PARAMS, cause); causeXContentBuilder.endObject(); LifecycleExecutionState currentState = LifecycleExecutionState.fromIndexMetadata(idxMeta); - Step.StepKey currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState), - "unable to move to an error step where there is no current step, state: " + currentState); + Step.StepKey currentStep; + // if an error is encountered while initialising the policy the lifecycle execution state will not yet contain any step information + // as we haven't yet initialised the policy, so we'll manually set the current step to be the "initialize policy" step so we can + // record the error (and later retry the init policy step) + if (cause instanceof InitializePolicyException) { + currentStep = InitializePolicyContextStep.KEY; + } else { + currentStep = Objects.requireNonNull(LifecycleExecutionState.getCurrentStepKey(currentState), + "unable to move to an error step where there is no current step, state: " + currentState); + } LifecycleExecutionState nextStepState = updateExecutionStateToStep(policyMetadata, currentState, new Step.StepKey(currentStep.getPhase(), currentStep.getAction(), ErrorStep.NAME), nowSupplier, false);