From 02b373d7550ca916382dcb9dc5e14c52a10cf548 Mon Sep 17 00:00:00 2001 From: Henning Andersen Date: Mon, 5 Jul 2021 18:58:07 +0200 Subject: [PATCH 1/3] Cap max RetryableAction wait time/timeout. RetryableAction uses randomized and exponential back off. If unlucky, the randomization would cause a series of very short waits, which would double the bound every time, risking a subsequent very long wait. Now randomize between [bound/2, bound[. Closes #70996 --- .../action/support/RetryableAction.java | 6 ++++-- .../action/support/RetryableActionTests.java | 7 ++++++- .../persistence/ResultsPersisterService.java | 18 ++++-------------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java b/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java index ae49afb955c4f..45c82f67093a0 100644 --- a/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java +++ b/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java @@ -108,7 +108,7 @@ protected long calculateDelay(long previousDelay) { } protected long minimumDelayMillis() { - return 1L; + return 0L; } public void onFinished() { @@ -148,7 +148,9 @@ public void onFailure(Exception e) { final long nextDelayMillisBound = calculateDelay(delayMillisBound); final RetryingListener retryingListener = new RetryingListener(nextDelayMillisBound, caughtExceptions); final Runnable runnable = createRunnable(retryingListener); - final long delayMillis = Randomness.get().nextInt(Math.toIntExact(delayMillisBound)) + minimumDelayMillis(); + int range = Math.toIntExact((delayMillisBound + 1) / 2); + final long delayMillis = Randomness.get().nextInt(range) + delayMillisBound - range + 1L; + assert delayMillis > 0; if (isDone.get() == false) { final TimeValue delay = TimeValue.timeValueMillis(delayMillis); logger.debug(() -> new ParameterizedMessage("retrying action that failed in {}", delay), e); diff --git a/server/src/test/java/org/elasticsearch/action/support/RetryableActionTests.java b/server/src/test/java/org/elasticsearch/action/support/RetryableActionTests.java index 6b5f35f6c7d6b..d9700a1458a43 100644 --- a/server/src/test/java/org/elasticsearch/action/support/RetryableActionTests.java +++ b/server/src/test/java/org/elasticsearch/action/support/RetryableActionTests.java @@ -102,7 +102,7 @@ public void testRetryableActionTimeout() { final AtomicInteger retryCount = new AtomicInteger(); final PlainActionFuture future = PlainActionFuture.newFuture(); final RetryableAction retryableAction = new RetryableAction<>(logger, taskQueue.getThreadPool(), - TimeValue.timeValueMillis(10), TimeValue.timeValueSeconds(1), future) { + TimeValue.timeValueMillis(randomFrom(1, 10, randomIntBetween(100, 2000))), TimeValue.timeValueSeconds(1), future) { @Override public void tryAction(ActionListener listener) { @@ -119,6 +119,7 @@ public boolean shouldRetry(Exception e) { return e instanceof EsRejectedExecutionException; } }; + long begin = taskQueue.getCurrentTimeMillis(); retryableAction.run(); taskQueue.runAllRunnableTasks(); long previousDeferredTime = 0; @@ -133,6 +134,10 @@ public boolean shouldRetry(Exception e) { assertFalse(taskQueue.hasRunnableTasks()); expectThrows(EsRejectedExecutionException.class, future::actionGet); + + long end = taskQueue.getCurrentTimeMillis(); + // max 3x timeout since we minimum wait half the bound for every retry. + assertThat(end - begin, lessThanOrEqualTo(3000L)); } public void testTimeoutOfZeroMeansNoRetry() { diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java index 368f5d81fb266..2a4e5c29c5a74 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java @@ -390,7 +390,6 @@ private abstract class MlRetryableAction extends RetryableAct final Consumer msgHandler; final BiConsumer> action; volatile int currentAttempt = 0; - volatile long currentMin = MIN_RETRY_SLEEP_MILLIS; volatile long currentMax = MIN_RETRY_SLEEP_MILLIS; MlRetryableAction(String jobId, @@ -454,16 +453,9 @@ public boolean shouldRetry(Exception e) { @Override protected long calculateDelay(long previousDelay) { - // Since we exponentially increase, we don't want force randomness to have an excessively long sleep - if (currentMax < MAX_RETRY_SLEEP_MILLIS) { - currentMin = currentMax; - } // Exponential backoff calculation taken from: https://en.wikipedia.org/wiki/Exponential_backoff int uncappedBackoff = ((1 << Math.min(currentAttempt, MAX_RETRY_EXPONENT)) - 1) * (50); currentMax = Math.min(uncappedBackoff, MAX_RETRY_SLEEP_MILLIS); - // Its good to have a random window along the exponentially increasing curve - // so that not all bulk requests rest for the same amount of time - int randBound = (int)(1 + (currentMax - currentMin)); String msg = new ParameterizedMessage( "failed to {} after [{}] attempts. Will attempt again.", getName(), @@ -471,12 +463,10 @@ protected long calculateDelay(long previousDelay) { .getFormattedMessage(); LOGGER.warn(() -> new ParameterizedMessage("[{}] {}", jobId, msg)); msgHandler.accept(msg); - return randBound; - } - - @Override - protected long minimumDelayMillis() { - return currentMin; + // RetryableAction randomizes in the intervalu [currentMax/2 ; currentMax]. + // Its good to have a random window along the exponentially increasing curve + // so that not all bulk requests rest for the same amount of time + return currentMax; } @Override From 31af8591b16bfeb2abf0914b549de4a80bb04bef Mon Sep 17 00:00:00 2001 From: Henning Andersen <33268011+henningandersen@users.noreply.github.com> Date: Thu, 5 Aug 2021 08:25:45 +0200 Subject: [PATCH 2/3] Update x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java Co-authored-by: David Kyle --- .../xpack/ml/utils/persistence/ResultsPersisterService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java index 2a4e5c29c5a74..464963fbaabc3 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java @@ -463,7 +463,7 @@ protected long calculateDelay(long previousDelay) { .getFormattedMessage(); LOGGER.warn(() -> new ParameterizedMessage("[{}] {}", jobId, msg)); msgHandler.accept(msg); - // RetryableAction randomizes in the intervalu [currentMax/2 ; currentMax]. + // RetryableAction randomizes in the interval [currentMax/2 ; currentMax]. // Its good to have a random window along the exponentially increasing curve // so that not all bulk requests rest for the same amount of time return currentMax; From f69cc7b6f9c4974fde056e23a97a881eaf1faff7 Mon Sep 17 00:00:00 2001 From: Henning Andersen Date: Thu, 5 Aug 2021 09:59:06 +0200 Subject: [PATCH 3/3] Rename calculateDelay --- .../org/elasticsearch/action/support/RetryableAction.java | 6 +++--- .../xpack/ml/utils/persistence/ResultsPersisterService.java | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java b/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java index 45c82f67093a0..ca223346cdd9d 100644 --- a/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java +++ b/server/src/main/java/org/elasticsearch/action/support/RetryableAction.java @@ -103,8 +103,8 @@ public void onRejection(Exception e) { public abstract boolean shouldRetry(Exception e); - protected long calculateDelay(long previousDelay) { - return Math.min(previousDelay * 2, Integer.MAX_VALUE); + protected long calculateDelayBound(long previousDelayBound) { + return Math.min(previousDelayBound * 2, Integer.MAX_VALUE); } protected long minimumDelayMillis() { @@ -145,7 +145,7 @@ public void onFailure(Exception e) { } else { addException(e); - final long nextDelayMillisBound = calculateDelay(delayMillisBound); + final long nextDelayMillisBound = calculateDelayBound(delayMillisBound); final RetryingListener retryingListener = new RetryingListener(nextDelayMillisBound, caughtExceptions); final Runnable runnable = createRunnable(retryingListener); int range = Math.toIntExact((delayMillisBound + 1) / 2); diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java index 464963fbaabc3..f5750fcb00931 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java @@ -452,7 +452,7 @@ public boolean shouldRetry(Exception e) { } @Override - protected long calculateDelay(long previousDelay) { + protected long calculateDelayBound(long previousDelayBound) { // Exponential backoff calculation taken from: https://en.wikipedia.org/wiki/Exponential_backoff int uncappedBackoff = ((1 << Math.min(currentAttempt, MAX_RETRY_EXPONENT)) - 1) * (50); currentMax = Math.min(uncappedBackoff, MAX_RETRY_SLEEP_MILLIS);