Skip to content

Commit 0fd3f76

Browse files
Cap max RetryableAction wait time/timeout. (#74940)
RetryableAction uses randomized and exponential back off. If unlucky, the randomization would cause a series of very short waits, which would double the bound every time, risking a subsequent very long wait. Now randomize between [bound/2, bound[. Closes #70996
1 parent 3ba4230 commit 0fd3f76

File tree

3 files changed

+18
-21
lines changed

3 files changed

+18
-21
lines changed

server/src/main/java/org/elasticsearch/action/support/RetryableAction.java

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,12 +103,12 @@ public void onRejection(Exception e) {
103103

104104
public abstract boolean shouldRetry(Exception e);
105105

106-
protected long calculateDelay(long previousDelay) {
107-
return Math.min(previousDelay * 2, Integer.MAX_VALUE);
106+
protected long calculateDelayBound(long previousDelayBound) {
107+
return Math.min(previousDelayBound * 2, Integer.MAX_VALUE);
108108
}
109109

110110
protected long minimumDelayMillis() {
111-
return 1L;
111+
return 0L;
112112
}
113113

114114
public void onFinished() {
@@ -145,10 +145,12 @@ public void onFailure(Exception e) {
145145
} else {
146146
addException(e);
147147

148-
final long nextDelayMillisBound = calculateDelay(delayMillisBound);
148+
final long nextDelayMillisBound = calculateDelayBound(delayMillisBound);
149149
final RetryingListener retryingListener = new RetryingListener(nextDelayMillisBound, caughtExceptions);
150150
final Runnable runnable = createRunnable(retryingListener);
151-
final long delayMillis = Randomness.get().nextInt(Math.toIntExact(delayMillisBound)) + minimumDelayMillis();
151+
int range = Math.toIntExact((delayMillisBound + 1) / 2);
152+
final long delayMillis = Randomness.get().nextInt(range) + delayMillisBound - range + 1L;
153+
assert delayMillis > 0;
152154
if (isDone.get() == false) {
153155
final TimeValue delay = TimeValue.timeValueMillis(delayMillis);
154156
logger.debug(() -> new ParameterizedMessage("retrying action that failed in {}", delay), e);

server/src/test/java/org/elasticsearch/action/support/RetryableActionTests.java

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ public void testRetryableActionTimeout() {
102102
final AtomicInteger retryCount = new AtomicInteger();
103103
final PlainActionFuture<Boolean> future = PlainActionFuture.newFuture();
104104
final RetryableAction<Boolean> retryableAction = new RetryableAction<>(logger, taskQueue.getThreadPool(),
105-
TimeValue.timeValueMillis(10), TimeValue.timeValueSeconds(1), future) {
105+
TimeValue.timeValueMillis(randomFrom(1, 10, randomIntBetween(100, 2000))), TimeValue.timeValueSeconds(1), future) {
106106

107107
@Override
108108
public void tryAction(ActionListener<Boolean> listener) {
@@ -119,6 +119,7 @@ public boolean shouldRetry(Exception e) {
119119
return e instanceof EsRejectedExecutionException;
120120
}
121121
};
122+
long begin = taskQueue.getCurrentTimeMillis();
122123
retryableAction.run();
123124
taskQueue.runAllRunnableTasks();
124125
long previousDeferredTime = 0;
@@ -133,6 +134,10 @@ public boolean shouldRetry(Exception e) {
133134
assertFalse(taskQueue.hasRunnableTasks());
134135

135136
expectThrows(EsRejectedExecutionException.class, future::actionGet);
137+
138+
long end = taskQueue.getCurrentTimeMillis();
139+
// max 3x timeout since we minimum wait half the bound for every retry.
140+
assertThat(end - begin, lessThanOrEqualTo(3000L));
136141
}
137142

138143
public void testTimeoutOfZeroMeansNoRetry() {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/utils/persistence/ResultsPersisterService.java

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,6 @@ private abstract class MlRetryableAction<Request, Response> extends RetryableAct
390390
final Consumer<String> msgHandler;
391391
final BiConsumer<Request, ActionListener<Response>> action;
392392
volatile int currentAttempt = 0;
393-
volatile long currentMin = MIN_RETRY_SLEEP_MILLIS;
394393
volatile long currentMax = MIN_RETRY_SLEEP_MILLIS;
395394

396395
MlRetryableAction(String jobId,
@@ -453,30 +452,21 @@ public boolean shouldRetry(Exception e) {
453452
}
454453

455454
@Override
456-
protected long calculateDelay(long previousDelay) {
457-
// Since we exponentially increase, we don't want force randomness to have an excessively long sleep
458-
if (currentMax < MAX_RETRY_SLEEP_MILLIS) {
459-
currentMin = currentMax;
460-
}
455+
protected long calculateDelayBound(long previousDelayBound) {
461456
// Exponential backoff calculation taken from: https://en.wikipedia.org/wiki/Exponential_backoff
462457
int uncappedBackoff = ((1 << Math.min(currentAttempt, MAX_RETRY_EXPONENT)) - 1) * (50);
463458
currentMax = Math.min(uncappedBackoff, MAX_RETRY_SLEEP_MILLIS);
464-
// Its good to have a random window along the exponentially increasing curve
465-
// so that not all bulk requests rest for the same amount of time
466-
int randBound = (int)(1 + (currentMax - currentMin));
467459
String msg = new ParameterizedMessage(
468460
"failed to {} after [{}] attempts. Will attempt again.",
469461
getName(),
470462
currentAttempt)
471463
.getFormattedMessage();
472464
LOGGER.warn(() -> new ParameterizedMessage("[{}] {}", jobId, msg));
473465
msgHandler.accept(msg);
474-
return randBound;
475-
}
476-
477-
@Override
478-
protected long minimumDelayMillis() {
479-
return currentMin;
466+
// RetryableAction randomizes in the interval [currentMax/2 ; currentMax].
467+
// Its good to have a random window along the exponentially increasing curve
468+
// so that not all bulk requests rest for the same amount of time
469+
return currentMax;
480470
}
481471

482472
@Override

0 commit comments

Comments
 (0)