Skip to content

Commit

Permalink
Make the Execute retrier derive retry delay
Browse files Browse the repository at this point in the history
When a retryInfo is supplied, it should circumvent any other conditions
which would prevent retriability. Its delay will inform the subsequent
backoff delay supplied, assuming it is not beyond the retry count.
  • Loading branch information
werkt committed Nov 29, 2020
1 parent b965367 commit 551bc7a
Show file tree
Hide file tree
Showing 8 changed files with 163 additions and 88 deletions.
127 changes: 127 additions & 0 deletions src/main/java/com/google/devtools/build/lib/remote/ExecuteRetrier.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright 2020 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.devtools.build.lib.remote;

import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.protobuf.Any;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.Durations;
import com.google.rpc.DebugInfo;
import com.google.rpc.Help;
import com.google.rpc.LocalizedMessage;
import com.google.rpc.RetryInfo;
import com.google.rpc.RequestInfo;
import com.google.rpc.ResourceInfo;
import com.google.rpc.PreconditionFailure;
import com.google.rpc.PreconditionFailure.Violation;
import io.grpc.Status.Code;
import io.grpc.protobuf.StatusProto;

class ExecuteRetrier extends RemoteRetrier {

private static final String VIOLATION_TYPE_MISSING = "MISSING";

private static class RetryInfoBackoff implements Backoff {
private final int maxRetryAttempts;
int retries = 0;

RetryInfoBackoff(int maxRetryAttempts) {
this.maxRetryAttempts = maxRetryAttempts;
}

@Override
public long nextDelayMillis(Exception e) {
if (retries >= maxRetryAttempts) {
return -1;
}
RetryInfo retryInfo = getRetryInfo(e);
retries++;
return Durations.toMillis(retryInfo.getRetryDelay());
}

RetryInfo getRetryInfo(Exception e) {
RetryInfo retryInfo = RetryInfo.getDefaultInstance();
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status != null) {
for (Any detail : status.getDetailsList()) {
if (detail.is(RetryInfo.class)) {
try {
retryInfo = detail.unpack(RetryInfo.class);
} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen, ignore
}
}
}
}
return retryInfo;
}

@Override
public int getRetryAttempts() {
return retries;
}
}

ExecuteRetrier(
int maxRetryAttempts,
ListeningScheduledExecutorService retryService,
CircuitBreaker circuitBreaker) {
super(() -> maxRetryAttempts > 0 ? new RetryInfoBackoff(maxRetryAttempts) : RETRIES_DISABLED, ExecuteRetrier::test, retryService, circuitBreaker);
}

private static boolean test(Exception e) {
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
return true;
}
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status == null || status.getDetailsCount() == 0) {
return false;
}
boolean fullyRetriable = false;
boolean failedPrecondition = status.getCode() == Code.FAILED_PRECONDITION.value();
for (Any detail : status.getDetailsList()) {
if (detail.is(RetryInfo.class)) {
// server says we can retry, regardless of other details
fullyRetriable = true;
} else if (failedPrecondition && !fullyRetriable) {
if (detail.is(PreconditionFailure.class)) {
try {
PreconditionFailure f = detail.unpack(PreconditionFailure.class);
if (f.getViolationsCount() == 0) {
failedPrecondition = false;
}
for (Violation v : f.getViolationsList()) {
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
failedPrecondition = false;
}
}
// if *all* > 0 precondition failure violations have type MISSING, failedPrecondition remains true
} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen
return false;
}
} else if (!(detail.is(DebugInfo.class)
|| detail.is(Help.class)
|| detail.is(LocalizedMessage.class)
|| detail.is(RequestInfo.class)
|| detail.is(ResourceInfo.class))) { // ignore benign details
// consider all other details as failures
failedPrecondition = false;
}
}
}
return fullyRetriable || failedPrecondition;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ ExecuteResponse waitExecution() throws IOException {
//
// However, we only retry Execute() if executeBackoff should retry. Also increase the retry
// counter at the same time (done by nextDelayMillis()).
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis() >= 0) {
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis(e) >= 0) {
lastOperation = null;
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ public ExponentialBackoff(RemoteOptions options) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (attempts == maxAttempts) {
return -1;
}
Expand Down Expand Up @@ -221,13 +221,13 @@ public void reset() {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (currentBackoff == null) {
currentBackoff = backoffSupplier.get();
retries++;
return 0;
}
return currentBackoff.nextDelayMillis();
return currentBackoff.nextDelayMillis(e);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.devtools.build.lib.actions.ActionInput;
Expand Down Expand Up @@ -85,23 +86,12 @@
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import com.google.longrunning.Operation;
import com.google.protobuf.Any;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;
import com.google.protobuf.Timestamp;
import com.google.protobuf.util.Durations;
import com.google.protobuf.util.Timestamps;
import com.google.rpc.DebugInfo;
import com.google.rpc.Help;
import com.google.rpc.LocalizedMessage;
import com.google.rpc.RetryInfo;
import com.google.rpc.RequestInfo;
import com.google.rpc.ResourceInfo;
import com.google.rpc.PreconditionFailure;
import com.google.rpc.PreconditionFailure.Violation;
import io.grpc.Context;
import io.grpc.Status.Code;
import io.grpc.protobuf.StatusProto;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
Expand All @@ -120,48 +110,6 @@
@ThreadSafe
public class RemoteSpawnRunner implements SpawnRunner {

private static final String VIOLATION_TYPE_MISSING = "MISSING";

private static boolean retriableExecErrors(Exception e) {
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
return true;
}
if (!RemoteRetrierUtils.causedByStatus(e, Code.FAILED_PRECONDITION)) {
return false;
}
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status == null || status.getDetailsCount() == 0) {
return false;
}
for (Any detail : status.getDetailsList()) {
if (detail.is(PreconditionFailure.class)) {
try {
PreconditionFailure f = detail.unpack(PreconditionFailure.class);
if (f.getViolationsCount() == 0) {
return false; // Generally shouldn't happen
}
for (Violation v : f.getViolationsList()) {
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
return false;
}
}
} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen
return false;
}
} else if (!(detail.is(DebugInfo.class)
|| detail.is(Help.class)
|| detail.is(LocalizedMessage.class)
|| detail.is(RetryInfo.class)
|| detail.is(RequestInfo.class)
|| detail.is(ResourceInfo.class))) { // ignore benign details
// consider all other details as failures
return false;
}
}
return true; // if *all* > 0 precondition failure violations have type MISSING
}

private final Path execRoot;
private final RemoteOptions remoteOptions;
private final ExecutionOptions executionOptions;
Expand Down Expand Up @@ -831,11 +779,8 @@ static Collection<Path> resolveActionInputs(

private static RemoteRetrier createExecuteRetrier(
RemoteOptions options, ListeningScheduledExecutorService retryService) {
return new RemoteRetrier(
options.remoteMaxRetryAttempts > 0
? () -> new Retrier.ZeroBackoff(options.remoteMaxRetryAttempts)
: () -> Retrier.RETRIES_DISABLED,
RemoteSpawnRunner::retriableExecErrors,
return new ExecuteRetrier(
options.remoteMaxRetryAttempts,
retryService,
Retrier.ALLOW_ALL_CALLS);
}
Expand Down
12 changes: 6 additions & 6 deletions src/main/java/com/google/devtools/build/lib/remote/Retrier.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ public interface Backoff {
* Returns the next delay in milliseconds, or a value less than {@code 0} if we should stop
* retrying.
*/
long nextDelayMillis();
long nextDelayMillis(Exception e);

/**
* Returns the number of calls to {@link #nextDelayMillis()} thus far, not counting any calls
* Returns the number of calls to {@link #nextDelayMillis(Exception)} thus far, not counting any calls
* that returned less than {@code 0}.
*/
int getRetryAttempts();
Expand Down Expand Up @@ -140,7 +140,7 @@ public void recordSuccess() {}
public static final Backoff RETRIES_DISABLED =
new Backoff() {
@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
return -1;
}

Expand All @@ -161,7 +161,7 @@ public ZeroBackoff(int maxRetries) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (retries >= maxRetries) {
return -1;
}
Expand Down Expand Up @@ -253,7 +253,7 @@ public <T> T execute(Callable<T> call, Backoff backoff) throws Exception {
if (!shouldRetry.test(e)) {
throw e;
}
final long delayMillis = backoff.nextDelayMillis();
final long delayMillis = backoff.nextDelayMillis(e);
if (delayMillis < 0) {
throw e;
}
Expand Down Expand Up @@ -286,7 +286,7 @@ public <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, Backoff backo
private <T> ListenableFuture<T> onExecuteAsyncFailure(
Exception t, AsyncCallable<T> call, Backoff backoff) {
if (isRetriable(t)) {
long waitMillis = backoff.nextDelayMillis();
long waitMillis = backoff.nextDelayMillis(t);
if (waitMillis >= 0) {
try {
return Futures.scheduleAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertThrows;
import static org.junit.Assert.fail;
import static org.mockito.ArgumentMatchers.any;

import build.bazel.remote.execution.v2.Digest;
import build.bazel.remote.execution.v2.RequestMetadata;
Expand Down Expand Up @@ -330,7 +331,7 @@ public void queryWriteStatus(
uploader.uploadBlob(hash, chunker, true);

// This test should not have triggered any retries.
Mockito.verify(mockBackoff, Mockito.never()).nextDelayMillis();
Mockito.verify(mockBackoff, Mockito.never()).nextDelayMillis(any(Exception.class));
Mockito.verify(mockBackoff, Mockito.times(1)).getRetryAttempts();

blockUntilInternalStateConsistent(uploader);
Expand Down Expand Up @@ -469,7 +470,7 @@ public void queryWriteStatus(

// This test should have triggered a single retry, because it made
// no progress.
Mockito.verify(mockBackoff, Mockito.times(1)).nextDelayMillis();
Mockito.verify(mockBackoff, Mockito.times(1)).nextDelayMillis(any(Exception.class));

blockUntilInternalStateConsistent(uploader);

Expand Down Expand Up @@ -1402,7 +1403,7 @@ public FixedBackoff(int maxRetries, int delayMillis) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (retries < maxRetries) {
retries++;
return delayMillis;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1083,13 +1083,13 @@ public void read(ReadRequest request, StreamObserver<ReadResponse> responseObser
}
});
assertThat(new String(downloadBlob(client, digest), UTF_8)).isEqualTo("abcdefg");
Mockito.verify(mockBackoff, Mockito.never()).nextDelayMillis();
Mockito.verify(mockBackoff, Mockito.never()).nextDelayMillis(any(Exception.class));
}

@Test
public void downloadBlobPassesThroughDeadlineExceededWithoutProgress() throws IOException {
Backoff mockBackoff = Mockito.mock(Backoff.class);
Mockito.when(mockBackoff.nextDelayMillis()).thenReturn(-1L);
Mockito.when(mockBackoff.nextDelayMillis(any(Exception.class))).thenReturn(-1L);
final GrpcCacheClient client =
newClient(Options.getDefaults(RemoteOptions.class), () -> mockBackoff);
final Digest digest = DIGEST_UTIL.computeAsUtf8("abcdefg");
Expand All @@ -1109,7 +1109,7 @@ public void read(ReadRequest request, StreamObserver<ReadResponse> responseObser
IOException e = assertThrows(IOException.class, () -> downloadBlob(client, digest));
Status st = Status.fromThrowable(e);
assertThat(st.getCode()).isEqualTo(Status.Code.DEADLINE_EXCEEDED);
Mockito.verify(mockBackoff, Mockito.times(1)).nextDelayMillis();
Mockito.verify(mockBackoff, Mockito.times(1)).nextDelayMillis(any(Exception.class));
}

@Test
Expand Down
Loading

0 comments on commit 551bc7a

Please sign in to comment.