Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Status error presentation with details #12564

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -385,11 +385,6 @@ public String getDetailMessage(
String reason = "(" + status.toShortString() + ")"; // e.g "(Exit 1)"
String explanation = Strings.isNullOrEmpty(message) ? "" : ": " + message;

if (!status().isConsideredUserError()) {
String errorDetail = status().name().toLowerCase(Locale.US)
.replace('_', ' ');
explanation += ". Note: Remote connection/protocol failed with: " + errorDetail;
}
if (status() == Status.TIMEOUT) {
if (getWallTime().isPresent()) {
explanation +=
Expand All @@ -407,9 +402,6 @@ public String getDetailMessage(
explanation += " Action tagged as local was forcibly run remotely and failed - it's "
+ "possible that the action simply doesn't work remotely";
}
if (!Strings.isNullOrEmpty(failureMessage)) {
explanation += " " + failureMessage;
}
return reason + explanation;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright 2020 The Bazel Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.devtools.build.lib.remote;

import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.protobuf.Any;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.util.Durations;
import com.google.rpc.DebugInfo;
import com.google.rpc.Help;
import com.google.rpc.LocalizedMessage;
import com.google.rpc.RetryInfo;
import com.google.rpc.RequestInfo;
import com.google.rpc.ResourceInfo;
import com.google.rpc.PreconditionFailure;
import com.google.rpc.PreconditionFailure.Violation;
import io.grpc.Status.Code;
import io.grpc.protobuf.StatusProto;

class ExecuteRetrier extends RemoteRetrier {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: please add javadoc

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hope a single line is sufficient here, didn't see much else on the other classes I looked at.


private static final String VIOLATION_TYPE_MISSING = "MISSING";

private static class RetryInfoBackoff implements Backoff {
private final int maxRetryAttempts;
int retries = 0;

RetryInfoBackoff(int maxRetryAttempts) {
this.maxRetryAttempts = maxRetryAttempts;
}

@Override
public long nextDelayMillis(Exception e) {
if (retries >= maxRetryAttempts) {
return -1;
}
RetryInfo retryInfo = getRetryInfo(e);
retries++;
return Durations.toMillis(retryInfo.getRetryDelay());
}

RetryInfo getRetryInfo(Exception e) {
RetryInfo retryInfo = RetryInfo.getDefaultInstance();
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status != null) {
for (Any detail : status.getDetailsList()) {
if (detail.is(RetryInfo.class)) {
try {
retryInfo = detail.unpack(RetryInfo.class);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

break here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want the retryInfo to have a deterministic last-specified behavior. Not that I want to see multiple from the service, but if it does, the last one in the list should be effective.

} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen, ignore
}
}
}
}
return retryInfo;
}

@Override
public int getRetryAttempts() {
return retries;
}
}

ExecuteRetrier(
int maxRetryAttempts,
ListeningScheduledExecutorService retryService,
CircuitBreaker circuitBreaker) {
super(() -> maxRetryAttempts > 0 ? new RetryInfoBackoff(maxRetryAttempts) : RETRIES_DISABLED, ExecuteRetrier::test, retryService, circuitBreaker);
}

private static boolean test(Exception e) {
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
return true;
}
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status == null || status.getDetailsCount() == 0) {
return false;
}
boolean fullyRetriable = false;
boolean failedPrecondition = status.getCode() == Code.FAILED_PRECONDITION.value();
for (Any detail : status.getDetailsList()) {
if (detail.is(RetryInfo.class)) {
// server says we can retry, regardless of other details
fullyRetriable = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

break here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Early return here - nothing after it is effective, and coming up with whether the precondition failure is effective is meaningless. Thanks!

} else if (failedPrecondition && !fullyRetriable) {
if (detail.is(PreconditionFailure.class)) {
try {
PreconditionFailure f = detail.unpack(PreconditionFailure.class);
if (f.getViolationsCount() == 0) {
failedPrecondition = false;
}
for (Violation v : f.getViolationsList()) {
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
failedPrecondition = false;
}
}
// if *all* > 0 precondition failure violations have type MISSING, failedPrecondition remains true
} catch (InvalidProtocolBufferException protoEx) {
// really shouldn't happen
return false;
}
} else if (!(detail.is(DebugInfo.class)
|| detail.is(Help.class)
|| detail.is(LocalizedMessage.class)
|| detail.is(RequestInfo.class)
|| detail.is(ResourceInfo.class))) { // ignore benign details
// consider all other details as failures
failedPrecondition = false;
}
}
}
return fullyRetriable || failedPrecondition;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,8 @@ public boolean isExecutionTimeout() {
public ExecuteResponse getResponse() {
return response;
}

public Status getOriginalStatus() {
return status;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ ExecuteResponse waitExecution() throws IOException {
//
// However, we only retry Execute() if executeBackoff should retry. Also increase the retry
// counter at the same time (done by nextDelayMillis()).
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis() >= 0) {
if (e.getStatus().getCode() == Code.NOT_FOUND && executeBackoff.nextDelayMillis(e) >= 0) {
lastOperation = null;
return null;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ public ExponentialBackoff(RemoteOptions options) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (attempts == maxAttempts) {
return -1;
}
Expand Down Expand Up @@ -221,13 +221,13 @@ public void reset() {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (currentBackoff == null) {
currentBackoff = backoffSupplier.get();
retries++;
return 0;
}
return currentBackoff.nextDelayMillis();
return currentBackoff.nextDelayMillis(e);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.devtools.build.lib.actions.ActionInput;
Expand Down Expand Up @@ -85,17 +86,12 @@
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import com.google.longrunning.Operation;
import com.google.protobuf.Any;
import com.google.protobuf.InvalidProtocolBufferException;
import com.google.protobuf.Message;
import com.google.protobuf.Timestamp;
import com.google.protobuf.util.Durations;
import com.google.protobuf.util.Timestamps;
import com.google.rpc.PreconditionFailure;
import com.google.rpc.PreconditionFailure.Violation;
import io.grpc.Context;
import io.grpc.Status.Code;
import io.grpc.protobuf.StatusProto;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
Expand All @@ -114,38 +110,6 @@
@ThreadSafe
public class RemoteSpawnRunner implements SpawnRunner {

private static final String VIOLATION_TYPE_MISSING = "MISSING";

private static boolean retriableExecErrors(Exception e) {
if (BulkTransferException.isOnlyCausedByCacheNotFoundException(e)) {
return true;
}
if (!RemoteRetrierUtils.causedByStatus(e, Code.FAILED_PRECONDITION)) {
return false;
}
com.google.rpc.Status status = StatusProto.fromThrowable(e);
if (status == null || status.getDetailsCount() == 0) {
return false;
}
for (Any details : status.getDetailsList()) {
PreconditionFailure f;
try {
f = details.unpack(PreconditionFailure.class);
} catch (InvalidProtocolBufferException protoEx) {
return false;
}
if (f.getViolationsCount() == 0) {
return false; // Generally shouldn't happen
}
for (Violation v : f.getViolationsList()) {
if (!v.getType().equals(VIOLATION_TYPE_MISSING)) {
return false;
}
}
}
return true; // if *all* > 0 violations have type MISSING
}

private final Path execRoot;
private final RemoteOptions remoteOptions;
private final ExecutionOptions executionOptions;
Expand Down Expand Up @@ -656,12 +620,10 @@ private SpawnResult handleError(
catastrophe = false;
}

final String errorMessage;
if (!verboseFailures) {
errorMessage = Utils.grpcAwareErrorMessage(exception);
} else {
String errorMessage = Utils.grpcAwareErrorMessage(exception);
if (verboseFailures) {
// On --verbose_failures print the whole stack trace
errorMessage = Throwables.getStackTraceAsString(exception);
errorMessage += "\n" + Throwables.getStackTraceAsString(exception);
}

return new SpawnResult.Builder()
Expand Down Expand Up @@ -817,11 +779,8 @@ static Collection<Path> resolveActionInputs(

private static RemoteRetrier createExecuteRetrier(
RemoteOptions options, ListeningScheduledExecutorService retryService) {
return new RemoteRetrier(
options.remoteMaxRetryAttempts > 0
? () -> new Retrier.ZeroBackoff(options.remoteMaxRetryAttempts)
: () -> Retrier.RETRIES_DISABLED,
RemoteSpawnRunner::retriableExecErrors,
return new ExecuteRetrier(
options.remoteMaxRetryAttempts,
retryService,
Retrier.ALLOW_ALL_CALLS);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ public interface Backoff {
* Returns the next delay in milliseconds, or a value less than {@code 0} if we should stop
* retrying.
*/
long nextDelayMillis();
long nextDelayMillis(Exception e);

/**
* Returns the number of calls to {@link #nextDelayMillis()} thus far, not counting any calls
* Returns the number of calls to {@link #nextDelayMillis(Exception)} thus far, not counting any calls
* that returned less than {@code 0}.
*/
int getRetryAttempts();
Expand Down Expand Up @@ -140,7 +140,7 @@ public void recordSuccess() {}
public static final Backoff RETRIES_DISABLED =
new Backoff() {
@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
return -1;
}

Expand All @@ -161,7 +161,7 @@ public ZeroBackoff(int maxRetries) {
}

@Override
public long nextDelayMillis() {
public long nextDelayMillis(Exception e) {
if (retries >= maxRetries) {
return -1;
}
Expand Down Expand Up @@ -253,7 +253,7 @@ public <T> T execute(Callable<T> call, Backoff backoff) throws Exception {
if (!shouldRetry.test(e)) {
throw e;
}
final long delayMillis = backoff.nextDelayMillis();
final long delayMillis = backoff.nextDelayMillis(e);
if (delayMillis < 0) {
throw e;
}
Expand Down Expand Up @@ -286,7 +286,7 @@ public <T> ListenableFuture<T> executeAsync(AsyncCallable<T> call, Backoff backo
private <T> ListenableFuture<T> onExecuteAsyncFailure(
Exception t, AsyncCallable<T> call, Backoff backoff) {
if (isRetriable(t)) {
long waitMillis = backoff.nextDelayMillis();
long waitMillis = backoff.nextDelayMillis(t);
if (waitMillis >= 0) {
try {
return Futures.scheduleAsync(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ java_library(
"//src/main/java/com/google/devtools/build/lib/analysis:blaze_version_info",
"//src/main/java/com/google/devtools/build/lib/authandtls",
"//src/main/java/com/google/devtools/build/lib/concurrent",
"//src/main/java/com/google/devtools/build/lib/remote:ExecutionStatusException",
"//src/main/java/com/google/devtools/build/lib/remote/common",
"//src/main/java/com/google/devtools/build/lib/remote/options",
"//src/main/java/com/google/devtools/build/lib/vfs",
Expand All @@ -30,6 +31,10 @@ java_library(
"//third_party:jsr305",
"//third_party/grpc:grpc-jar",
"//third_party/protobuf:protobuf_java",
"//third_party/protobuf:protobuf_java_util",
"@googleapis//:google_rpc_code_java_proto",
"@googleapis//:google_rpc_error_details_java_proto",
"@googleapis//:google_rpc_status_java_proto",
"@remoteapis//:build_bazel_remote_execution_v2_remote_execution_java_grpc",
"@remoteapis//:build_bazel_remote_execution_v2_remote_execution_java_proto",
],
Expand Down
Loading