Skip to content

Commit

Permalink
Don't rewind nodes that are done with an error.
Browse files Browse the repository at this point in the history
We don't intentionally attempt to rewind nodes in error, but it can happen when two parents observe the same lost input and there is a race to rewind the node. After the first parent rewinds it, the node completes with an error (i.e. because the action is flaky), then the losing parent arrives. Rewinding the errorful node again is problematic in a `--nokeep_going` build because error bubbling expects the errorful node to be done.

A regression test is added to `MemoizingEvaluatorTest`.

PiperOrigin-RevId: 589132267
Change-Id: Idc9c30c2fea34d02d31fa20a5f4855cb1300c1ba
  • Loading branch information
justinhorvitz authored and copybara-github committed Dec 8, 2023
1 parent e560c49 commit f6746f5
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,9 @@ public synchronized MarkedDirtyResult markDirty(DirtyType dirtyType) {
checkNotNull(dirtyType, this);

if (isDone()) {
if (dirtyType == DirtyType.REWIND && getErrorInfo() != null) {
return null; // Rewinding errors is no-op.
}
GroupedDeps directDeps = GroupedDeps.decompress(getCompressedDirectDepsForDoneEntry());
checkState(
dirtyType != DirtyType.DIRTY || !directDeps.isEmpty(),
Expand Down
10 changes: 7 additions & 3 deletions src/main/java/com/google/devtools/build/skyframe/NodeEntry.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ enum DirtyType {
* <p>A node dirtied with {@code REWIND} is re-evaluated during the evaluation phase if it is
* requested, regardless of the state of its dependencies. Even if it re-evaluates to the same
* value, dirty parents are re-evaluated.
*
* <p>Rewinding is tolerated but no-op if the node is already dirty or is done with an
* {@linkplain #getErrorInfo() error} (regardless of the error's {@link
* com.google.devtools.build.skyframe.SkyFunctionException.Transience}).
*/
REWIND
}
Expand Down Expand Up @@ -155,10 +159,10 @@ enum DirtyType {
* !P.isChanged()}. Otherwise, this will throw {@link IllegalStateException}.
*
* <p>{@code markDirty(DirtyType.REWIND)} may be called at any time (even multiple times
* concurrently), although it only has an effect if the node {@link #isDone}.
* concurrently), although it only has an effect if the node {@link #isDone} with no error.
*
* @return if the node was done, a {@link MarkedDirtyResult} which may include the node's reverse
* deps; otherwise {@code null}
* @return if the node transitioned from done to dirty as a result of this call, a {@link
* MarkedDirtyResult} which may include the node's reverse deps; otherwise {@code null}
*/
@Nullable
@ThreadSafe
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,9 @@ public final synchronized MarkedDirtyResult markDirty(DirtyType dirtyType) {
if (!isDone()) {
return null; // Tolerate concurrent requests to rewind.
}
if (getErrorInfo() != null) {
return null; // Rewinding errors is no-op.
}
dirtyBuildingState = new RewoundNonIncrementalBuildingState(value);
value = null;
return MarkedDirtyResult.forRewinding();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ public boolean supportsPartialReevaluation() {
};
private static final NestedSet<Reportable> NO_EVENTS =
NestedSetBuilder.emptySet(Order.STABLE_ORDER);

@TestParameter boolean isPartialReevaluation;
protected final V initialVersion = getInitialVersion();

Expand Down Expand Up @@ -566,6 +567,27 @@ public void concurrentRewindingAllowed() throws InterruptedException {
assertThat(entry.getLifecycleState()).isEqualTo(LifecycleState.NEEDS_REBUILDING);
}

@Test
public void rewindErrorfulNode_toleratedButNoOp(@TestParameter Transience transience)
throws InterruptedException {
InMemoryNodeEntry entry = createEntry();
entry.addReverseDepAndCheckIfDone(null); // Start evaluation.
entry.markRebuilding();

ReifiedSkyFunctionException exception =
new ReifiedSkyFunctionException(
new GenericFunctionException(new SomeErrorException("oops"), transience));
ErrorInfo errorInfo = ErrorInfo.fromException(exception, transience == Transience.TRANSIENT);
assertThat(setValue(entry, /* value= */ null, errorInfo, initialVersion)).isEmpty();

assertThat(entry.markDirty(DirtyType.REWIND)).isNull();
assertThat(entry.isDone()).isTrue();
assertThat(entry.getLifecycleState()).isEqualTo(LifecycleState.DONE);
assertThat(entry.getValue()).isNull();
assertThat(entry.toValue()).isNull();
assertThat(entry.getErrorInfo()).isEqualTo(errorInfo);
}

@CanIgnoreReturnValue
static Set<SkyKey> setValue(
NodeEntry entry, SkyValue value, @Nullable ErrorInfo errorInfo, Version graphVersion)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2690,7 +2690,7 @@ public SkyValue compute(SkyKey skyKey, Environment env) throws InterruptedExcept

assertThatEvaluationResult(result).hasEntryThat(dep).isEqualTo(new StringValue("depVal"));
assertThat(inconsistencyReceiver).isEmpty();
assertThatEvaluationResult(result).hasEntryThat(top).isNull();
assertThat(tester.evaluator.getValues()).doesNotContainKey(top);
assertThatEvaluationResult(result).hasReverseDepsInGraphThat(dep).isEmpty();
}

Expand Down Expand Up @@ -2822,14 +2822,13 @@ public void twoParentsRewindSameDep_markedDirtyTwice() throws Exception {
tester.getOrCreate(dep).setBuilder(rewindableFunction);
injectGraphListenerForTesting(
(key, type, order, context) -> {
if (type == EventType.MARK_DIRTY && order == Order.BEFORE) {
if (!isFirstParent.getAndSet(false)) {
// Lost the race. Wait for the first parent to finish so we rewind again. We could
// just wait for dep to finish, but then we might mark it dirty before the first
// parent uses it, which would lead to flaky BUILDING_PARENT_FOUND_UNDONE_CHILD
// inconsistencies.
Uninterruptibles.awaitUninterruptibly(firstParentDone);
}
if (type == EventType.MARK_DIRTY
&& order == Order.BEFORE
&& !isFirstParent.getAndSet(false)) {
// Lost the race. Wait for the first parent to finish so we rewind again. We could just
// wait for dep to finish, but then we might mark it dirty before the first parent uses
// it, which would lead to flaky BUILDING_PARENT_FOUND_UNDONE_CHILD inconsistencies.
Uninterruptibles.awaitUninterruptibly(firstParentDone);
}
},
/* deterministic= */ false);
Expand Down Expand Up @@ -2873,6 +2872,98 @@ public void twoParentsRewindSameDep_markedDirtyTwice() throws Exception {
assertThatEvaluationResult(result).hasReverseDepsInGraphThat(dep).containsExactly(top1, top2);
}

/**
* Regression test for b/315301248.
*
* <p>In a {@code --nokeep_going} build, multiple parents attempt to rewind the same node
* concurrently. One successfully dirties the node, which then completes with an error before the
* second parent attempts to dirty the node again. If the second rewinding attempt actually
* transitions the node from done (in error) to dirty, we would crash during error bubbling, which
* reasonably expects the errorful node to be done.
*
* <p>The solution is to ignore rewinding attempts on errorful nodes.
*/
@Test
public void twoParentsRewindSameDep_depEvaluatesToErrorAfterRewind() throws Exception {
assume().that(resetSupported()).isTrue();

var inconsistencyReceiver = recordInconsistencies();
var rewindableErrorFunction =
new SkyFunction() {
private int calls = 0;

@Nullable
@Override
public SkyValue compute(SkyKey skyKey, Environment env) throws SkyFunctionException {
if (++calls == 1) {
return RewindableFunction.STALE_VALUE;
}
throw new GenericFunctionException(new SomeErrorException("error"));
}
};
CyclicBarrier parentBarrier = new CyclicBarrier(2);
AtomicBoolean isFirstParent = new AtomicBoolean(true);
CountDownLatch depErrorSet = new CountDownLatch(1);
SkyKey top1 = skyKey("top1");
SkyKey top2 = skyKey("top2");
SkyKey dep = skyKey("dep");

tester.getOrCreate(dep).setBuilder(rewindableErrorFunction);
injectGraphListenerForTesting(
(key, type, order, context) -> {
if (type == EventType.MARK_DIRTY
&& order == Order.BEFORE
&& !isFirstParent.getAndSet(false)) {
// Lost the race. Wait for dep have its error set so that we attempt to rewind a done
// node in error.
Uninterruptibles.awaitUninterruptibly(depErrorSet);
} else if (key.equals(dep)
&& type == EventType.SET_VALUE
&& order == Order.AFTER
&& ValueWithMetadata.getMaybeErrorInfo((SkyValue) context) != null) {
depErrorSet.countDown();
}
},
/* deterministic= */ false);
SkyFunction parentFunction =
(skyKey, env) -> {
var depValue = (StringValue) env.getValue(dep);
if (depValue == null) {
return null;
}
assertThat(depValue).isEqualTo(RewindableFunction.STALE_VALUE);
awaitUnchecked(parentBarrier);
var rewindGraph = Reset.newRewindGraphFor(skyKey);
rewindGraph.putEdge(skyKey, dep);
return Reset.of(rewindGraph);
};
tester.getOrCreate(top1).setBuilder(parentFunction);
tester.getOrCreate(top2).setBuilder(parentFunction);

var result = tester.eval(/* keepGoing= */ false, top1, top2);

assertThatEvaluationResult(result).hasError();
assertThat(result.errorMap().keySet()).containsAnyOf(top1, top2);
assertThat(inconsistencyReceiver)
.containsExactly(
InconsistencyData.resetRequested(top1),
InconsistencyData.rewind(top1, ImmutableSet.of(dep)),
InconsistencyData.resetRequested(top2),
InconsistencyData.rewind(top2, ImmutableSet.of(dep)));

if (!incrementalitySupported()) {
return; // Skip assertions on dependency edges when they aren't kept.
}

result = tester.eval(/* keepGoing= */ false, dep);

// The parents never completed, so an incremental build deletes them. Check that they are no
// longer in the graph and that rdeps are removed from dep.
assertThatEvaluationResult(result).hasSingletonErrorThat(dep).isNotNull();
assertThat(tester.evaluator.getValues().keySet()).containsNoneOf(top1, top2);
assertThatEvaluationResult(result).hasReverseDepsInGraphThat(dep).isEmpty();
}

private static void awaitUnchecked(CyclicBarrier barrier) {
try {
barrier.await();
Expand Down Expand Up @@ -3128,7 +3219,7 @@ private static final class RewindableFunction implements SkyFunction {
static final StringValue STALE_VALUE = new StringValue("stale");
static final StringValue FRESH_VALUE = new StringValue("fresh");

private int calls;
private int calls = 0;

@Override
public SkyValue compute(SkyKey skyKey, Environment env) {
Expand Down Expand Up @@ -5623,22 +5714,22 @@ public SkyValue compute(SkyKey skyKey, Environment env)

/** Data encapsulating a graph inconsistency found during evaluation. */
@AutoValue
public abstract static class InconsistencyData {
public abstract SkyKey key();
abstract static class InconsistencyData {
abstract SkyKey key();

public abstract ImmutableSet<SkyKey> otherKeys();
abstract ImmutableSet<SkyKey> otherKeys();

public abstract Inconsistency inconsistency();
abstract Inconsistency inconsistency();

public static InconsistencyData resetRequested(SkyKey key) {
static InconsistencyData resetRequested(SkyKey key) {
return create(key, /* otherKeys= */ null, Inconsistency.RESET_REQUESTED);
}

static InconsistencyData rewind(SkyKey parent, ImmutableSet<SkyKey> children) {
return create(parent, children, Inconsistency.PARENT_FORCE_REBUILD_OF_CHILD);
}

public static InconsistencyData create(
static InconsistencyData create(
SkyKey key, @Nullable Collection<SkyKey> otherKeys, Inconsistency inconsistency) {
return new AutoValue_MemoizingEvaluatorTest_InconsistencyData(
key,
Expand Down

0 comments on commit f6746f5

Please sign in to comment.