Skip to content

Commit

Permalink
Merge pull request #405 from jglick/AnomalousStatus
Browse files Browse the repository at this point in the history
Extending `AnomalousStatus` to also kill `sh` steps
  • Loading branch information
jglick authored Nov 7, 2024
2 parents 9dc4dc1 + 4043ebf commit 6a3e903
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,8 @@ private static synchronized ScheduledExecutorService threadPool() {
* {@link #ws} is nulled out and Jenkins waits until a fresh handle is available.
*/
@SuppressFBWarnings(value="SE_TRANSIENT_FIELD_NOT_RESTORED", justification="recurrencePeriod is set in onResume, not deserialization")
static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable {
@Restricted(NoExternalUse.class)
public static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable {

private static final long MIN_RECURRENCE_PERIOD = 250; // ¼s
private static final long MAX_RECURRENCE_PERIOD = 15000; // 15s
Expand All @@ -282,7 +283,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
/** Serialized state of the controller. */
private Controller controller;
/** {@link Node#getNodeName} of {@link #ws}. */
private String node;
public String node;
/** {@link FilePath#getRemote} of {@link #ws}. */
private String remote;
/** Whether the entire stdout of the process is to become the return value of the step. */
Expand Down Expand Up @@ -366,7 +367,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab
return null;
} else {
LOGGER.fine(() -> "rediscovering that " + node + " has been removed and timeout has expired");
listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating shell step");

Check warning on line 370 in src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered line

Line 370 is not covered by tests
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());

Check warning on line 371 in src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void resume(StepContext context) throws Exception {
exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
} catch (TimeoutException x) {
LOGGER.log(Level.FINE, x, () -> "failed to wait for " + item + "; outstanding queue items: " + Arrays.toString(Queue.getInstance().getItems()) + "; running executables: " + Stream.of(Jenkins.get().getComputers()).flatMap(c -> c.getExecutors().stream()).collect(Collectors.toList()));
listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating node step");
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
} catch (CancellationException x) {
LOGGER.log(Level.FINE, "ceased to wait for " + node, x);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepExecution;
import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep;
import org.jenkinsci.plugins.workflow.steps.durable_task.Messages;
import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl;
import org.jenkinsci.plugins.workflow.support.concurrent.Timeout;
Expand Down Expand Up @@ -276,12 +277,12 @@ public void stop(@NonNull Throwable cause) throws Exception {
@Extension public static final class AnomalousStatus extends PeriodicWork {

@Override public long getRecurrencePeriod() {
return Duration.ofMinutes(30).toMillis();
return Duration.ofMinutes(5).toMillis();
}

@Override public long getInitialDelay() {
// Do not run too soon after startup, in case things are still loading, agents are still reattaching, etc.
return Duration.ofMinutes(15).toMillis();
return Duration.ofMinutes(7).toMillis();
}

/**
Expand Down Expand Up @@ -311,6 +312,7 @@ public void stop(@NonNull Throwable cause) throws Exception {
}
}
Set<StepContext> newAnomalous = new HashSet<>();
Set<String> affectedNodes = new HashSet<>();
StepExecution.applyAll(ExecutorStepExecution.class, exec -> {
StepContext ctx = exec.getContext();
if (!knownTasks.contains(ctx)) {
Expand All @@ -322,6 +324,9 @@ public void stop(@NonNull Throwable cause) throws Exception {
LOGGER.log(Level.WARNING, null, x);
}
ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled()));
if (exec.state != null) {
affectedNodes.add(exec.state.node);
}
} else {
newAnomalous.add(ctx);
}
Expand All @@ -330,6 +335,21 @@ public void stop(@NonNull Throwable cause) throws Exception {
}
return null;
}).get();
// Also abort any shell steps running on the same node(s):
if (!affectedNodes.isEmpty()) {
StepExecution.applyAll(DurableTaskStep.Execution.class, exec -> {
if (affectedNodes.contains(exec.node)) {
StepContext ctx = exec.getContext();
try {
ctx.get(TaskListener.class).error("also cancelling shell steps running on " + exec.node);
} catch (IOException | InterruptedException x) {
LOGGER.log(Level.WARNING, null, x);
}
ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause()));
}
return null;

Check warning on line 350 in src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 315-350 are not covered by tests
});
}
for (StepContext ctx : newAnomalous) {
ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; will cancel if this condition persists");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ private void commonSetup() {
j.assertLogNotContains("assuming it is not coming back", b);
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b));
for (int i = 0; i < 5; i++) {
j.assertLogContains("slave" + i + " has been removed for 15 sec, assuming it is not coming back", b);
j.assertLogContains("slave" + i + " has been removed for 15 sec; assuming it is not coming back, and terminating node step", b);
}
assertThat(logging.getRecords().stream().filter(r -> r.getLevel().intValue() >= Level.WARNING.intValue()).toArray(), emptyArray());
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ private static void assertLogMatches(WorkflowRun build, String regexp) throws IO
SemaphoreStep.success("wait/1", null);
r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b));
assertEquals(Collections.emptyList(), Arrays.asList(Queue.getInstance().getItems()));
r.assertLogContains("dumbo has been removed for 15 sec, assuming it is not coming back", b);
r.assertLogContains("dumbo has been removed for 15 sec; assuming it is not coming back, and terminating node step", b);
});
}

Expand Down

0 comments on commit 6a3e903

Please sign in to comment.