diff --git a/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java b/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java index 1a92ea37..6d3951f1 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java @@ -257,7 +257,8 @@ private static synchronized ScheduledExecutorService threadPool() { * {@link #ws} is nulled out and Jenkins waits until a fresh handle is available. */ @SuppressFBWarnings(value="SE_TRANSIENT_FIELD_NOT_RESTORED", justification="recurrencePeriod is set in onResume, not deserialization") - static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable { + @Restricted(NoExternalUse.class) + public static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable { private static final long MIN_RECURRENCE_PERIOD = 250; // ¼s private static final long MAX_RECURRENCE_PERIOD = 15000; // 15s @@ -282,7 +283,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab /** Serialized state of the controller. */ private Controller controller; /** {@link Node#getNodeName} of {@link #ws}. */ - private String node; + public String node; /** {@link FilePath#getRemote} of {@link #ws}. */ private String remote; /** Whether the entire stdout of the process is to become the return value of the step. */ @@ -366,7 +367,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab return null; } else { LOGGER.fine(() -> "rediscovering that " + node + " has been removed and timeout has expired"); - listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back"); + listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating shell step"); throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java index 7df05cd6..c05c900d 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java @@ -110,7 +110,7 @@ void resume(StepContext context) throws Exception { exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); } catch (TimeoutException x) { LOGGER.log(Level.FINE, x, () -> "failed to wait for " + item + "; outstanding queue items: " + Arrays.toString(Queue.getInstance().getItems()) + "; running executables: " + Stream.of(Jenkins.get().getComputers()).flatMap(c -> c.getExecutors().stream()).collect(Collectors.toList())); - listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back"); + listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating node step"); throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } catch (CancellationException x) { LOGGER.log(Level.FINE, "ceased to wait for " + node, x); diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index e4172ed2..5787ab57 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -90,6 +90,7 @@ import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException; import org.jenkinsci.plugins.workflow.steps.StepContext; import org.jenkinsci.plugins.workflow.steps.StepExecution; +import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep; import org.jenkinsci.plugins.workflow.steps.durable_task.Messages; import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl; import org.jenkinsci.plugins.workflow.support.concurrent.Timeout; @@ -276,12 +277,12 @@ public void stop(@NonNull Throwable cause) throws Exception { @Extension public static final class AnomalousStatus extends PeriodicWork { @Override public long getRecurrencePeriod() { - return Duration.ofMinutes(30).toMillis(); + return Duration.ofMinutes(5).toMillis(); } @Override public long getInitialDelay() { // Do not run too soon after startup, in case things are still loading, agents are still reattaching, etc. - return Duration.ofMinutes(15).toMillis(); + return Duration.ofMinutes(7).toMillis(); } /** @@ -311,6 +312,7 @@ public void stop(@NonNull Throwable cause) throws Exception { } } Set newAnomalous = new HashSet<>(); + Set affectedNodes = new HashSet<>(); StepExecution.applyAll(ExecutorStepExecution.class, exec -> { StepContext ctx = exec.getContext(); if (!knownTasks.contains(ctx)) { @@ -322,6 +324,9 @@ public void stop(@NonNull Throwable cause) throws Exception { LOGGER.log(Level.WARNING, null, x); } ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled())); + if (exec.state != null) { + affectedNodes.add(exec.state.node); + } } else { newAnomalous.add(ctx); } @@ -330,6 +335,21 @@ public void stop(@NonNull Throwable cause) throws Exception { } return null; }).get(); + // Also abort any shell steps running on the same node(s): + if (!affectedNodes.isEmpty()) { + StepExecution.applyAll(DurableTaskStep.Execution.class, exec -> { + if (affectedNodes.contains(exec.node)) { + StepContext ctx = exec.getContext(); + try { + ctx.get(TaskListener.class).error("also cancelling shell steps running on " + exec.node); + } catch (IOException | InterruptedException x) { + LOGGER.log(Level.WARNING, null, x); + } + ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause())); + } + return null; + }); + } for (StepContext ctx : newAnomalous) { ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; will cancel if this condition persists"); } diff --git a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java index 0b05323b..5ecf1d51 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java @@ -159,7 +159,7 @@ private void commonSetup() { j.assertLogNotContains("assuming it is not coming back", b); j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b)); for (int i = 0; i < 5; i++) { - j.assertLogContains("slave" + i + " has been removed for 15 sec, assuming it is not coming back", b); + j.assertLogContains("slave" + i + " has been removed for 15 sec; assuming it is not coming back, and terminating node step", b); } assertThat(logging.getRecords().stream().filter(r -> r.getLevel().intValue() >= Level.WARNING.intValue()).toArray(), emptyArray()); }); diff --git a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java index 56b210ad..6db7b30e 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java @@ -531,7 +531,7 @@ private static void assertLogMatches(WorkflowRun build, String regexp) throws IO SemaphoreStep.success("wait/1", null); r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); assertEquals(Collections.emptyList(), Arrays.asList(Queue.getInstance().getItems())); - r.assertLogContains("dumbo has been removed for 15 sec, assuming it is not coming back", b); + r.assertLogContains("dumbo has been removed for 15 sec; assuming it is not coming back, and terminating node step", b); }); }