From cefa50f69e0b5a7a9052b1fbe1497b4c0696bb96 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 5 Nov 2024 18:57:39 -0500 Subject: [PATCH 1/4] Extending `AnomalousStatus` to also kill `sh` steps --- .../steps/durable_task/DurableTaskStep.java | 7 ++++--- .../support/steps/ExecutorStepDynamicContext.java | 2 +- .../support/steps/ExecutorStepExecution.java | 15 +++++++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java b/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java index 1a92ea37..6d3951f1 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java @@ -257,7 +257,8 @@ private static synchronized ScheduledExecutorService threadPool() { * {@link #ws} is nulled out and Jenkins waits until a fresh handle is available. */ @SuppressFBWarnings(value="SE_TRANSIENT_FIELD_NOT_RESTORED", justification="recurrencePeriod is set in onResume, not deserialization") - static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable { + @Restricted(NoExternalUse.class) + public static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable { private static final long MIN_RECURRENCE_PERIOD = 250; // ¼s private static final long MAX_RECURRENCE_PERIOD = 15000; // 15s @@ -282,7 +283,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab /** Serialized state of the controller. */ private Controller controller; /** {@link Node#getNodeName} of {@link #ws}. */ - private String node; + public String node; /** {@link FilePath#getRemote} of {@link #ws}. */ private String remote; /** Whether the entire stdout of the process is to become the return value of the step. */ @@ -366,7 +367,7 @@ static final class Execution extends AbstractStepExecutionImpl implements Runnab return null; } else { LOGGER.fine(() -> "rediscovering that " + node + " has been removed and timeout has expired"); - listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back"); + listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating shell step"); throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } } diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java index 7df05cd6..c05c900d 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java @@ -110,7 +110,7 @@ void resume(StepContext context) throws Exception { exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS); } catch (TimeoutException x) { LOGGER.log(Level.FINE, x, () -> "failed to wait for " + item + "; outstanding queue items: " + Arrays.toString(Queue.getInstance().getItems()) + "; running executables: " + Stream.of(Jenkins.get().getComputers()).flatMap(c -> c.getExecutors().stream()).collect(Collectors.toList())); - listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back"); + listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating node step"); throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause()); } catch (CancellationException x) { LOGGER.log(Level.FINE, "ceased to wait for " + node, x); diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index e4172ed2..78af4eb8 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -90,6 +90,7 @@ import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException; import org.jenkinsci.plugins.workflow.steps.StepContext; import org.jenkinsci.plugins.workflow.steps.StepExecution; +import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep; import org.jenkinsci.plugins.workflow.steps.durable_task.Messages; import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl; import org.jenkinsci.plugins.workflow.support.concurrent.Timeout; @@ -322,6 +323,20 @@ public void stop(@NonNull Throwable cause) throws Exception { LOGGER.log(Level.WARNING, null, x); } ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled())); + // Also abort any shell steps running on the same node: + if (exec.state != null) { + try { + for (var exec2 : ctx.get(FlowExecution.class).getCurrentExecutions(true).get()) { + if (exec2 instanceof DurableTaskStep.Execution) { + if (exec.state.node.equals(((DurableTaskStep.Execution) exec2).node)) { + exec2.getContext().onFailure(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause())); + } + } + } + } catch (Exception x) { + LOGGER.log(Level.WARNING, null, x); + } + } } else { newAnomalous.add(ctx); } From aea9bf04f1e41c0ed89685da6bd09c7b4f598788 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 6 Nov 2024 13:53:42 -0500 Subject: [PATCH 2/4] Trying a different approach given https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/405#discussion_r1831538053 --- .../support/steps/ExecutorStepExecution.java | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index 78af4eb8..35541973 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -312,6 +312,7 @@ public void stop(@NonNull Throwable cause) throws Exception { } } Set newAnomalous = new HashSet<>(); + Set affectedNodes = new HashSet<>(); StepExecution.applyAll(ExecutorStepExecution.class, exec -> { StepContext ctx = exec.getContext(); if (!knownTasks.contains(ctx)) { @@ -323,19 +324,8 @@ public void stop(@NonNull Throwable cause) throws Exception { LOGGER.log(Level.WARNING, null, x); } ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled())); - // Also abort any shell steps running on the same node: if (exec.state != null) { - try { - for (var exec2 : ctx.get(FlowExecution.class).getCurrentExecutions(true).get()) { - if (exec2 instanceof DurableTaskStep.Execution) { - if (exec.state.node.equals(((DurableTaskStep.Execution) exec2).node)) { - exec2.getContext().onFailure(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause())); - } - } - } - } catch (Exception x) { - LOGGER.log(Level.WARNING, null, x); - } + affectedNodes.add(exec.state.node); } } else { newAnomalous.add(ctx); @@ -345,6 +335,21 @@ public void stop(@NonNull Throwable cause) throws Exception { } return null; }).get(); + // Also abort any shell steps running on the same node(s): + if (!affectedNodes.isEmpty()) { + StepExecution.applyAll(DurableTaskStep.Execution.class, exec -> { + if (affectedNodes.contains(exec.node)) { + StepContext ctx = exec.getContext(); + try { + ctx.get(TaskListener.class).error("also cancelling shell steps running on " + exec.node); + } catch (IOException | InterruptedException x) { + LOGGER.log(Level.WARNING, null, x); + } + ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause())); + } + return null; + }); + } for (StepContext ctx : newAnomalous) { ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; will cancel if this condition persists"); } From 591f2899954f0f7f4fac2c833db62350bb112b5c Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 6 Nov 2024 13:57:08 -0500 Subject: [PATCH 3/4] Test fixes --- .../workflow/support/steps/ExecutorStepDynamicContextTest.java | 2 +- .../plugins/workflow/support/steps/ExecutorStepTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java index 0b05323b..5ecf1d51 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContextTest.java @@ -159,7 +159,7 @@ private void commonSetup() { j.assertLogNotContains("assuming it is not coming back", b); j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b)); for (int i = 0; i < 5; i++) { - j.assertLogContains("slave" + i + " has been removed for 15 sec, assuming it is not coming back", b); + j.assertLogContains("slave" + i + " has been removed for 15 sec; assuming it is not coming back, and terminating node step", b); } assertThat(logging.getRecords().stream().filter(r -> r.getLevel().intValue() >= Level.WARNING.intValue()).toArray(), emptyArray()); }); diff --git a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java index 56b210ad..6db7b30e 100644 --- a/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java +++ b/src/test/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepTest.java @@ -531,7 +531,7 @@ private static void assertLogMatches(WorkflowRun build, String regexp) throws IO SemaphoreStep.success("wait/1", null); r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); assertEquals(Collections.emptyList(), Arrays.asList(Queue.getInstance().getItems())); - r.assertLogContains("dumbo has been removed for 15 sec, assuming it is not coming back", b); + r.assertLogContains("dumbo has been removed for 15 sec; assuming it is not coming back, and terminating node step", b); }); } From 4043ebfb57520bf9a257bbca47355d86bb76ae2b Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 6 Nov 2024 14:48:16 -0500 Subject: [PATCH 4/4] Make `AnomalousStatus` run more frequently --- .../plugins/workflow/support/steps/ExecutorStepExecution.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java index 35541973..5787ab57 100644 --- a/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java +++ b/src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java @@ -277,12 +277,12 @@ public void stop(@NonNull Throwable cause) throws Exception { @Extension public static final class AnomalousStatus extends PeriodicWork { @Override public long getRecurrencePeriod() { - return Duration.ofMinutes(30).toMillis(); + return Duration.ofMinutes(5).toMillis(); } @Override public long getInitialDelay() { // Do not run too soon after startup, in case things are still loading, agents are still reattaching, etc. - return Duration.ofMinutes(15).toMillis(); + return Duration.ofMinutes(7).toMillis(); } /**