Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extending AnomalousStatus to also kill sh steps #405

Merged
merged 4 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,8 @@
* {@link #ws} is nulled out and Jenkins waits until a fresh handle is available.
*/
@SuppressFBWarnings(value="SE_TRANSIENT_FIELD_NOT_RESTORED", justification="recurrencePeriod is set in onResume, not deserialization")
static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable {
@Restricted(NoExternalUse.class)
public static final class Execution extends AbstractStepExecutionImpl implements Runnable, ExecutionRemotable {

private static final long MIN_RECURRENCE_PERIOD = 250; // ¼s
private static final long MAX_RECURRENCE_PERIOD = 15000; // 15s
Expand All @@ -282,7 +283,7 @@
/** Serialized state of the controller. */
private Controller controller;
/** {@link Node#getNodeName} of {@link #ws}. */
private String node;
public String node;
/** {@link FilePath#getRemote} of {@link #ws}. */
private String remote;
/** Whether the entire stdout of the process is to become the return value of the step. */
Expand Down Expand Up @@ -366,8 +367,8 @@
return null;
} else {
LOGGER.fine(() -> "rediscovering that " + node + " has been removed and timeout has expired");
listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
listener().getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating shell step");

Check warning on line 370 in src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered line

Line 370 is not covered by tests
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());

Check warning on line 371 in src/main/java/org/jenkinsci/plugins/workflow/steps/durable_task/DurableTaskStep.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
}
}
removedNodeDiscovered = 0; // something else; reset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void resume(StepContext context) throws Exception {
exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
} catch (TimeoutException x) {
LOGGER.log(Level.FINE, x, () -> "failed to wait for " + item + "; outstanding queue items: " + Arrays.toString(Queue.getInstance().getItems()) + "; running executables: " + Stream.of(Jenkins.get().getComputers()).flatMap(c -> c.getExecutors().stream()).collect(Collectors.toList()));
listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + "; assuming it is not coming back, and terminating node step");
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
} catch (CancellationException x) {
LOGGER.log(Level.FINE, "ceased to wait for " + node, x);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@
import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.steps.StepExecution;
import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep;
import org.jenkinsci.plugins.workflow.steps.durable_task.Messages;
import org.jenkinsci.plugins.workflow.support.actions.WorkspaceActionImpl;
import org.jenkinsci.plugins.workflow.support.concurrent.Timeout;
Expand Down Expand Up @@ -276,12 +277,12 @@
@Extension public static final class AnomalousStatus extends PeriodicWork {

@Override public long getRecurrencePeriod() {
return Duration.ofMinutes(30).toMillis();
return Duration.ofMinutes(5).toMillis();
}

@Override public long getInitialDelay() {
// Do not run too soon after startup, in case things are still loading, agents are still reattaching, etc.
return Duration.ofMinutes(15).toMillis();
return Duration.ofMinutes(7).toMillis();
}

/**
Expand Down Expand Up @@ -311,25 +312,44 @@
}
}
Set<StepContext> newAnomalous = new HashSet<>();
Set<String> affectedNodes = new HashSet<>();
StepExecution.applyAll(ExecutorStepExecution.class, exec -> {
StepContext ctx = exec.getContext();
if (!knownTasks.contains(ctx)) {
LOGGER.warning(() -> "do not know about " + ctx);
if (anomalous.contains(ctx)) {
try {
ctx.get(TaskListener.class).error("node block still appears to be neither running nor scheduled; cancelling");
} catch (IOException | InterruptedException x) {
LOGGER.log(Level.WARNING, null, x);
}
ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new QueueTaskCancelled()));
if (exec.state != null) {
affectedNodes.add(exec.state.node);
}
} else {
newAnomalous.add(ctx);
}
} else {
LOGGER.fine(() -> "know about " + ctx);
}
return null;
}).get();
// Also abort any shell steps running on the same node(s):
if (!affectedNodes.isEmpty()) {
StepExecution.applyAll(DurableTaskStep.Execution.class, exec -> {
if (affectedNodes.contains(exec.node)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there cases (e.g. nodes with multiple executors?) where this could abort steps which are running fine if another step on the same node is having unusual problems? If so, could we check exec.state.cookie or something more precise than just the node name instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory perhaps, but this monitor is normally used for cloud nodes with one executor, and it seems unlikely that the agent could be connected and functional on one executor and node block while broken in another one of the same build. (For that matter, it would rarely make any sense to run two concurrent node blocks in the same build on the same agent.)

StepContext ctx = exec.getContext();
try {
ctx.get(TaskListener.class).error("also cancelling shell steps running on " + exec.node);
} catch (IOException | InterruptedException x) {
LOGGER.log(Level.WARNING, null, x);
}
ctx.onFailure(new FlowInterruptedException(Result.ABORTED, false, new RemovedNodeCause()));
}
return null;

Check warning on line 350 in src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered lines

Lines 315-350 are not covered by tests
});
}
for (StepContext ctx : newAnomalous) {
ctx.get(TaskListener.class).error("node block appears to be neither running nor scheduled; will cancel if this condition persists");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ private void commonSetup() {
j.assertLogNotContains("assuming it is not coming back", b);
j.assertBuildStatus(Result.ABORTED, j.waitForCompletion(b));
for (int i = 0; i < 5; i++) {
j.assertLogContains("slave" + i + " has been removed for 15 sec, assuming it is not coming back", b);
j.assertLogContains("slave" + i + " has been removed for 15 sec; assuming it is not coming back, and terminating node step", b);
}
assertThat(logging.getRecords().stream().filter(r -> r.getLevel().intValue() >= Level.WARNING.intValue()).toArray(), emptyArray());
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ private static void assertLogMatches(WorkflowRun build, String regexp) throws IO
SemaphoreStep.success("wait/1", null);
r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b));
assertEquals(Collections.emptyList(), Arrays.asList(Queue.getInstance().getItems()));
r.assertLogContains("dumbo has been removed for 15 sec, assuming it is not coming back", b);
r.assertLogContains("dumbo has been removed for 15 sec; assuming it is not coming back, and terminating node step", b);
});
}

Expand Down