Skip to content

Commit

Permalink
Merge pull request #382 from jglick/selfNameVsLabel
Browse files Browse the repository at this point in the history
Parallel build can fail to resume when some nodes specify a label which is also a self-label of another node
  • Loading branch information
jglick authored Jul 5, 2024
2 parents d70641d + 5f1db60 commit 82d1345
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@
import hudson.slaves.WorkspaceList;
import java.io.IOException;
import java.io.Serializable;
import java.util.Arrays;
import java.util.concurrent.CancellationException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import jenkins.model.Jenkins;
import org.jenkinsci.plugins.workflow.FilePathUtils;
import org.jenkinsci.plugins.workflow.steps.DynamicContext;
Expand Down Expand Up @@ -106,6 +109,7 @@ void resume(StepContext context) throws Exception {
try {
exec = item.getFuture().getStartCondition().get(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS, TimeUnit.MILLISECONDS);
} catch (TimeoutException x) {
LOGGER.log(Level.FINE, x, () -> "failed to wait for " + item + "; outstanding queue items: " + Arrays.toString(Queue.getInstance().getItems()) + "; running executables: " + Stream.of(Jenkins.get().getComputers()).flatMap(c -> c.getExecutors().stream()).collect(Collectors.toList()));
listener.getLogger().println(node + " has been removed for " + Util.getTimeSpanString(ExecutorStepExecution.TIMEOUT_WAITING_FOR_NODE_MILLIS) + ", assuming it is not coming back");
throw new FlowInterruptedException(Result.ABORTED, /* TODO false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());

Check warning on line 114 in src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepDynamicContext.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: false probably more appropriate */true, new ExecutorStepExecution.RemovedNodeTimeoutCause());
} catch (CancellationException x) {
Expand All @@ -132,7 +136,7 @@ void resume(StepContext context) throws Exception {
_lease.release();
throw new IOException("JENKINS-37121: something already locked " + fp);
}
LOGGER.fine(() -> "fully restored for " + path + " on " + node);
LOGGER.fine(() -> "fully restored for " + path + " on " + node + " → " + computer.getName());
}

@Override public String toString() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import hudson.model.User;
import hudson.model.queue.CauseOfBlockage;
import hudson.model.queue.QueueListener;
import hudson.model.queue.QueueTaskDispatcher;
import hudson.model.queue.SubTask;
import hudson.remoting.ChannelClosedException;
import hudson.remoting.RequestAbortedException;
Expand Down Expand Up @@ -161,7 +162,7 @@ public void stop(@NonNull Throwable cause) throws Exception {
try (ACLContext as = ACL.as(ACL.SYSTEM)) {
items = Queue.getInstance().getItems();
}
LOGGER.log(FINE, "stopping one of {0}", Arrays.asList(items));
LOGGER.log(FINE, cause, () -> "stopping one of " + Arrays.asList(items));
StepContext context = getContext();
for (Queue.Item item : items) {
// if we are still in the queue waiting to be scheduled, just retract that
Expand Down Expand Up @@ -438,6 +439,8 @@ public static final class PlaceholderTask implements ContinuedTask, Serializable
private final StepContext context;
/** Initially set to {@link ExecutorStep#getLabel}, if any; later switched to actual self-label when block runs. */
private String label;
/** Set after {@link #label} is set to a self-label. */
private boolean labelIsSelfLabel;
/** Shortcut for {@link #run}. */
private final String runId;
/**
Expand Down Expand Up @@ -539,6 +542,27 @@ public boolean hasStarted() {
return j.getNode(label);
}

@Restricted(NoExternalUse.class)
@Extension public static final class EnforceSelfLabel extends QueueTaskDispatcher {
@Override public CauseOfBlockage canTake(Node node, Queue.BuildableItem item) {
if (item.task instanceof PlaceholderTask) {
PlaceholderTask t = (PlaceholderTask) item.task;
if (t.labelIsSelfLabel) {
String nodeName = node.getNodeName();
if (!nodeName.equals(t.label)) {
LOGGER.fine(() -> "Refusing to let " + item + " be run on " + node + " despite label match");
return new CauseOfBlockage() {
@Override public String getShortDescription() {
return "Must run on " + t.label + " not " + nodeName;

Check warning on line 556 in src/main/java/org/jenkinsci/plugins/workflow/support/steps/ExecutorStepExecution.java

View check run for this annotation

ci.jenkins.io / Code Coverage

Not covered line

Line 556 is not covered by tests
}
};
}
}
}
return null;
}
}

@Deprecated
@Override public boolean isBuildBlocked() {
return false;
Expand Down Expand Up @@ -681,8 +705,7 @@ public String getShortDescription() {
return r;
}

// TODO 2.389+ @Override
public @CheckForNull Queue.Executable getOwnerExecutable() {
@Override public @CheckForNull Queue.Executable getOwnerExecutable() {
Run<?, ?> r = runForDisplay();
return r instanceof Queue.Executable ? (Queue.Executable) r : null;
}
Expand Down Expand Up @@ -1034,6 +1057,7 @@ public final class PlaceholderExecutable implements ContinuableExecutable, Acces
cookie = UUID.randomUUID().toString();
// Switches the label to a self-label, so if the executable is killed and restarted, it will run on the same node:
label = computer.getName();
labelIsSelfLabel = true;

EnvVars env = computer.getEnvironment();
env.overrideExpandingAll(computer.buildEnvironment(listener));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package org.jenkinsci.plugins.workflow.support.steps;

import hudson.ExtensionList;
import hudson.FilePath;
import hudson.model.Computer;
import hudson.model.Descriptor;
import hudson.model.Item;
Expand All @@ -35,11 +36,14 @@
import hudson.model.TaskListener;
import hudson.model.queue.CauseOfBlockage;
import hudson.model.queue.QueueTaskDispatcher;
import hudson.remoting.Channel;
import hudson.slaves.AbstractCloudComputer;
import hudson.slaves.AbstractCloudSlave;
import hudson.slaves.Cloud;
import hudson.slaves.ComputerListener;
import hudson.slaves.NodeProvisioner;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.logging.Level;
Expand Down Expand Up @@ -185,4 +189,35 @@ private TestCloudSlave(String name) throws Exception {
}
}

@Test public void selfNameVsLabel() throws Throwable {
logging.recordPackage(ExecutorStepExecution.class, Level.FINE);
rr.then(r -> {
r.jenkins.setNumExecutors(0);
ExtensionList.lookupSingleton(DelayX.class).active = false;
r.createSlave("x", null, null);
r.createSlave("x2", "x", null);
WorkflowJob p = r.createProject(WorkflowJob.class, "p");
p.setDefinition(new CpsFlowDefinition("parallel b1: {node('x && !x2') {semaphore 'b1'}}, b2: {node('x2') {semaphore 'b2'}}", true));
WorkflowRun b = p.scheduleBuild2(0).waitForStart();
SemaphoreStep.waitForStart("b1/1", b);
SemaphoreStep.waitForStart("b2/1", b);
});
rr.then(r -> {
WorkflowJob p = r.jenkins.getItemByFullName("p", WorkflowJob.class);
WorkflowRun b = p.getBuildByNumber(1);
SemaphoreStep.success("b1/1", null);
SemaphoreStep.success("b2/1", null);
r.waitForCompletion(b);
r.assertBuildStatusSuccess(b);
});
}
@TestExtension("selfNameVsLabel") public static final class DelayX extends ComputerListener {
boolean active = true;
@Override public void preOnline(Computer c, Channel channel, FilePath root, TaskListener listener) throws IOException, InterruptedException {
if (c.getName().equals("x")) {
Thread.sleep(5_000);
}
}
}

}

0 comments on commit 82d1345

Please sign in to comment.