Skip to content

Commit

Permalink
[JENKINS-69534] Make computer creation thread safe (#7087)
Browse files Browse the repository at this point in the history
This change fixes a race condition that caused file descriptor leaks
when cloud agents were created.

The Jenkins.computers map is changed from a CopyOnWriteMap to a
ConcurrentHashMap to simplify concurrent creation of computers. This
allows us to make AbstractCIBase's createNewComputerForNode and
updateNewComputer methods thread safe and simplify removeComputer.
  • Loading branch information
jonaslind authored Sep 15, 2022
1 parent 98e499d commit 5414938
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 24 deletions.
45 changes: 23 additions & 22 deletions core/src/main/java/hudson/model/AbstractCIBase.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
Expand Down Expand Up @@ -132,7 +134,7 @@ public void setDisabledAdministrativeMonitors(Set<String> disabledAdministrative

public abstract Queue getQueue();

protected abstract Map<Node, Computer> getComputerMap();
protected abstract ConcurrentMap<Node, Computer> getComputerMap();

/* =================================================================================================================
* Computer API uses package protection heavily
Expand All @@ -158,21 +160,31 @@ private void updateComputer(Node n, Map<String, Computer> byNameMap, Set<Compute
@CheckForNull
private Computer createNewComputerForNode(Node n, boolean automaticAgentLaunch) {
Computer c = null;
Map<Node, Computer> computers = getComputerMap();
ConcurrentMap<Node, Computer> computers = getComputerMap();
// we always need Computer for the built-in node as a fallback in case there's no other Computer.
if (n.getNumExecutors() > 0 || n == Jenkins.get()) {
// The start/connect of a new computer is potentially costly, so we don't want to perform it inside
// computeIfAbsent. Instead, we use this creationWasAttempted flag to signal if start/connect is needed or
// not.
final AtomicBoolean creationWasAttempted = new AtomicBoolean(false);
try {
c = n.createComputer();
c = computers.computeIfAbsent(n, node -> {
creationWasAttempted.set(true);
return node.createComputer();
});
} catch (RuntimeException ex) { // Just in case there is a bogus extension
LOGGER.log(Level.WARNING, "Error retrieving computer for node " + n.getNodeName() + ", continuing", ex);
}
if (!creationWasAttempted.get()) {
LOGGER.log(Level.FINE, "Node {0} is not a new node skipping", n.getNodeName());
return null;
}
if (c == null) {
LOGGER.log(Level.WARNING, "Cannot create computer for node {0}, the {1}#createComputer() method returned null. Skipping this node",
new Object[]{n.getNodeName(), n.getClass().getName()});
return null;
}

computers.put(n, c);
if (!n.isHoldOffLaunchUntilSave() && automaticAgentLaunch) {
RetentionStrategy retentionStrategy = c.getRetentionStrategy();
if (retentionStrategy != null) {
Expand All @@ -192,34 +204,23 @@ private Computer createNewComputerForNode(Node n, boolean automaticAgentLaunch)
}

/*package*/ void removeComputer(final Computer computer) {
Queue.withLock(new Runnable() {
@Override
public void run() {
Map<Node, Computer> computers = getComputerMap();
for (Map.Entry<Node, Computer> e : computers.entrySet()) {
if (e.getValue() == computer) {
computers.remove(e.getKey());
computer.onRemoved();
return;
}
}
ConcurrentMap<Node, Computer> computers = getComputerMap();
Queue.withLock(() -> {
if (computers.values().remove(computer)) {
computer.onRemoved();
}
});
}

/*package*/ @CheckForNull Computer getComputer(Node n) {
Map<Node, Computer> computers = getComputerMap();
ConcurrentMap<Node, Computer> computers = getComputerMap();
return computers.get(n);
}

protected void updateNewComputer(final Node n, boolean automaticAgentLaunch) {
final String nodeName = n.getNodeName();
final Map<Node, Computer> computers = getComputerMap();
if (computers.containsKey(n)) {
LOGGER.warning("Node " + nodeName + " is not a new node skipping");
if (createNewComputerForNode(n, automaticAgentLaunch) == null) {
return;
}
createNewComputerForNode(n, automaticAgentLaunch);
getQueue().scheduleMaintenance();
Listeners.notify(ComputerListener.class, false, ComputerListener::onConfigurationChange);
}
Expand All @@ -232,7 +233,7 @@ protected void updateNewComputer(final Node n, boolean automaticAgentLaunch) {
* so that we won't upset {@link Executor}s running in it.
*/
protected void updateComputerList(final boolean automaticAgentLaunch) {
final Map<Node, Computer> computers = getComputerMap();
final ConcurrentMap<Node, Computer> computers = getComputerMap();
final Set<Computer> old = new HashSet<>(computers.size());
Queue.withLock(new Runnable() {
@Override
Expand Down
5 changes: 3 additions & 2 deletions core/src/main/java/jenkins/model/Jenkins.java
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,7 @@
import java.util.TreeSet;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
Expand Down Expand Up @@ -529,7 +530,7 @@ public class Jenkins extends AbstractCIBase implements DirectlyModifiableTopLeve
/**
* {@link Computer}s in this Jenkins system. Read-only.
*/
protected final transient Map<Node, Computer> computers = new CopyOnWriteMap.Hash<>();
protected final transient ConcurrentMap<Node, Computer> computers = new ConcurrentHashMap<>();

/**
* Active {@link Cloud}s.
Expand Down Expand Up @@ -2178,7 +2179,7 @@ public Cloud getCloud(String name) {
}

@Override
protected Map<Node, Computer> getComputerMap() {
protected ConcurrentMap<Node, Computer> getComputerMap() {
return computers;
}

Expand Down

0 comments on commit 5414938

Please sign in to comment.