Skip to content

Commit

Permalink
slurm: wait for all nodes being ready
Browse files Browse the repository at this point in the history
Implement ready_many similar to drain_many which waits until the
desired state is reached, with a configurable timeout. If nodes
fail to become ready, they are reported in the timeout exception.
Usually we expect the nodes to become ready almost immediately so
there shouldn't be a noticeable waiting period.

ready_many is now used by the `all-nodes ready` subcommand.

PL-131739
  • Loading branch information
dpausp committed Sep 18, 2023
1 parent ede6213 commit e4ecf23
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 21 deletions.
35 changes: 14 additions & 21 deletions pkgs/fc/agent/fc/manage/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ def ready_all(
"will be set to ready."
),
),
timeout: int = Option(
default=30, help="Wait for seconds for nodes to become ready."
),
strict_state_check: Optional[bool] = False,
reason_must_match: Optional[str] = Option(
default=None,
Expand Down Expand Up @@ -225,27 +228,17 @@ def ready_all(
return

with directory_connection(context.enc_path) as directory:
for node_name in node_names:
fc.util.slurm.ready(
log,
node_name,
strict_state_check,
reason_must_match,
skip_nodes_in_maintenance,
directory,
)

# XXX: Give slurmctld a bit more time to settle.
# The change to "ready" should be almost instantaneous. However, we experienced an
# alert when the Sensu check ran some milliseconds after this command finished and
# still saw all nodes as "down" even after slurmctld said that nodes are responding.
# This will be replaced by a proper wait loop like drain_many.
# See PL-131739 "Slurm maintenance causes alert".
log.info(
"ready-all-finished",
_replace_msg="Finished, waiting 2 seconds for slurmctld to settle.",
)
time.sleep(2)
fc.util.slurm.ready_many(
log,
node_names,
timeout,
strict_state_check,
reason_must_match,
skip_nodes_in_maintenance,
directory,
)

log.info("ready-all-finished")


@all_nodes_app.command()
Expand Down
111 changes: 111 additions & 0 deletions pkgs/fc/agent/fc/util/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@ def is_node_drained(log, node_info):
return drained


def is_node_ready(node_info):
    """Return True if the node counts as ready for running jobs.

    A node is ready when it is neither draining/drained nor down.
    Slurm appends a `*` to the base state when the node is not
    responding to slurmctld (e.g. `IDLE*`, `DOWN*`), so any
    `*`-suffixed state is also treated as not ready.
    """
    state, *flags = node_info["state"].split("+")
    if "DRAIN" in flags:
        return False
    # `*` suffix: node is not responding; it cannot be considered
    # ready even if the base state looks fine. This also covers DOWN*.
    if state.endswith("*"):
        return False
    return state != "DOWN"


def get_all_node_names():
    """Names of all nodes known to the slurm controller."""
    all_nodes = pyslurm.node().get()
    return all_nodes.keys()

Expand Down Expand Up @@ -490,6 +495,112 @@ def ready(
)


def ready_many(
    log,
    node_names: list[str],
    timeout: int,
    strict_state_check: bool = False,
    reason_must_match: Optional[str] = None,
    skip_in_maintenance=False,
    directory=None,
):
    """Set the given nodes to ready and wait until they actually are.

    Runs the same pre-checks as the single-node `ready` command for
    each node; nodes where the pre-checks decide that nothing should be
    done (state check fails, reason does not match, node is in
    maintenance) are skipped. The remaining nodes are resumed with a
    single slurmctld update call and then polled with exponential
    backoff until all of them report a ready state.

    Raises NodeStateTimeout with the states of the still-unready nodes
    when `timeout` (seconds) is exceeded.
    """
    log.debug("ready-many-start", nodes=node_names)

    nodes_to_wait_for = set()

    for node_name in node_names:
        check_result = run_ready_pre_checks(
            log,
            node_name,
            strict_state_check,
            reason_must_match,
            skip_in_maintenance,
            directory,
        )
        if check_result.action:
            nodes_to_wait_for.add(node_name)

    if not nodes_to_wait_for:
        log.info(
            "ready-many-nothing-to-do",
            _replace_msg="OK: All wanted nodes are already ready.",
        )
        return

    # One update call for all nodes; slurmctld applies the state change
    # asynchronously, which is why we poll afterwards.
    state_change_ready = {
        "node_names": ",".join(nodes_to_wait_for),
        "node_state": pyslurm.NODE_RESUME,
    }
    result = update_nodes(state_change_ready)
    log.debug("node-update-result", result=result)

    log.info(
        "ready-many-waiting",
        num_waiting_nodes=len(nodes_to_wait_for),
        _replace_msg="Waiting for {num_waiting_nodes} nodes to become ready.",
    )

    start_time = time.time()
    elapsed = 0.0
    ii = 0

    while elapsed < timeout:
        ready_nodes = set()
        for node_name in nodes_to_wait_for:
            node_info = get_node_info(node_name)

            if is_node_ready(node_info):
                log.info(
                    "node-ready",
                    _replace_msg="{node} is now ready.",
                    node=node_name,
                )

                ready_nodes.add(node_name)

        nodes_to_wait_for -= ready_nodes

        if not nodes_to_wait_for:
            log.info(
                "ready-many-finished",
                _replace_msg="All nodes are ready after {elapsed} seconds.",
                elapsed=int(elapsed),
            )
            return

        log.debug(
            "ready-many-wait",
            elapsed=int(elapsed),
            timeout=timeout,
            num_waiting_nodes=len(nodes_to_wait_for),
        )

        # Exponential backoff capped at 15 seconds, but never sleep
        # (much) past the timeout deadline.
        pause = min(15, 2**ii, max(timeout - elapsed, 0.1))
        log.debug("ready-wait", sleep=pause)
        time.sleep(pause)
        ii += 1
        elapsed = time.time() - start_time

    # Loop finished => time limit reached.

    remaining_node_states = {
        o: get_node_info(o)["state"] for o in nodes_to_wait_for
    }

    log.error(
        "ready-many-timeout",
        timeout=timeout,
        remaining_node_states=remaining_node_states,
        num_remaining=len(nodes_to_wait_for),
        _replace_msg=(
            "{num_remaining} node(s) did not become ready in time, waited "
            "{timeout} seconds for: {remaining_node_states}"
        ),
    )
    raise NodeStateTimeout(remaining_node_states)


def check_controller(log, hostname):
errors = []
warnings = []
Expand Down
96 changes: 96 additions & 0 deletions pkgs/fc/agent/fc/util/tests/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,3 +310,99 @@ def fake_get_node_info(node_name):
timeout=3,
remaining_node_states=remaining_node_states,
)


def test_ready_many(logger, monkeypatch):
    # Per-node sequences of states, one entry consumed per info lookup.
    state_sequences = {
        "test20": iter(["IDLE+DRAIN", "IDLE"]),
        "test21": iter(["DOWN+DRAIN", "MIXED"]),
        "test22": iter(["DOWN", "IDLE"]),
        "test23": iter(["IDLE"]),
        "test24": iter(["IDLE*", "IDLE"]),
        # Node is DOWN forever.
        "test25": repeat("DOWN"),
    }

    # test25 should be ignored by ready_many. It would cause a timeout if the function
    # fails to ignore it because the node will never be in a ready state.

    def fake_node_info(name):
        reason = "other" if name == "test25" else "test ready many"
        return {
            "name": name,
            "state": next(state_sequences[name]),
            "reason": reason,
        }

    monkeypatch.setattr(fc.util.slurm, "get_node_info", fake_node_info)
    fc.util.slurm.ready_many(
        logger,
        list(state_sequences.keys()),
        3,
        reason_must_match="test ready many",
    )


def test_ready_many_timeout(logger, log, monkeypatch):
    # Per-node sequences of states, one entry consumed per info lookup.
    state_sequences = {
        "test20": iter(["IDLE+DRAIN", "IDLE"]),
        "test21": iter(["IDLE"]),
        # Node is DOWN forever.
        "test22": repeat("DOWN"),
    }

    def fake_node_info(name):
        return {
            "name": name,
            "state": next(state_sequences[name]),
            "reason": "test ready many timeout",
        }

    monkeypatch.setattr(fc.util.slurm, "get_node_info", fake_node_info)

    # test22 never leaves DOWN, so ready_many must give up after the timeout.
    with raises(NodeStateTimeout) as excinfo:
        fc.util.slurm.ready_many(
            logger,
            list(state_sequences.keys()),
            3,
            reason_must_match="test ready many timeout",
        )

    expected_states = {"test22": "DOWN"}

    assert excinfo.value.remaining_node_states == expected_states

    assert log.has(
        "ready-many-timeout",
        timeout=3,
        remaining_node_states=expected_states,
    )

0 comments on commit e4ecf23

Please sign in to comment.