slurm: wait for all nodes being ready #786

Merged: 1 commit, Sep 28, 2023
35 changes: 14 additions & 21 deletions pkgs/fc/agent/fc/manage/slurm.py
@@ -189,6 +189,9 @@ def ready_all(
"will be set to ready."
),
),
timeout: int = Option(
default=30, help="Number of seconds to wait for nodes to become ready."
),
strict_state_check: Optional[bool] = False,
reason_must_match: Optional[str] = Option(
default=None,
@@ -225,27 +228,17 @@ def ready_all(
return

with directory_connection(context.enc_path) as directory:
for node_name in node_names:
fc.util.slurm.ready(
log,
node_name,
strict_state_check,
reason_must_match,
skip_nodes_in_maintenance,
directory,
)

# XXX: Give slurmctld a bit more time to settle.
# The change to "ready" should be almost instantaneous. However, we experienced an
# alert when the Sensu check ran some milliseconds after this command finished and
# still saw all nodes as "down" even after slurmctld said that nodes are responding.
# This will be replaced by a proper wait loop like drain_many.
# See PL-131739 "Slurm maintenance causes alert".
log.info(
"ready-all-finished",
_replace_msg="Finished, waiting 2 seconds for slurmctld to settle.",
)
time.sleep(2)
fc.util.slurm.ready_many(
log,
node_names,
timeout,
strict_state_check,
reason_must_match,
skip_nodes_in_maintenance,
directory,
)

log.info("ready-all-finished")


@all_nodes_app.command()
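The net effect: ready_all gains a --timeout option (default 30 seconds) and hands the waiting to the new fc.util.slurm.ready_many helper instead of sleeping a fixed 2 seconds after marking nodes ready. A minimal sketch of calling the helper directly; the structlog-style logger and the node names are placeholder assumptions, not part of the diff:

    import fc.util.slurm
    import structlog

    log = structlog.get_logger()
    # Marks both nodes for RESUME, then polls until neither is DOWN or
    # draining, raising NodeStateTimeout if that takes longer than 30 seconds.
    fc.util.slurm.ready_many(log, ["node01", "node02"], timeout=30)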
111 changes: 111 additions & 0 deletions pkgs/fc/agent/fc/util/slurm.py
@@ -91,6 +91,11 @@ def is_node_drained(log, node_info):
return drained


def is_node_ready(node_info):
state, *flags = node_info["state"].split("+")
return "DRAIN" not in flags and state not in ("DOWN", "DOWN*")


def get_all_node_names():
return pyslurm.node().get().keys()
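
Slurm encodes a node's state as a base state plus zero or more "+"-joined flags (e.g. IDLE+DRAIN); a trailing "*" on the base state marks a node that is not responding. A quick illustration of how the is_node_ready helper above classifies the state strings used in the tests further down; this printout is a sketch, not part of the diff:

    # Ready means: no DRAIN flag, and the base state is not DOWN or DOWN*.
    for s in ("IDLE", "MIXED", "IDLE+DRAIN", "DOWN", "DOWN+DRAIN", "DOWN*"):
        print(s, "->", is_node_ready({"state": s}))
    # IDLE -> True, MIXED -> True, IDLE+DRAIN -> False,
    # DOWN -> False, DOWN+DRAIN -> False, DOWN* -> False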

@@ -490,6 +495,112 @@ def ready(
)


def ready_many(
log,
node_names: list[str],
timeout: int,
strict_state_check: bool = False,
reason_must_match: Optional[str] = None,
skip_in_maintenance=False,
directory=None,
):
log.debug("ready-many-start", nodes=node_names)

nodes_to_wait_for = set()

for node_name in node_names:
check_result = run_ready_pre_checks(
log,
node_name,
strict_state_check,
reason_must_match,
skip_in_maintenance,
directory,
)
if check_result.action:
nodes_to_wait_for.add(node_name)

if not nodes_to_wait_for:
log.info(
"ready-many-nothing-to-do",
_replace_msg="OK: All wanted nodes are already ready.",
)
return

state_change_ready = {
"node_names": ",".join(nodes_to_wait_for),
"node_state": pyslurm.NODE_RESUME,
}
result = update_nodes(state_change_ready)
log.debug("node-update-result", result=result)

log.info(
"ready-many-waiting",
num_waiting_nodes=len(nodes_to_wait_for),
_replace_msg="Waiting for {num_waiting_nodes} nodes to become ready.",
)

start_time = time.time()
elapsed = 0.0
ii = 0

while elapsed < timeout:
ready_nodes = set()
for node_name in nodes_to_wait_for:
node_info = get_node_info(node_name)

if is_node_ready(node_info):
log.info(
"node-ready",
_replace_msg="{node} is now ready.",
node=node_name,
)

ready_nodes.add(node_name)

for node_name in ready_nodes:
nodes_to_wait_for.remove(node_name)

if not nodes_to_wait_for:
log.info(
"ready-many-finished",
_replace_msg="All nodes are ready after {elapsed} seconds.",
elapsed=int(elapsed),
)
return

log.debug(
"ready-all-wait",
elapsed=int(elapsed),
timeout=timeout,
num_waiting_nodes=len(nodes_to_wait_for),
)

pause = min([15, 2**ii])
log.debug("ready-wait", sleep=pause)
time.sleep(pause)
ii += 1
elapsed = time.time() - start_time

# Loop finished => time limit reached

remaining_node_states = {
o: get_node_info(o)["state"] for o in nodes_to_wait_for
}

log.error(
"ready-many-timeout",
timeout=timeout,
remaining_node_states=remaining_node_states,
num_remaining=len(nodes_to_wait_for),
_replace_msg=(
"{num_remaining} node(s) did not become ready in time, waited "
"{timeout} seconds for: {remaining_node_states}"
),
)
raise NodeStateTimeout(remaining_node_states)


def check_controller(log, hostname):
errors = []
warnings = []
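The wait loop in ready_many polls with exponential backoff, capped at 15 seconds between polls, so short timeouts fail fast while long ones do not hammer slurmctld. The resulting pause schedule as a one-liner:

    # Backoff pauses for iterations ii = 0..6: doubling, capped at 15 seconds.
    print([min([15, 2**ii]) for ii in range(7)])  # [1, 2, 4, 8, 15, 15, 15]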
96 changes: 96 additions & 0 deletions pkgs/fc/agent/fc/util/tests/test_slurm.py
@@ -310,3 +310,99 @@ def fake_get_node_info(node_name):
timeout=3,
remaining_node_states=remaining_node_states,
)


def test_ready_many(logger, monkeypatch):
iter_states = {
"test20": iter(
[
"IDLE+DRAIN",
"IDLE",
]
),
"test21": iter(
[
"DOWN+DRAIN",
"MIXED",
]
),
"test22": iter(
[
"DOWN",
"IDLE",
]
),
"test23": iter(
[
"IDLE",
]
),
"test24": iter(
[
"IDLE*",
"IDLE",
]
),
# Node is DOWN forever.
"test25": repeat("DOWN"),
}

# test25 must be skipped by ready_many's pre-checks: its reason ("other")
# does not match reason_must_match below. If it were not skipped, the test
# would time out, since the node never leaves the DOWN state.

def fake_get_node_info(node_name):
return {
"name": node_name,
"state": next(iter_states[node_name]),
"reason": "other" if node_name == "test25" else "test ready many",
}

monkeypatch.setattr(fc.util.slurm, "get_node_info", fake_get_node_info)
fc.util.slurm.ready_many(
logger, list(iter_states.keys()), 3, reason_must_match="test ready many"
)


def test_ready_many_timeout(logger, log, monkeypatch):
iter_states = {
"test20": iter(
[
"IDLE+DRAIN",
"IDLE",
]
),
"test21": iter(
[
"IDLE",
]
),
# Node is DOWN forever.
"test22": repeat("DOWN"),
}

def fake_get_node_info(node_name):
return {
"name": node_name,
"state": next(iter_states[node_name]),
"reason": "test ready many timeout",
}

monkeypatch.setattr(fc.util.slurm, "get_node_info", fake_get_node_info)

with raises(NodeStateTimeout) as e:
fc.util.slurm.ready_many(
logger,
list(iter_states.keys()),
3,
reason_must_match="test ready many timeout",
)

remaining_node_states = {"test22": "DOWN"}

assert e.value.remaining_node_states == remaining_node_states

assert log.has(
"ready-many-timeout",
timeout=3,
remaining_node_states=remaining_node_states,
)
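
Both tests pass timeout=3, which lines up with the backoff schedule: the loop polls, sleeps 1 second, polls again, sleeps 2 seconds, and then hits the limit. In the first test every waited-for node is ready by the second poll (test25 never enters the wait set because its reason does not match), while in the timeout test the permanently DOWN test22 is left over and reported in remaining_node_states. Assuming the repository's pytest setup provides the logger and log fixtures seen in the signatures, the tests run without a live slurmctld, e.g.: pytest pkgs/fc/agent/fc/util/tests/test_slurm.py -k ready_many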