Skip to content

Commit

Permalink
Make some Slurm errors fatal for execution (#392)
Browse files Browse the repository at this point in the history
This PR makes `sacct` and `sstat` errors result in an exception when running Slurm-based workflows. Previously, the errors were ignored and this could result in SmartSim's state becoming inconsistent and unstable.

[ committed by @al-rigazzi ]
[ reviewed by @MattToast ]
  • Loading branch information
al-rigazzi authored Dec 13, 2023
1 parent 3ee7794 commit 67785e1
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 53 deletions.
5 changes: 5 additions & 0 deletions doc/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ To be released at some future point in time

Description

- `sacct` and `sstat` errors are now fatal for Slurm-based workflow executions
- Added documentation section about ML features and TorchScript
- Added TorchScript functions to Online Analysis tutorial
- Split tests into groups for parallel execution in CI/CD pipeline
Expand All @@ -30,6 +31,9 @@ Description

Detailed Notes

- When the Slurm functions `sacct` and `sstat` returned an error, it would be ignored
and SmartSim's state could become inconsistent. To prevent this, errors
raised by `sacct` or `sstat` now result in an exception. (SmartSim-PR392_)
- A section named *ML Features* was added to documentation. It contains multiple
examples of how ML models and functions can be added to and executed on the DB.
TorchScript-based post-processing was added to the *Online Analysis* tutorial (SmartSim-PR411_)
Expand All @@ -50,6 +54,7 @@ Detailed Notes
- Add support for creation of multiple databases with unique identifiers. (SmartSim-PR342_)


.. _SmartSim-PR392: https://github.com/CrayLabs/SmartSim/pull/392
.. _SmartSim-PR411: https://github.com/CrayLabs/SmartSim/pull/411
.. _SmartSim-PR424: https://github.com/CrayLabs/SmartSim/pull/424
.. _SmartSim-PR417: https://github.com/CrayLabs/SmartSim/pull/417
Expand Down
6 changes: 2 additions & 4 deletions docker/docs/dev/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,13 @@ COPY . /usr/local/src/SmartSim/
WORKDIR /usr/local/src/SmartSim/

# Install smartredis
RUN rm smartredis \
&& git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --depth=1 smartredis \
RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --depth=1 smartredis \
&& cd smartredis \
&& python -m pip install . \
&& rm -rf ~/.cache/pip

# Install smartdashboard
RUN rm smartdashboard \
&& git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \
RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \
&& cd smartdashboard \
&& python -m pip install . \
&& rm -rf ~/.cache/pip
Expand Down
64 changes: 34 additions & 30 deletions smartsim/_core/launcher/slurm/slurmCommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,77 +26,70 @@

import typing as t

from ....log import get_logger
from ....error import LauncherError
from ...utils.helpers import expand_exe_path
from ..util.shell import execute_cmd

logger = get_logger(__name__)

def sstat(args: t.List[str]) -> t.Tuple[str, str]:

def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
"""Calls sstat with args
:param args: List of command arguments
:type args: List of str
:returns: Output and error of sstat
"""
_sstat = _find_slurm_command("sstat")
cmd = [_sstat] + args
_, out, error = execute_cmd(cmd)
return out, error
_, out, err = _execute_slurm_cmd("sstat", args, raise_on_err=raise_on_err)
return out, err


def sacct(args: t.List[str]) -> t.Tuple[str, str]:
def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
"""Calls sacct with args
:param args: List of command arguments
:type args: List of str
:returns: Output and error of sacct
"""
_sacct = _find_slurm_command("sacct")
cmd = [_sacct] + args
_, out, error = execute_cmd(cmd)
return out, error
_, out, err = _execute_slurm_cmd("sacct", args, raise_on_err=raise_on_err)
return out, err


def salloc(args: t.List[str]) -> t.Tuple[str, str]:
def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
"""Calls slurm salloc with args
:param args: List of command arguments
:type args: List of str
:returns: Output and error of salloc
"""
_salloc = _find_slurm_command("salloc")
cmd = [_salloc] + args
_, out, error = execute_cmd(cmd)
return out, error
_, out, err = _execute_slurm_cmd("salloc", args, raise_on_err=raise_on_err)
return out, err


def sinfo(args: t.List[str]) -> t.Tuple[str, str]:
def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
"""Calls slurm sinfo with args
:param args: List of command arguments
:type args: List of str
:returns: Output and error of sinfo
"""
_sinfo = _find_slurm_command("sinfo")
cmd = [_sinfo] + args
_, out, error = execute_cmd(cmd)
return out, error
_, out, err = _execute_slurm_cmd("sinfo", args, raise_on_err=raise_on_err)
return out, err


def scontrol(args: t.List[str]) -> t.Tuple[str, str]:
def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
"""Calls slurm scontrol with args
:param args: List of command arguments
:type args: List of str
:returns: Output and error of sinfo
"""
_scontrol = _find_slurm_command("scontrol")
cmd = [_scontrol] + args
_, out, error = execute_cmd(cmd)
return out, error
_, out, err = _execute_slurm_cmd("scontrol", args, raise_on_err=raise_on_err)
return out, err


def scancel(args: t.List[str]) -> t.Tuple[int, str, str]:
def scancel(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[int, str, str]:
"""Calls slurm scancel with args.
returncode is also supplied in this function.
Expand All @@ -106,10 +99,7 @@ def scancel(args: t.List[str]) -> t.Tuple[int, str, str]:
:return: output and error
:rtype: str
"""
_scancel = _find_slurm_command("scancel")
cmd = [_scancel] + args
returncode, out, error = execute_cmd(cmd)
return returncode, out, error
return _execute_slurm_cmd("scancel", args, raise_on_err=raise_on_err)


def _find_slurm_command(cmd: str) -> str:
Expand All @@ -120,3 +110,17 @@ def _find_slurm_command(cmd: str) -> str:
raise LauncherError(
f"Slurm Launcher could not find path of {cmd} command"
) from e


def _execute_slurm_cmd(
command: str, args: t.List[str], raise_on_err: bool = False
) -> t.Tuple[int, str, str]:
cmd_exe = _find_slurm_command(command)
cmd = [cmd_exe] + args
returncode, out, error = execute_cmd(cmd)
if returncode != 0:
msg = f"An error occurred while calling {command}: {error}"
if raise_on_err:
raise LauncherError(msg)
logger.error(msg)
return returncode, out, error
16 changes: 8 additions & 8 deletions smartsim/_core/launcher/slurm/slurmLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,7 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]:
"""
_, step_ids = self.step_mapping.get_ids(step_names, managed=True)
step_str = _create_step_id_str([val for val in step_ids if val is not None])
output, error = sstat([step_str, "-i", "-n", "-p", "-a"])

if "error:" in error.split(" "):
raise LauncherError("Failed to retrieve nodelist from stat")
output, _ = sstat([step_str, "-i", "-n", "-p", "-a"], raise_on_err=True)

# parse node list for each step
node_lists = []
Expand Down Expand Up @@ -240,9 +237,9 @@ def _get_slurm_step_id(step: Step, interval: int = 2) -> str:
step_id: t.Optional[str] = None
trials = CONFIG.wlm_trials
while trials > 0:
output, err = sacct(["--noheader", "-p", "--format=jobname,jobid"])
if err:
logger.warning(f"An error occurred while calling sacct: {err}")
output, _ = sacct(
["--noheader", "-p", "--format=jobname,jobid"], raise_on_err=True
)

step_id = parse_step_id_from_sacct(output, step.name)
if step_id:
Expand All @@ -263,7 +260,10 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]:
:rtype: list[StepInfo]
"""
step_str = _create_step_id_str(step_ids)
sacct_out, _ = sacct(["--noheader", "-p", "-b", "--jobs", step_str])
sacct_out, _ = sacct(
["--noheader", "-p", "-b", "--jobs", step_str], raise_on_err=True
)

# (status, returncode)
stat_tuples = [parse_sacct(sacct_out, step_id) for step_id in step_ids]

Expand Down
55 changes: 55 additions & 0 deletions tests/on_wlm/test_slurm_commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# BSD 2-Clause License
#
# Copyright (c) 2021-2023, Hewlett Packard Enterprise
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import pytest

from smartsim.error.errors import LauncherError
from smartsim._core.launcher.slurm.slurmCommands import *

# retrieved from pytest fixtures
if pytest.test_launcher != "slurm":
pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems")


# Test that common ways of launching commands
# raise when expected to do so
@pytest.mark.parametrize(
"cmd,raises",
[
(sacct, True),
(sstat, True),
(sinfo, False),
(salloc, False),
(scancel, False),
(scontrol, False),
],
)
def test_error_raises(cmd, raises):
args = ["--non_existing_arg"]
if raises:
with pytest.raises(LauncherError):
cmd(args, raise_on_err = True)
else:
cmd(args)
21 changes: 10 additions & 11 deletions tests/test_telemetry_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,15 @@

import logging
import pathlib
from random import sample
import pytest
import shutil
import sys
import typing as t
import time
import uuid
from conftest import FileUtils, MLUtils, WLMUtils
import smartsim
from conftest import FileUtils, WLMUtils

from smartsim._core.control.jobmanager import JobManager
from smartsim._core.control.job import Job, JobEntity, _JobKey
from smartsim._core.control.job import Job, JobEntity
from smartsim._core.launcher.launcher import WLMLauncher
from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher
from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd
Expand All @@ -52,7 +49,6 @@
STATUS_NEW,
STATUS_PAUSED,
STATUS_RUNNING,
TERMINAL_STATUSES,
)
import smartsim._core.config.config as cfg

Expand Down Expand Up @@ -642,7 +638,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch):

def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch):
"""
Test telemetry with only a database running
Test telemetry with only a non-generated database running
"""
with monkeypatch.context() as ctx:
ctx.setattr(cfg.Config, "telemetry_frequency", 1)
Expand All @@ -660,6 +656,8 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch):

# create regular database
orc = exp.create_database(port=test_port, interface=test_interface)
orc.set_path(test_dir)

try:
exp.start(orc)

Expand All @@ -683,7 +681,7 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch):

def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch):
"""
Test telemetry with only a database running
Test telemetry with only a database and a model running
"""

with monkeypatch.context() as ctx:
Expand All @@ -703,6 +701,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch):

# create regular database
orc = exp.create_database(port=test_port, interface=test_interface)
exp.generate(orc)
try:
exp.start(orc)

Expand Down Expand Up @@ -739,7 +738,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch):

def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch):
"""
Test telemetry with only a database running
Test telemetry with only an ensemble
"""

with monkeypatch.context() as ctx:
Expand Down Expand Up @@ -775,14 +774,14 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch):

def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch):
"""
Test telemetry with only a database running
Test telemetry with only a colocated model running
"""

with monkeypatch.context() as ctx:
ctx.setattr(cfg.Config, "telemetry_frequency", 1)

# Set experiment name
exp_name = "telemetry_ensemble"
exp_name = "telemetry_colo"

# Retrieve parameters from testing environment
test_launcher = wlmutils.get_test_launcher()
Expand Down

0 comments on commit 67785e1

Please sign in to comment.