Make some Slurm errors fatal for execution (#392)

This PR makes `sacct` and `sstat` errors result in an exception when running Slurm-based workflows. Previously, the errors were ignored and this could result in SmartSim's state becoming inconsistent and unstable. [ committed by @al-rigazzi ] [ reviewed by @MattToast ]
CrayLabs · Dec 13, 2023 · 67785e1 · 67785e1
1 parent 3ee7794
commit 67785e1
Show file tree

Hide file tree

Showing 6 changed files with 114 additions and 53 deletions.
diff --git a/doc/changelog.rst b/doc/changelog.rst
@@ -19,6 +19,7 @@ To be released at some future point in time
 
 Description
 
+- `sacct` and `sstat` errors are now fatal for Slurm-based workflow executions
 - Added documentation section about ML features and TorchScript
 - Added TorchScript functions to Online Analysis tutorial
 - Split tests into groups for parallel execution in CI/CD pipeline
@@ -30,6 +31,9 @@ Description
 
 Detailed Notes
 
+- When the Slurm functions `sacct` and `sstat` returned an error, it would be ignored
+  and SmartSim's state could become inconsistent. To prevent this, errors
+  raised by `sacct` or `sstat` now result in an exception. (SmartSim-PR392_)
 - A section named *ML Features* was added to documentation. It contains multiple
   examples of how ML models and functions can be added to and executed on the DB.
   TorchScript-based post-processing was added to the *Online Analysis* tutorial (SmartSim-PR411_)
@@ -50,6 +54,7 @@ Detailed Notes
 - Add support for creation of multiple databases with unique identifiers. (SmartSim-PR342_)
 
 
+.. _SmartSim-PR392: https://github.com/CrayLabs/SmartSim/pull/392
 .. _SmartSim-PR411: https://github.com/CrayLabs/SmartSim/pull/411
 .. _SmartSim-PR424: https://github.com/CrayLabs/SmartSim/pull/424
 .. _SmartSim-PR417: https://github.com/CrayLabs/SmartSim/pull/417

diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile
@@ -43,15 +43,13 @@ COPY . /usr/local/src/SmartSim/
 WORKDIR /usr/local/src/SmartSim/
 
 # Install smartredis
-RUN rm smartredis \
-    && git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --depth=1 smartredis \
+RUN git clone https://github.com/CrayLabs/SmartRedis.git --branch develop --depth=1 smartredis \
     && cd smartredis \
     && python -m pip install . \
     && rm -rf ~/.cache/pip
 
 # Install smartdashboard
-RUN rm smartdashboard \
-    && git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \
+RUN git clone https://github.com/CrayLabs/SmartDashboard.git --branch develop --depth=1 smartdashboard \
     && cd smartdashboard \
     && python -m pip install . \
     && rm -rf ~/.cache/pip

diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py
@@ -26,77 +26,70 @@
 
 import typing as t
 
+from ....log import get_logger
 from ....error import LauncherError
 from ...utils.helpers import expand_exe_path
 from ..util.shell import execute_cmd
 
+logger = get_logger(__name__)
 
-def sstat(args: t.List[str]) -> t.Tuple[str, str]:
+
+def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
     """Calls sstat with args
 
     :param args: List of command arguments
     :type args: List of str
     :returns: Output and error of sstat
     """
-    _sstat = _find_slurm_command("sstat")
-    cmd = [_sstat] + args
-    _, out, error = execute_cmd(cmd)
-    return out, error
+    _, out, err = _execute_slurm_cmd("sstat", args, raise_on_err=raise_on_err)
+    return out, err
 
 
-def sacct(args: t.List[str]) -> t.Tuple[str, str]:
+def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
     """Calls sacct with args
 
     :param args: List of command arguments
     :type args: List of str
     :returns: Output and error of sacct
     """
-    _sacct = _find_slurm_command("sacct")
-    cmd = [_sacct] + args
-    _, out, error = execute_cmd(cmd)
-    return out, error
+    _, out, err = _execute_slurm_cmd("sacct", args, raise_on_err=raise_on_err)
+    return out, err
 
 
-def salloc(args: t.List[str]) -> t.Tuple[str, str]:
+def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
     """Calls slurm salloc with args
 
     :param args: List of command arguments
     :type args: List of str
     :returns: Output and error of salloc
     """
-    _salloc = _find_slurm_command("salloc")
-    cmd = [_salloc] + args
-    _, out, error = execute_cmd(cmd)
-    return out, error
+    _, out, err = _execute_slurm_cmd("salloc", args, raise_on_err=raise_on_err)
+    return out, err
 
 
-def sinfo(args: t.List[str]) -> t.Tuple[str, str]:
+def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
     """Calls slurm sinfo with args
 
     :param args: List of command arguments
     :type args: List of str
     :returns: Output and error of sinfo
     """
-    _sinfo = _find_slurm_command("sinfo")
-    cmd = [_sinfo] + args
-    _, out, error = execute_cmd(cmd)
-    return out, error
+    _, out, err = _execute_slurm_cmd("sinfo", args, raise_on_err=raise_on_err)
+    return out, err
 
 
-def scontrol(args: t.List[str]) -> t.Tuple[str, str]:
+def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str]:
     """Calls slurm scontrol with args
 
     :param args: List of command arguments
     :type args: List of str
     :returns: Output and error of sinfo
     """
-    _scontrol = _find_slurm_command("scontrol")
-    cmd = [_scontrol] + args
-    _, out, error = execute_cmd(cmd)
-    return out, error
+    _, out, err = _execute_slurm_cmd("scontrol", args, raise_on_err=raise_on_err)
+    return out, err
 
 
-def scancel(args: t.List[str]) -> t.Tuple[int, str, str]:
+def scancel(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[int, str, str]:
     """Calls slurm scancel with args.
 
     returncode is also supplied in this function.
@@ -106,10 +99,7 @@ def scancel(args: t.List[str]) -> t.Tuple[int, str, str]:
     :return: output and error
     :rtype: str
     """
-    _scancel = _find_slurm_command("scancel")
-    cmd = [_scancel] + args
-    returncode, out, error = execute_cmd(cmd)
-    return returncode, out, error
+    return _execute_slurm_cmd("scancel", args, raise_on_err=raise_on_err)
 
 
 def _find_slurm_command(cmd: str) -> str:
@@ -120,3 +110,17 @@ def _find_slurm_command(cmd: str) -> str:
         raise LauncherError(
             f"Slurm Launcher could not find path of {cmd} command"
         ) from e
+
+
+def _execute_slurm_cmd(
+    command: str, args: t.List[str], raise_on_err: bool = False
+) -> t.Tuple[int, str, str]:
+    cmd_exe = _find_slurm_command(command)
+    cmd = [cmd_exe] + args
+    returncode, out, error = execute_cmd(cmd)
+    if returncode != 0:
+        msg = f"An error occurred while calling {command}: {error}"
+        if raise_on_err:
+            raise LauncherError(msg)
+        logger.error(msg)
+    return returncode, out, error
diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py
@@ -107,10 +107,7 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]:
         """
         _, step_ids = self.step_mapping.get_ids(step_names, managed=True)
         step_str = _create_step_id_str([val for val in step_ids if val is not None])
-        output, error = sstat([step_str, "-i", "-n", "-p", "-a"])
-
-        if "error:" in error.split(" "):
-            raise LauncherError("Failed to retrieve nodelist from stat")
+        output, _ = sstat([step_str, "-i", "-n", "-p", "-a"], raise_on_err=True)
 
         # parse node list for each step
         node_lists = []
@@ -240,9 +237,9 @@ def _get_slurm_step_id(step: Step, interval: int = 2) -> str:
         step_id: t.Optional[str] = None
         trials = CONFIG.wlm_trials
         while trials > 0:
-            output, err = sacct(["--noheader", "-p", "--format=jobname,jobid"])
-            if err:
-                logger.warning(f"An error occurred while calling sacct: {err}")
+            output, _ = sacct(
+                ["--noheader", "-p", "--format=jobname,jobid"], raise_on_err=True
+            )
 
             step_id = parse_step_id_from_sacct(output, step.name)
             if step_id:
@@ -263,7 +260,10 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]:
         :rtype: list[StepInfo]
         """
         step_str = _create_step_id_str(step_ids)
-        sacct_out, _ = sacct(["--noheader", "-p", "-b", "--jobs", step_str])
+        sacct_out, _ = sacct(
+            ["--noheader", "-p", "-b", "--jobs", step_str], raise_on_err=True
+        )
+
         # (status, returncode)
         stat_tuples = [parse_sacct(sacct_out, step_id) for step_id in step_ids]
 

diff --git a/tests/on_wlm/test_slurm_commands.py b/tests/on_wlm/test_slurm_commands.py
@@ -0,0 +1,55 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2023, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import pytest
+
+from smartsim.error.errors import LauncherError
+from smartsim._core.launcher.slurm.slurmCommands import *
+
+# retrieved from pytest fixtures
+if pytest.test_launcher != "slurm":
+    pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems")
+
+
+# Test that common ways of launching commands
+# raise when expected to do so
+@pytest.mark.parametrize(
+    "cmd,raises",
+    [
+        (sacct, True),
+        (sstat, True),
+        (sinfo, False),
+        (salloc, False),
+        (scancel, False),
+        (scontrol, False),
+    ],
+)
+def test_error_raises(cmd, raises):
+    args = ["--non_existing_arg"]
+    if raises:
+        with pytest.raises(LauncherError):
+            cmd(args, raise_on_err = True)
+    else:
+        cmd(args)
diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py
@@ -27,18 +27,15 @@
 
 import logging
 import pathlib
-from random import sample
 import pytest
-import shutil
 import sys
 import typing as t
 import time
 import uuid
-from conftest import FileUtils, MLUtils, WLMUtils
-import smartsim
+from conftest import FileUtils, WLMUtils
 
 from smartsim._core.control.jobmanager import JobManager
-from smartsim._core.control.job import Job, JobEntity, _JobKey
+from smartsim._core.control.job import Job, JobEntity
 from smartsim._core.launcher.launcher import WLMLauncher
 from smartsim._core.launcher.slurm.slurmLauncher import SlurmLauncher
 from smartsim._core.launcher.step.step import Step, proxyable_launch_cmd
@@ -52,7 +49,6 @@
     STATUS_NEW,
     STATUS_PAUSED,
     STATUS_RUNNING,
-    TERMINAL_STATUSES,
 )
 import smartsim._core.config.config as cfg
 
@@ -642,7 +638,7 @@ def test_telemetry_db_only_with_generate(test_dir, wlmutils, monkeypatch):
 
 def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch):
     """
-    Test telemetry with only a database running
+    Test telemetry with only a non-generated database running
     """
     with monkeypatch.context() as ctx:
         ctx.setattr(cfg.Config, "telemetry_frequency", 1)
@@ -660,6 +656,8 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch):
 
         # create regular database
         orc = exp.create_database(port=test_port, interface=test_interface)
+        orc.set_path(test_dir)
+
         try:
             exp.start(orc)
 
@@ -683,7 +681,7 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch):
 
 def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch):
     """
-    Test telemetry with only a database running
+    Test telemetry with only a database and a model running
     """
 
     with monkeypatch.context() as ctx:
@@ -703,6 +701,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch):
 
         # create regular database
         orc = exp.create_database(port=test_port, interface=test_interface)
+        exp.generate(orc)
         try:
             exp.start(orc)
 
@@ -739,7 +738,7 @@ def test_telemetry_db_and_model(fileutils, test_dir, wlmutils, monkeypatch):
 
 def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch):
     """
-    Test telemetry with only a database running
+    Test telemetry with only an ensemble
     """
 
     with monkeypatch.context() as ctx:
@@ -775,14 +774,14 @@ def test_telemetry_ensemble(fileutils, test_dir, wlmutils, monkeypatch):
 
 def test_telemetry_colo(fileutils, test_dir, wlmutils, coloutils, monkeypatch):
     """
-    Test telemetry with only a database running
+    Test telemetry with only a colocated model running
     """
 
     with monkeypatch.context() as ctx:
         ctx.setattr(cfg.Config, "telemetry_frequency", 1)
 
         # Set experiment name
-        exp_name = "telemetry_ensemble"
+        exp_name = "telemetry_colo"
 
         # Retrieve parameters from testing environment
         test_launcher = wlmutils.get_test_launcher()