Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add base RunSettings support for slurm, pbs, and cobalt #90

Merged
merged 19 commits into from
Oct 20, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,31 @@ def get_test_interface():
global test_nic
return test_nic

@staticmethod
def get_base_run_settings(exe, args, nodes=1, ntasks=1, **kwargs):
if test_launcher == "slurm":
run_args = {"--nodes": nodes,
"--ntasks": ntasks,
"--time": "00:10:00"}
run_args.update(kwargs)
settings = RunSettings(exe, args, run_command="srun", run_args=run_args)
return settings
if test_launcher == "pbs":
run_args = {"--pes": ntasks}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why we don't specify nodes and time for pbs and cobalt base settings?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aprun doesn't have the concept of nodes; it only deals in processing elements (PEs).

run_args.update(kwargs)
settings = RunSettings(exe, args, run_command="aprun", run_args=run_args)
return settings
if test_launcher == "cobalt":
run_args = {"--pes": ntasks}
run_args.update(kwargs)
settings = RunSettings(exe, args, run_command="aprun", run_args=run_args)
return settings
if test_launcher == "lsf":
raise SSUnsupportedError("SmartSim LSF launcher does not support custom run settings at this time.")
# TODO allow user to pick aprun vs MPIrun
return RunSettings(exe, args)


@staticmethod
def get_run_settings(exe, args, nodes=1, ntasks=1, **kwargs):
if test_launcher == "slurm":
Expand Down
7 changes: 5 additions & 2 deletions smartsim/launcher/cobalt/cobaltLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@

from ...constants import STATUS_CANCELLED, STATUS_COMPLETED
from ...error import LauncherError, SSConfigError
from ...settings import AprunSettings, CobaltBatchSettings, MpirunSettings
from ...settings import AprunSettings, CobaltBatchSettings, MpirunSettings, RunSettings
from ...utils import get_logger
from ..launcher import WLMLauncher
from ..pbs.pbsCommands import qdel, qstat
from ..step import AprunStep, CobaltBatchStep, MpirunStep
from ..step import AprunStep, CobaltBatchStep, LocalStep, MpirunStep
from ..stepInfo import CobaltStepInfo
from .cobaltParser import parse_cobalt_step_id, parse_cobalt_step_status, parse_qsub_out

Expand Down Expand Up @@ -80,6 +80,9 @@ def create_step(self, name, cwd, step_settings):
if isinstance(step_settings, MpirunSettings):
step = MpirunStep(name, cwd, step_settings)
return step
if isinstance(step_settings, RunSettings):
step = LocalStep(name, cwd, step_settings)
return step
raise TypeError(
f"RunSettings type {type(step_settings)} not supported by Cobalt"
)
Expand Down
7 changes: 5 additions & 2 deletions smartsim/launcher/pbs/pbsLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

from ...constants import STATUS_CANCELLED, STATUS_COMPLETED
from ...error import LauncherError, SSConfigError
from ...settings import AprunSettings, MpirunSettings, QsubBatchSettings
from ...settings import AprunSettings, MpirunSettings, QsubBatchSettings, RunSettings
from ...utils import get_logger
from ..launcher import WLMLauncher
from ..step import AprunStep, MpirunStep, QsubBatchStep
from ..step import AprunStep, LocalStep, MpirunStep, QsubBatchStep
from ..stepInfo import PBSStepInfo
from .pbsCommands import qdel, qstat
from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat
Expand Down Expand Up @@ -76,6 +76,9 @@ def create_step(self, name, cwd, step_settings):
if isinstance(step_settings, MpirunSettings):
step = MpirunStep(name, cwd, step_settings)
return step
if isinstance(step_settings, RunSettings):
step = LocalStep(name, cwd, step_settings)
return step
raise TypeError(
f"RunSettings type {type(step_settings)} not supported by PBSPro"
)
Expand Down
7 changes: 5 additions & 2 deletions smartsim/launcher/slurm/slurmLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@

from ...constants import STATUS_CANCELLED
from ...error import LauncherError, SSConfigError, SSUnsupportedError
from ...settings import MpirunSettings, SbatchSettings, SrunSettings
from ...settings import MpirunSettings, RunSettings, SbatchSettings, SrunSettings
from ...utils import get_logger
from ..launcher import WLMLauncher
from ..step import MpirunStep, SbatchStep, SrunStep
from ..step import LocalStep, MpirunStep, SbatchStep, SrunStep
from ..stepInfo import SlurmStepInfo
from .slurmCommands import sacct, scancel, sstat
from .slurmParser import parse_sacct, parse_sstat_nodes, parse_step_id_from_sacct
Expand Down Expand Up @@ -77,6 +77,9 @@ def create_step(self, name, cwd, step_settings):
if isinstance(step_settings, MpirunSettings):
step = MpirunStep(name, cwd, step_settings)
return step
if isinstance(step_settings, RunSettings):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So let's say I'm on a Slurm system and I pass AprunSettings as my model's run settings.
The launcher doesn't have an AprunStep, but it does handle the base class, so it will still try to launch the AprunSettings as a LocalStep.

So then, if I launch, I would get this error:

Traceback (most recent call last):
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 359, in _launch_step
    job_id = self._launcher.run(job_step)
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/launcher/slurm/slurmLauncher.py", line 159, in run
    task_id = self.task_manager.start_task(
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/launcher/taskManager.py", line 124, in start_task
    proc = execute_async_cmd(cmd_list, cwd, env=env, out=out, err=err)
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/launcher/util/shell.py", line 102, in execute_async_cmd
    raise ShellError("Failed to run command", e, cmd_list) from None
smartsim.error.errors.ShellError: Failed to run command

Command: aprun /usr/bin/echo hello
Error from shell: [Errno 2] No such file or directory: 'aprun'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "hello.py", line 10, in <module>
    exp.start(model)
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/experiment.py", line 103, in start
    self._control.start(manifest=start_manifest, block=block)
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 78, in start
    self._launch(manifest)
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 296, in _launch
    self._launch_step(*job_step)
  File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 365, in _launch_step
    raise SmartSimError(f"Job step {entity.name} failed to launch") from e
smartsim.error.errors.SmartSimError: Job step hello failed to launch

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't a bad error, but the AprunSettings won't necessarily have all of its arguments prepared correctly.

step = LocalStep(name, cwd, step_settings)
return step
raise SSUnsupportedError("RunSettings type not supported by Slurm")
except SSConfigError as e:
raise LauncherError("Step creation failed: " + str(e)) from None
Expand Down
106 changes: 106 additions & 0 deletions tests/on_wlm/test_base_settings_on_wlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import time

import pytest

from smartsim import Experiment, constants
from smartsim.settings.settings import RunSettings

"""
Test the launch and stop of models and ensembles using base
RunSettings while on WLM.
"""

# retrieved from pytest fixtures
if pytest.test_launcher not in pytest.wlm_options:
pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")


def test_model_on_wlm(fileutils, wlmutils):
    """Launch two models built from base run settings and expect completion.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    Both models run the ``sleep.py`` test script with ``--time=5`` and are
    started twice (blocking) to show that they can be restarted.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-base-settings-model-launch"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    exe_args = f"{script} --time=5"

    # Build both settings objects first, then the models, so each model
    # gets its own settings instance.
    all_settings = [
        wlmutils.get_base_run_settings("python", exe_args) for _ in range(2)
    ]
    models = [
        exp.create_model(model_name, path=test_dir, run_settings=model_settings)
        for model_name, model_settings in zip(("m1", "m2"), all_settings)
    ]

    # launch models twice to show that they can also be restarted
    for _ in range(2):
        exp.start(*models, block=True)
        reported = exp.get_status(*models)
        assert all(status == constants.STATUS_COMPLETED for status in reported)


def test_model_stop_on_wlm(fileutils, wlmutils):
    """Start two base-run-settings models without blocking, then stop them.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    After stopping, each model name must appear in the controller's completed
    jobs and each reported status must be ``STATUS_CANCELLED``.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-base-settings-model-stop"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    exe_args = f"{script} --time=5"
    all_settings = [
        wlmutils.get_base_run_settings("python", exe_args) for _ in range(2)
    ]
    models = [
        exp.create_model(model_name, path=test_dir, run_settings=model_settings)
        for model_name, model_settings in zip(("m1", "m2"), all_settings)
    ]

    # stop launched models
    exp.start(*models, block=False)
    time.sleep(2)  # pause between the non-blocking start and the stop request
    exp.stop(*models)

    for model in models:
        assert model.name in exp._control._jobs.completed
    reported = exp.get_status(*models)
    assert all(status == constants.STATUS_CANCELLED for status in reported)


def test_ensemble_on_wlm(fileutils, wlmutils):
    """Launch a two-replica ensemble from base run settings and expect completion.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    The ensemble is started twice (blocking) to show that it can be restarted.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-base-settings-ensemble-launch"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    base_settings = wlmutils.get_base_run_settings("python", f"{script} --time=5")
    ensemble = exp.create_ensemble(
        "ensemble", run_settings=base_settings, replicas=2
    )
    ensemble.set_path(test_dir)

    # launch ensemble twice to show that it can also be restarted
    for _ in range(2):
        exp.start(ensemble, block=True)
        reported = exp.get_status(ensemble)
        assert all(status == constants.STATUS_COMPLETED for status in reported)


def test_ensemble_stop_on_wlm(fileutils, wlmutils):
    """Start a two-replica base-run-settings ensemble, then stop it.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    After stopping, every reported status must be ``STATUS_CANCELLED`` and
    every ensemble member's name must appear in the controller's completed
    jobs.

    The experiment is named with a ``-stop`` suffix so that it does not share
    a name (and therefore a test directory) with ``test_ensemble_on_wlm``,
    matching the ``-launch``/``-stop`` naming used by the model tests.
    """
    launcher = wlmutils.get_test_launcher()
    if launcher not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-base-settings-ensemble-stop"
    # Reuse the launcher that was just validated instead of querying it again.
    exp = Experiment(exp_name, launcher=launcher)
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_base_run_settings("python", f"{script} --time=5")
    ensemble = exp.create_ensemble("ensemble", run_settings=settings, replicas=2)
    ensemble.set_path(test_dir)

    # stop launched ensemble
    exp.start(ensemble, block=False)
    time.sleep(2)  # pause between the non-blocking start and the stop request
    exp.stop(ensemble)
    statuses = exp.get_status(ensemble)
    assert all(stat == constants.STATUS_CANCELLED for stat in statuses)
    assert all(m.name in exp._control._jobs.completed for m in ensemble)
97 changes: 97 additions & 0 deletions tests/on_wlm/test_simple_base_settings_on_wlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import time

import pytest

from smartsim import Experiment, constants
from smartsim.settings.settings import RunSettings

"""
Test the launch and stop of simple models and ensembles that use base
RunSettings while on WLM.
"""

# retrieved from pytest fixtures
if pytest.test_launcher not in pytest.wlm_options:
pytestmark = pytest.mark.skip(reason="Not testing WLM integrations")


def test_simple_model_on_wlm(fileutils, wlmutils):
    """Launch one model with plain ``RunSettings`` and expect completion.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    The settings carry no run command or run arguments; the model is started
    twice (blocking) to show that it can be restarted.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-simplebase-settings-model-launch"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    plain_settings = RunSettings("python", exe_args=f"{script} --time=5")
    model = exp.create_model("m", path=test_dir, run_settings=plain_settings)

    # launch model twice to show that it can also be restarted
    for _ in range(2):
        exp.start(model, block=True)
        first_status = exp.get_status(model)[0]
        assert first_status == constants.STATUS_COMPLETED


def test_simple_model_stop_on_wlm(fileutils, wlmutils):
    """Start one plain-``RunSettings`` model without blocking, then stop it.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    After stopping, the model name must appear in the controller's completed
    jobs and its reported status must be ``STATUS_CANCELLED``.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-simplebase-settings-model-stop"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    plain_settings = RunSettings("python", exe_args=f"{script} --time=5")
    model = exp.create_model("m", path=test_dir, run_settings=plain_settings)

    # stop launched model
    exp.start(model, block=False)
    time.sleep(2)  # pause between the non-blocking start and the stop request
    exp.stop(model)

    finished_jobs = exp._control._jobs.completed
    assert model.name in finished_jobs
    first_status = exp.get_status(model)[0]
    assert first_status == constants.STATUS_CANCELLED


def test_simple_ensemble_on_wlm(fileutils, wlmutils):
    """Launch a single-replica ensemble with plain ``RunSettings``.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    The ensemble is started twice (blocking) to show that it can be restarted;
    the first reported status must be ``STATUS_COMPLETED`` each time.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-simple-base-settings-ensemble-launch"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    plain_settings = RunSettings("python", exe_args=f"{script} --time=5")
    ensemble = exp.create_ensemble(
        "ensemble", run_settings=plain_settings, replicas=1
    )
    ensemble.set_path(test_dir)

    # launch ensemble twice to show that it can also be restarted
    for _ in range(2):
        exp.start(ensemble, block=True)
        first_status = exp.get_status(ensemble)[0]
        assert first_status == constants.STATUS_COMPLETED


def test_simple_ensemble_stop_on_wlm(fileutils, wlmutils):
    """Start a single-replica plain-``RunSettings`` ensemble, then stop it.

    Skipped unless the configured test launcher is PBSPro, Slurm, or Cobalt.
    After stopping, the first reported status must be ``STATUS_CANCELLED`` and
    the first ensemble member's name must appear in the controller's completed
    jobs.
    """
    wlm_name = wlmutils.get_test_launcher()
    if wlm_name not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    exp_name = "test-simple-base-settings-ensemble-stop"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    plain_settings = RunSettings("python", exe_args=f"{script} --time=5")
    ensemble = exp.create_ensemble(
        "ensemble", run_settings=plain_settings, replicas=1
    )
    ensemble.set_path(test_dir)

    # stop launched ensemble
    exp.start(ensemble, block=False)
    time.sleep(2)  # pause between the non-blocking start and the stop request
    exp.stop(ensemble)

    first_status = exp.get_status(ensemble)[0]
    assert first_status == constants.STATUS_CANCELLED
    first_member = ensemble.models[0]
    assert first_member.name in exp._control._jobs.completed