-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add base RunSettings support for slurm, pbs, and cobalt #90
Changes from 6 commits
76cc7d9
2d2c978
7ef946e
85a17f0
3657c4d
6dc42a0
74090ac
b959088
0c543df
70c07d6
b0e3a23
e467aef
cae52fc
a22589c
5d18ad7
87167b9
37981ff
dbe90bd
ebe6326
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,10 +29,10 @@ | |
|
||
from ...constants import STATUS_CANCELLED | ||
from ...error import LauncherError, SSConfigError, SSUnsupportedError | ||
from ...settings import MpirunSettings, SbatchSettings, SrunSettings | ||
from ...settings import MpirunSettings, RunSettings, SbatchSettings, SrunSettings | ||
from ...utils import get_logger | ||
from ..launcher import WLMLauncher | ||
from ..step import MpirunStep, SbatchStep, SrunStep | ||
from ..step import LocalStep, MpirunStep, SbatchStep, SrunStep | ||
from ..stepInfo import SlurmStepInfo | ||
from .slurmCommands import sacct, scancel, sstat | ||
from .slurmParser import parse_sacct, parse_sstat_nodes, parse_step_id_from_sacct | ||
|
@@ -77,6 +77,9 @@ def create_step(self, name, cwd, step_settings): | |
if isinstance(step_settings, MpirunSettings): | ||
step = MpirunStep(name, cwd, step_settings) | ||
return step | ||
if isinstance(step_settings, RunSettings): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So let's say I'm on a Slurm system and I put in the [inline code snippet missing from this capture]. Then, if I launch, I would get this error: Traceback (most recent call last):
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 359, in _launch_step
job_id = self._launcher.run(job_step)
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/launcher/slurm/slurmLauncher.py", line 159, in run
task_id = self.task_manager.start_task(
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/launcher/taskManager.py", line 124, in start_task
proc = execute_async_cmd(cmd_list, cwd, env=env, out=out, err=err)
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/launcher/util/shell.py", line 102, in execute_async_cmd
raise ShellError("Failed to run command", e, cmd_list) from None
smartsim.error.errors.ShellError: Failed to run command
Command: aprun /usr/bin/echo hello
Error from shell: [Errno 2] No such file or directory: 'aprun'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "hello.py", line 10, in <module>
exp.start(model)
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/experiment.py", line 103, in start
self._control.start(manifest=start_manifest, block=block)
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 78, in start
self._launch(manifest)
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 296, in _launch
self._launch_step(*job_step)
File "/lus/cls01029/spartee/poseidon/SmartSim/smartsim/control/controller.py", line 365, in _launch_step
raise SmartSimError(f"Job step {entity.name} failed to launch") from e
smartsim.error.errors.SmartSimError: Job step hello failed to launch There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this isn't a bad error but then the |
||
step = LocalStep(name, cwd, step_settings) | ||
return step | ||
raise SSUnsupportedError("RunSettings type not supported by Slurm") | ||
except SSConfigError as e: | ||
raise LauncherError("Step creation failed: " + str(e)) from None | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
import time | ||
|
||
import pytest | ||
|
||
from smartsim import Experiment, constants | ||
from smartsim.settings.settings import RunSettings | ||
|
||
""" | ||
Test the launch and stop of models and ensembles using base | ||
RunSettings while on WLM. | ||
""" | ||
|
||
# retrieved from pytest fixtures | ||
if pytest.test_launcher not in pytest.wlm_options: | ||
pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") | ||
|
||
|
||
def test_model_on_wlm(fileutils, wlmutils):
    """Launch two models built from base run settings and expect completion.

    The pair of models is started (blocking) twice on the same experiment
    to show that models can be restarted after finishing.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-base-settings-model-launch"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    sleep_args = f"{sleep_script} --time=5"
    first_settings = wlmutils.get_base_run_settings("python", sleep_args)
    second_settings = wlmutils.get_base_run_settings("python", sleep_args)
    first_model = experiment.create_model(
        "m1", path=run_dir, run_settings=first_settings
    )
    second_model = experiment.create_model(
        "m2", path=run_dir, run_settings=second_settings
    )

    # two rounds: the second one demonstrates a restart of the same models
    for _round in range(2):
        experiment.start(first_model, second_model, block=True)
        for status in experiment.get_status(first_model, second_model):
            assert status == constants.STATUS_COMPLETED
|
||
|
||
def test_model_stop_on_wlm(fileutils, wlmutils):
    """Start two base-run-settings models without blocking, then stop them.

    After the stop, both model names must appear in the controller's
    completed jobs and both statuses must be ``STATUS_CANCELLED``.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-base-settings-model-stop"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    sleep_args = f"{sleep_script} --time=5"
    first_settings = wlmutils.get_base_run_settings("python", sleep_args)
    second_settings = wlmutils.get_base_run_settings("python", sleep_args)
    first_model = experiment.create_model(
        "m1", path=run_dir, run_settings=first_settings
    )
    second_model = experiment.create_model(
        "m2", path=run_dir, run_settings=second_settings
    )

    # non-blocking start, brief pause, then stop the launched models
    experiment.start(first_model, second_model, block=False)
    time.sleep(2)
    experiment.stop(first_model, second_model)

    for model in (first_model, second_model):
        assert model.name in experiment._control._jobs.completed
    for status in experiment.get_status(first_model, second_model):
        assert status == constants.STATUS_CANCELLED
|
||
|
||
def test_ensemble_on_wlm(fileutils, wlmutils):
    """Launch a two-replica ensemble from base run settings and expect completion.

    The ensemble is started (blocking) twice on the same experiment to
    show that it can be restarted after finishing.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-base-settings-ensemble-launch"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = wlmutils.get_base_run_settings(
        "python", f"{sleep_script} --time=5"
    )
    replicas = experiment.create_ensemble(
        "ensemble", run_settings=run_settings, replicas=2
    )
    replicas.set_path(run_dir)

    # two rounds: the second one demonstrates a restart of the same ensemble
    for _round in range(2):
        experiment.start(replicas, block=True)
        for status in experiment.get_status(replicas):
            assert status == constants.STATUS_COMPLETED
|
||
|
||
def test_ensemble_stop_on_wlm(fileutils, wlmutils):
    """Start a two-replica ensemble without blocking, then stop it.

    After the stop, every member's status must be ``STATUS_CANCELLED`` and
    every member's name must appear in the controller's completed jobs.
    """
    launcher = wlmutils.get_test_launcher()
    if launcher not in ["pbs", "slurm", "cobalt"]:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    # Use a "-stop" name of its own: this test previously reused the launch
    # test's "test-base-settings-ensemble-launch", so both tests passed the
    # same experiment name to Experiment and to make_test_dir.
    exp_name = "test-base-settings-ensemble-stop"
    exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher())
    test_dir = fileutils.make_test_dir(exp_name)

    script = fileutils.get_test_conf_path("sleep.py")
    settings = wlmutils.get_base_run_settings("python", f"{script} --time=5")
    ensemble = exp.create_ensemble("ensemble", run_settings=settings, replicas=2)
    ensemble.set_path(test_dir)

    # stop launched ensemble
    exp.start(ensemble, block=False)
    time.sleep(2)
    exp.stop(ensemble)
    statuses = exp.get_status(ensemble)
    assert all(stat == constants.STATUS_CANCELLED for stat in statuses)
    assert all(m.name in exp._control._jobs.completed for m in ensemble)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import time | ||
|
||
import pytest | ||
|
||
from smartsim import Experiment, constants | ||
from smartsim.settings.settings import RunSettings | ||
|
||
""" | ||
Test the launch and stop of simple models and ensembles that use base | ||
RunSettings while on WLM. | ||
""" | ||
|
||
# retrieved from pytest fixtures | ||
if pytest.test_launcher not in pytest.wlm_options: | ||
pytestmark = pytest.mark.skip(reason="Not testing WLM integrations") | ||
|
||
|
||
def test_simple_model_on_wlm(fileutils, wlmutils):
    """Launch one model that uses a plain ``RunSettings`` and expect completion.

    The model is started (blocking) twice on the same experiment to show
    that it can be restarted after finishing.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-simplebase-settings-model-launch"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = RunSettings("python", exe_args=f"{sleep_script} --time=5")
    model = experiment.create_model("m", path=run_dir, run_settings=run_settings)

    # two rounds: the second one demonstrates a restart of the same model
    for _round in range(2):
        experiment.start(model, block=True)
        statuses = experiment.get_status(model)
        assert statuses[0] == constants.STATUS_COMPLETED
|
||
|
||
def test_simple_model_stop_on_wlm(fileutils, wlmutils):
    """Start one plain-``RunSettings`` model without blocking, then stop it.

    After the stop, the model name must appear in the controller's
    completed jobs and its status must be ``STATUS_CANCELLED``.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-simplebase-settings-model-stop"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = RunSettings("python", exe_args=f"{sleep_script} --time=5")
    model = experiment.create_model("m", path=run_dir, run_settings=run_settings)

    # non-blocking start, brief pause, then stop the launched model
    experiment.start(model, block=False)
    time.sleep(2)
    experiment.stop(model)

    assert model.name in experiment._control._jobs.completed
    statuses = experiment.get_status(model)
    assert statuses[0] == constants.STATUS_CANCELLED
|
||
|
||
def test_simple_ensemble_on_wlm(fileutils, wlmutils):
    """Launch a single-replica ensemble using a plain ``RunSettings``.

    The ensemble is started (blocking) twice on the same experiment to
    show that it can be restarted; the first reported status must be
    ``STATUS_COMPLETED`` each time.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-simple-base-settings-ensemble-launch"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = RunSettings("python", exe_args=f"{sleep_script} --time=5")
    replicas = experiment.create_ensemble(
        "ensemble", run_settings=run_settings, replicas=1
    )
    replicas.set_path(run_dir)

    # two rounds: the second one demonstrates a restart of the same ensemble
    for _round in range(2):
        experiment.start(replicas, block=True)
        statuses = experiment.get_status(replicas)
        assert statuses[0] == constants.STATUS_COMPLETED
|
||
|
||
def test_simple_ensemble_stop_on_wlm(fileutils, wlmutils):
    """Start a single-replica plain-``RunSettings`` ensemble, then stop it.

    After the stop, the first reported status must be ``STATUS_CANCELLED``
    and the first member's name must appear in the controller's completed
    jobs.
    """
    supported_wlms = ("pbs", "slurm", "cobalt")
    if wlmutils.get_test_launcher() not in supported_wlms:
        pytest.skip("Test only runs on systems with PBSPro, Slurm, or Cobalt as WLM")

    experiment_name = "test-simple-base-settings-ensemble-stop"
    experiment = Experiment(experiment_name, launcher=wlmutils.get_test_launcher())
    run_dir = fileutils.make_test_dir(experiment_name)

    sleep_script = fileutils.get_test_conf_path("sleep.py")
    run_settings = RunSettings("python", exe_args=f"{sleep_script} --time=5")
    replicas = experiment.create_ensemble(
        "ensemble", run_settings=run_settings, replicas=1
    )
    replicas.set_path(run_dir)

    # non-blocking start, brief pause, then stop the launched ensemble
    experiment.start(replicas, block=False)
    time.sleep(2)
    experiment.stop(replicas)

    statuses = experiment.get_status(replicas)
    assert statuses[0] == constants.STATUS_CANCELLED
    first_member = replicas.models[0]
    assert first_member.name in experiment._control._jobs.completed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason why we don't specify nodes and time for `pbs` and `cobalt` base settings?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`aprun` doesn't have the concept of `nodes`. It's just processing elements.