Skip to content

Commit

Permalink
Refactor of OpenMPISettings and improved support for mpiexec (#242)
Browse files Browse the repository at this point in the history
mpiexec, mpirun, and orterun are all synonyms in the modern MPI
standards. The mpirunSettings and mpirunStep modules have been
refactored into the more generic mpiSettings and mpiStep. Within these
modules, the classes associated with mpiexec, mpirun, and orterun all
inherit from mpiexec (which historically was the first launch
command defined in the MPI standard).

One exception to this is that the Slurm workload manager
provides a wrapper around mpiexec that forwards only a subset of
the options supported by mpiexec to Slurm.
In general, this is a very limited set of options, so we choose
not to support it. Instead, we check whether mpiexec is actually
mpiexec.slurm and, if so, throw an UnsupportedError.

Another exception (which is now supported) is the Parallel
Application Launch Service (PALS) used on more recent HPE/Cray HPC
platforms, which also wraps mpiexec. Users of these platforms
should now use the new run settings class PalsMpiexecSettings
(defined in palsSettings) and PALS launchers for their workflows.

[ Committed by @ashao ]
[ Reviewed by @MattToast ]

Co-authored-by: Riccardo Balin <[email protected]>
  • Loading branch information
ashao and rickybalin committed Dec 6, 2022
1 parent 4c61fae commit ff2cf24
Show file tree
Hide file tree
Showing 18 changed files with 855 additions and 464 deletions.
6 changes: 4 additions & 2 deletions smartsim/_core/launcher/cobalt/cobaltLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from ...config import CONFIG
from ..launcher import WLMLauncher
from ..pbs.pbsCommands import qdel, qstat
from ..step import AprunStep, CobaltBatchStep, LocalStep, MpirunStep
from ..step import AprunStep, CobaltBatchStep, LocalStep, MpirunStep, MpiexecStep, OrterunStep
from ..stepInfo import CobaltStepInfo
from .cobaltParser import parse_cobalt_step_id, parse_cobalt_step_status, parse_qsub_out

Expand All @@ -62,7 +62,9 @@ def __init__(self):
AprunSettings: AprunStep,
CobaltBatchSettings: CobaltBatchStep,
MpirunSettings: MpirunStep,
RunSettings: LocalStep,
MpiexecSettings: MpiexecStep,
OrterunSettings: OrterunStep,
RunSettings: LocalStep
}

def run(self, step):
Expand Down
4 changes: 3 additions & 1 deletion smartsim/_core/launcher/lsf/lsfLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from ....status import STATUS_CANCELLED, STATUS_COMPLETED
from ...config import CONFIG
from ..launcher import WLMLauncher
from ..step import BsubBatchStep, JsrunStep, LocalStep, MpirunStep
from ..step import BsubBatchStep, JsrunStep, LocalStep, MpirunStep, MpiexecStep, OrterunStep
from ..stepInfo import LSFBatchStepInfo, LSFJsrunStepInfo
from .lsfCommands import bjobs, bkill, jskill, jslist
from .lsfParser import (
Expand Down Expand Up @@ -63,6 +63,8 @@ class LSFLauncher(WLMLauncher):
JsrunSettings: JsrunStep,
BsubBatchSettings: BsubBatchStep,
MpirunSettings: MpirunStep,
MpiexecSettings: MpiexecStep,
OrterunSettings: OrterunStep,
RunSettings: LocalStep,
}

Expand Down
12 changes: 11 additions & 1 deletion smartsim/_core/launcher/pbs/pbsLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,14 @@
from ....status import STATUS_CANCELLED, STATUS_COMPLETED
from ...config import CONFIG
from ..launcher import WLMLauncher
from ..step import AprunStep, LocalStep, MpirunStep, QsubBatchStep
from ..step import (
AprunStep,
LocalStep,
MpirunStep,
MpiexecStep,
OrterunStep,
QsubBatchStep
)
from ..stepInfo import PBSStepInfo
from .pbsCommands import qdel, qstat
from .pbsParser import parse_qstat_jobid, parse_step_id_from_qstat
Expand All @@ -57,8 +64,11 @@ class PBSLauncher(WLMLauncher):
supported_rs = {
AprunSettings: AprunStep,
QsubBatchSettings: QsubBatchStep,
MpiexecSettings: MpiexecStep,
MpirunSettings: MpirunStep,
OrterunSettings: OrterunStep,
RunSettings: LocalStep,
PalsMpiexecSettings: MpiexecStep
}

def run(self, step):
Expand Down
4 changes: 3 additions & 1 deletion smartsim/_core/launcher/slurm/slurmLauncher.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from ....status import STATUS_CANCELLED
from ...config import CONFIG
from ..launcher import WLMLauncher
from ..step import LocalStep, MpirunStep, SbatchStep, SrunStep
from ..step import LocalStep, MpirunStep, MpiexecStep, SbatchStep, SrunStep, OrterunStep
from ..stepInfo import SlurmStepInfo
from .slurmCommands import sacct, scancel, sstat
from .slurmParser import parse_sacct, parse_sstat_nodes, parse_step_id_from_sacct
Expand All @@ -59,6 +59,8 @@ class SlurmLauncher(WLMLauncher):
SrunSettings: SrunStep,
SbatchSettings: SbatchStep,
MpirunSettings: MpirunStep,
MpiexecSettings: MpiexecStep,
OrterunSettings: OrterunStep,
RunSettings: LocalStep,
}

Expand Down
2 changes: 1 addition & 1 deletion smartsim/_core/launcher/step/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
from .cobaltStep import CobaltBatchStep
from .localStep import LocalStep
from .lsfStep import BsubBatchStep, JsrunStep
from .mpirunStep import MpirunStep
from .mpiStep import MpirunStep, MpiexecStep, OrterunStep
from .pbsStep import QsubBatchStep
from .slurmStep import SbatchStep, SrunStep
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,9 @@

logger = get_logger(__name__)


class MpirunStep(Step):
class _BaseMPIStep(Step):
def __init__(self, name, cwd, run_settings):
"""Initialize a OpenMPI mpirun job step
"""Initialize a job step conforming to the MPI standard
:param name: name of the entity to be launched
:type name: str
Expand All @@ -46,74 +45,79 @@ def __init__(self, name, cwd, run_settings):
:param run_settings: run settings for entity
:type run_settings: RunSettings
"""

super().__init__(name, cwd)


self.run_settings = run_settings

self.alloc = None
if not self.run_settings.in_batch:
self._set_alloc()

_supported_launchers = [
"PBS",
"COBALT",
"SLURM",
"LSB"
]

@property
def _run_command(self):
return self.run_settings._run_command

def get_launch_cmd(self):
"""Get the command to launch this step
:return: launch command
:rtype: list[str]
"""
mpirun = self.run_settings.run_command
mpirun_cmd = [mpirun, "-wdir", self.cwd]
# add env vars to mpirun command
mpirun_cmd.extend(self.run_settings.format_env_vars())
mpi_cmd = [self._run_command, "--wdir", self.cwd]
# add env vars to mpi command
mpi_cmd.extend(self.run_settings.format_env_vars())

# add mpirun settings to command
mpirun_cmd.extend(self.run_settings.format_run_args())
# add mpi settings to command
mpi_cmd.extend(self.run_settings.format_run_args())

if self.run_settings.colocated_db_settings:
# disable cpu binding as the entrypoint will set that
# for the application and database process now
mpirun_cmd.extend(["--bind-to", "none"])
# mpi_cmd.extend(["--cpu-bind", "none"])

# Replace the command with the entrypoint wrapper script
bash = shutil.which("bash")
launch_script_path = self.get_colocated_launch_script()
mpirun_cmd += [bash, launch_script_path]
mpi_cmd += [bash, launch_script_path]

mpirun_cmd += self._build_exe()
mpi_cmd += self._build_exe()

# if its in a batch, redirect stdout to
# file in the cwd.
if self.run_settings.in_batch:
output = self.get_step_file(ending=".out")
mpirun_cmd += [">", output]
return mpirun_cmd
mpi_cmd += [">", output]
return mpi_cmd

def _set_alloc(self):
"""Set the id of the allocation
:raises AllocationError: allocation not listed or found
"""
if "PBS_JOBID" in os.environ: # cov-pbs
self.alloc = os.environ["PBS_JOBID"]
logger.debug(
f"Running on PBS allocation {self.alloc} gleaned from user environment"
)
elif "COBALT_JOBID" in os.environ: # cov-cobalt
self.alloc = os.environ["COBALT_JOBID"]
logger.debug(
f"Running on Cobalt allocation {self.alloc} gleaned from user environment"
)
elif "SLURM_JOBID" in os.environ: # cov-slurm
self.alloc = os.environ["SLURM_JOBID"]
logger.debug(
f"Running on Slurm allocation {self.alloc} gleaned from user environment"
)
elif "LSB_JOBID" in os.environ: # cov-lsf
self.alloc = os.environ["LSB_JOBID"]
logger.debug(
f"Running on Slurm allocation {self.alloc} gleaned from user environment"
)
else:
raise AllocationError(
"No allocation specified or found and not running in batch"
)

environment_keys = os.environ.keys()
for launcher in self._supported_launchers:
jobid_field = f'{launcher.upper()}_JOBID'
if jobid_field in environment_keys:
self.alloc = os.environ[jobid_field]
logger.debug(
f"Running on allocation {self.alloc} from {jobid_field}"
)
return

# If this function did not return above, no allocations were found
raise AllocationError(
"No allocation specified or found and not running in batch"
)

def _build_exe(self):
"""Build the executable for this step
Expand All @@ -129,7 +133,7 @@ def _build_exe(self):
return exe + args

def _make_mpmd(self):
"""Build mpirun (MPMD) executable"""
"""Build mpiexec (MPMD) executable"""
exe = self.run_settings.exe
args = self.run_settings.exe_args
cmd = exe + args
Expand All @@ -142,3 +146,55 @@ def _make_mpmd(self):

cmd = sh_split(" ".join(cmd))
return cmd

class MpiexecStep(_BaseMPIStep):
def __init__(self, name, cwd, run_settings):
"""Initialize an mpiexec job step
:param name: name of the entity to be launched
:type name: str
:param cwd: path to launch dir
:type cwd: str
:param run_settings: run settings for entity
:type run_settings: RunSettings
:param default_run_command: The default command to launch an MPI
application
:type default_run_command: str, optional
"""

super().__init__(name, cwd, run_settings)


class MpirunStep(_BaseMPIStep):
def __init__(self, name, cwd, run_settings):
"""Initialize an mpirun job step
:param name: name of the entity to be launched
:type name: str
:param cwd: path to launch dir
:type cwd: str
:param run_settings: run settings for entity
:type run_settings: RunSettings
:param default_run_command: The default command to launch an MPI
application
:type default_run_command: str, optional
"""

super().__init__(name, cwd, run_settings)

class OrterunStep(_BaseMPIStep):
def __init__(self, name, cwd, run_settings):
"""Initialize an orterun job step
:param name: name of the entity to be launched
:type name: str
:param cwd: path to launch dir
:type cwd: str
:param run_settings: run settings for entity
:type run_settings: RunSettings
:param default_run_command: The default command to launch an MPI
application
:type default_run_command: str, optional
"""

super().__init__(name, cwd, run_settings)
35 changes: 23 additions & 12 deletions smartsim/database/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
CobaltBatchSettings,
JsrunSettings,
MpirunSettings,
MpiexecSettings,
PalsMpiexecSettings,
OrterunSettings,
QsubBatchSettings,
SbatchSettings,
SrunSettings,
Expand Down Expand Up @@ -101,9 +104,9 @@ def __init__(
launcher = detect_launcher()

by_launcher = {
"slurm": ["srun", "mpirun"],
"pbs": ["aprun", "mpirun"],
"cobalt": ["aprun", "mpirun"],
"slurm": ["srun", "mpirun", "mpiexec"],
"pbs": ["aprun", "mpirun", "mpiexec"],
"cobalt": ["aprun", "mpirun", "mpiexec"],
"lsf": ["jsrun"],
"local": [None],
}
Expand Down Expand Up @@ -719,16 +722,24 @@ def _check_network_interface(self):

def _fill_reserved(self):
"""Fill the reserved batch and run arguments dictionaries"""
self._reserved_run_args[MpirunSettings] = [
"np",
"N",
"c",
"output-filename",
"n",
"wdir",
"wd",
"host",

mpi_like_settings = [
MpirunSettings,
MpiexecSettings,
OrterunSettings,
PalsMpiexecSettings
]
for settings in mpi_like_settings:
self._reserved_run_args[settings] = [
"np",
"N",
"c",
"output-filename",
"n",
"wdir",
"wd",
"host",
]
self._reserved_run_args[SrunSettings] = [
"nodes",
"N",
Expand Down
4 changes: 3 additions & 1 deletion smartsim/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from .base import RunSettings
from .cobaltSettings import CobaltBatchSettings
from .lsfSettings import BsubBatchSettings, JsrunSettings
from .mpirunSettings import MpiexecSettings, MpirunSettings, OrterunSettings
from .mpiSettings import MpiexecSettings, MpirunSettings, OrterunSettings
from .pbsSettings import QsubBatchSettings
from .slurmSettings import SbatchSettings, SrunSettings
from .palsSettings import PalsMpiexecSettings
from .containers import Container, Singularity

__all__ = [
Expand All @@ -19,6 +20,7 @@
"RunSettings",
"SbatchSettings",
"SrunSettings",
"PalsMpiexecSettings",
"Container",
"Singularity",
]
Loading

0 comments on commit ff2cf24

Please sign in to comment.