alter launchers to pass env when starting a local step

add slurm-wlm test for local steps
CrayLabs · Jul 27, 2023 · 4179960 · 4179960
1 parent 211268e
commit 4179960
Show file tree

Hide file tree

Showing 9 changed files with 169 additions and 48 deletions.
diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py
@@ -207,15 +207,10 @@ def main(
 
     try:
         logger.debug(
-            (
-                "\n\nColocated database information\n"
-                "\n\tIP Address(es): {}"
-                "\n\tCommand: {}\n\n"
-                "\n\t# of Database CPUs: {}"
-            ),
-            ' '.join(ip_addresses + [lo_address]),
-            ' '.join(cmd),
-            db_cpus
+            "\n\nColocated database information\n"
+            f"\n\tIP Address(es): {' '.join(ip_addresses + [lo_address])}"
+            f"\n\tCommand: {' '.join(cmd)}\n\n"
+            f"\n\t# of Database CPUs: {db_cpus}"
         )
     except Exception as e:
         cleanup()

diff --git a/smartsim/_core/launcher/cobalt/cobaltLauncher.py b/smartsim/_core/launcher/cobalt/cobaltLauncher.py
@@ -116,17 +116,17 @@ def run(self, step: Step) -> t.Optional[str]:
         else:
             # aprun doesn't direct output for us.
             out, err = step.get_output_files()
+
+            # LocalStep.run_command omits env, include it here
+            passed_env = step.env if isinstance(step, LocalStep) else None
+
             # pylint: disable-next=consider-using-with
-            output = open(
-                out, "w+", encoding="utf-8"
-            )
+            output = open(out, "w+", encoding="utf-8")
             # pylint: disable-next=consider-using-with
-            error = open(
-                err, "w+", encoding="utf-8"
-            )
+            error = open(err, "w+", encoding="utf-8")
 
             task_id = self.task_manager.start_task(
-                cmd_list, step.cwd, out=output.fileno(), err=error.fileno()
+                cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno()
             )
 
         # if batch submission did not successfully retrieve job ID

diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py
@@ -108,12 +108,11 @@ def run(self, step: Step) -> str:
         error = open(err, "w+")
         cmd = step.get_launch_cmd()
 
-        env = {}
-        if isinstance(step, LocalStep):
-            env = step.env
+        # LocalStep.run_command omits env, include it here
+        passed_env = step.env if isinstance(step, LocalStep) else None
 
         task_id = self.task_manager.start_task(
-            cmd, step.cwd, env=env, out=output.fileno(), err=error.fileno()
+            cmd, step.cwd, env=passed_env, out=output.fileno(), err=error.fileno()
         )
         self.step_mapping.add(step.name, task_id=task_id, managed=False)
         return task_id

diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py
@@ -116,18 +116,18 @@ def run(self, step: Step) -> t.Optional[str]:
             step_id = self._get_lsf_step_id(step)
             logger.debug(f"Gleaned jsrun step id: {step_id} for {step.name}")
         else:  # isinstance(step, MpirunStep) or isinstance(step, LocalStep)
-            out, err = step.get_output_files()
             # mpirun and local launch don't direct output for us
+            out, err = step.get_output_files()
+
+            # LocalStep.run_command omits env, include it here
+            passed_env = step.env if isinstance(step, LocalStep) else None
+
             # pylint: disable-next=consider-using-with
-            output = open(
-                out, "w+", encoding="utf-8"
-            )
+            output = open(out, "w+", encoding="utf-8")
             # pylint: disable-next=consider-using-with
-            error = open(
-                err, "w+", encoding="utf-8"
-            )
+            error = open(err, "w+", encoding="utf-8")
             task_id = self.task_manager.start_task(
-                cmd_list, step.cwd, out=output.fileno(), err=error.fileno()
+                cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno()
             )
 
         self.step_mapping.add(step.name, step_id, task_id, step.managed)

diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py
@@ -108,18 +108,18 @@ def run(self, step: Step) -> t.Optional[str]:
                 step_id = out.strip()
                 logger.debug(f"Gleaned batch job id: {step_id} for {step.name}")
         else:
-            # aprun doesn't direct output for us.
+            # aprun/local doesn't direct output for us.
             out, err = step.get_output_files()
+
+            # LocalStep.run_command omits env, include it here
+            passed_env = step.env if isinstance(step, LocalStep) else None
+
             # pylint: disable-next=consider-using-with
-            output = open(
-                out, "w+", encoding="utf-8"
-            )
+            output = open(out, "w+", encoding="utf-8")
             # pylint: disable-next=consider-using-with
-            error = open(
-                err, "w+", encoding="utf-8"
-            )
+            error = open(err, "w+", encoding="utf-8")
             task_id = self.task_manager.start_task(
-                cmd_list, step.cwd, out=output.fileno(), err=error.fileno()
+                cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno()
             )
 
         # if batch submission did not successfully retrieve job ID

diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py
@@ -149,17 +149,22 @@ def run(self, step: Step) -> t.Optional[str]:
                 logger.debug(f"Gleaned batch job id: {step_id} for {step.name}")
 
         # Launch a in-allocation or on-allocation (if srun) command
+        elif isinstance(step, SrunStep):
+            task_id = self.task_manager.start_task(cmd_list, step.cwd)
         else:
-            if isinstance(step, SrunStep):
-                task_id = self.task_manager.start_task(cmd_list, step.cwd)
-            else:
-                # Mpirun doesn't direct output for us like srun does
-                out, err = step.get_output_files()
-                output = open(out, "w+", encoding="utf-8")  # pylint: disable=consider-using-with
-                error = open(err, "w+", encoding="utf-8")  # pylint: disable=consider-using-with
-                task_id = self.task_manager.start_task(
-                    cmd_list, step.cwd, out=output.fileno(), err=error.fileno()
-                )
+            # MPI/local steps don't direct output like slurm steps
+            out, err = step.get_output_files()
+
+            # LocalStep.run_command omits env, include it here
+            passed_env = step.env if isinstance(step, LocalStep) else None
+
+            # pylint: disable-next=consider-using-with
+            output = open(out, "w+", encoding="utf-8")
+            # pylint: disable-next=consider-using-with
+            error = open(err, "w+", encoding="utf-8")
+            task_id = self.task_manager.start_task(
+                cmd_list, step.cwd, passed_env, out=output.fileno(), err=error.fileno()
+            )
 
         if not step_id and step.managed:
             step_id = self._get_slurm_step_id(step)

diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py
@@ -106,8 +106,8 @@ def _set_alloc(self) -> None:
         elif "COBALT_JOBID" in os.environ:
             self.alloc = os.environ["COBALT_JOBID"]
             logger.debug(
-                "Running on Cobalt allocation {} gleaned from user environment",
-                self.alloc,
+                f"Running on Cobalt allocation {self.alloc} gleaned "
+                "from user environment"
             )
         else:
             raise AllocationError(

diff --git a/tests/on_wlm/test_local_step.py b/tests/on_wlm/test_local_step.py
@@ -0,0 +1,111 @@
+# BSD 2-Clause License
+#
+# Copyright (c) 2021-2023, Hewlett Packard Enterprise
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import os
+import uuid
+
+from smartsim import Experiment, status
+from smartsim.settings import RunSettings
+
+
+"""
+Test execution of local steps within the WLM
+"""
+
+def test_local_env_pass_implicit(fileutils) -> None:
+    """Ensure implicitly exported env is available to running task"""
+    exp_value = str(uuid.uuid4())
+    env_key = "test_local_env_pass_implicit"
+    os.environ[env_key] = exp_value
+
+    test_dir = fileutils.make_test_dir()
+    exp_dir = f"{test_dir}/exp"
+    os.makedirs(exp_dir)
+    script = fileutils.get_test_conf_path("check_env.py")
+
+    exp = Experiment("LRZ", exp_path=exp_dir, launcher="slurm")
+
+    exe_name = "python"
+    exe_args = [script, env_key]
+
+    # Create the RunSettings associated with the workload manager (WLM) run command
+    run_args = {"--nodes": 1, "--ntasks": 1, "--time": "00:01:00"}
+    # NOTE: not passing env_args into run_settings here, relying on --export=ALL default
+    settings = RunSettings(exe_name, exe_args, run_command="srun", run_args=run_args)
+    app_name = "echo_app"
+    app = exp.create_model(app_name, settings)
+
+    # generate the experiment structure and start the model
+    exp.generate(app, overwrite=True)
+    exp.start(app, block=True, summary=False)
+
+    assert env_key not in settings.env_vars
+    os.environ.pop(env_key)
+
+    with open(f"{exp_dir}/{app_name}/{app_name}.out") as app_outfile:
+        app_output = app_outfile.read()
+
+    # verify application was able to access the env var
+    assert f"{env_key}=={exp_value}" in app_output
+
+
+def test_local_env_pass_explicit(fileutils) -> None:
+    """Ensure explicitly exported env is available to running task"""
+    exp_value = str(uuid.uuid4())
+    env_key = "test_local_env_pass_explicit"
+
+    assert env_key not in os.environ
+
+    test_dir = fileutils.make_test_dir()
+    script = fileutils.get_test_conf_path("check_env.py")
+
+    exp_dir = f"{test_dir}/exp"
+    os.makedirs(exp_dir)
+    exp = Experiment("LRZ", exp_path=exp_dir, launcher="slurm")
+
+    exe_name = "python"
+    exe_args = [script, env_key]
+
+    # Create the RunSettings associated with the workload manager (WLM) run command
+    run_args = {"--nodes": 1, "--ntasks": 1, "--time": "00:01:00"}
+    env_vars = {env_key: exp_value}  # <-- explicitly passing a new env var to task
+    settings = RunSettings(
+        exe_name, exe_args, run_command="srun", run_args=run_args, env_vars=env_vars
+    )
+    app_name = "echo_app"
+    app = exp.create_model(app_name, settings)
+
+    # generate the experiment structure and start the model
+    exp.generate(app, overwrite=True)
+    exp.start(app, block=True, summary=False)
+
+    assert env_key in settings.env_vars
+
+    with open(f"{exp_dir}/{app_name}/{app_name}.out") as app_outfile:
+        app_output = app_outfile.read()
+
+    # verify application was able to access the env var
+    assert f"{env_key}=={exp_value}" in app_output
diff --git a/tests/test_configs/check_env.py b/tests/test_configs/check_env.py
@@ -0,0 +1,11 @@
+import os
+import sys
+
+var_name = sys.argv[1]
+env_value = os.environ.get(sys.argv[1], None)
+
+if env_value:
+    print(f"{var_name}=={env_value}")
+    sys.exit(0)
+
+print('env var not found')