     macros,
     runopts,
 )
+from torchx.workspace.dir_workspace import DirWorkspace
 
+SLURM_JOB_DIRS = ".torchxslurmjobdirs"
 
 SLURM_STATES: Mapping[str, AppState] = {
     "BOOT_FAIL": AppState.FAILED,
@@ -166,6 +168,7 @@ class SlurmBatchRequest:
 
     cmd: List[str]
     replicas: Dict[str, SlurmReplicaRequest]
+    job_dir: Optional[str]
 
     def materialize(self) -> str:
         """
@@ -200,7 +203,7 @@ def materialize(self) -> str:
         return script
 
 
-class SlurmScheduler(Scheduler):
+class SlurmScheduler(Scheduler, DirWorkspace):
     """
     SlurmScheduler is a TorchX scheduling interface to slurm. TorchX expects
     that slurm CLI tools are locally installed and job accounting is enabled.
@@ -254,11 +257,8 @@ class SlurmScheduler(Scheduler):
             Partial support. SlurmScheduler will return job and replica
             status but does not provide the complete original AppSpec.
         workspaces: |
-            Partial support. Typical Slurm usage is from a shared NFS mount
-            so code will automatically be updated on the workers.
-            SlurmScheduler does not support programmatic patching via
-            WorkspaceScheduler.
-
+            If ``job_dir`` is specified, the DirWorkspace will create a new
+            isolated directory with a snapshot of the workspace.
     """
 
     def __init__(self, session_name: str) -> None:
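
For readers unfamiliar with ``DirWorkspace``: the "snapshot" in the new docstring amounts to copying the local workspace into the freshly created ``job_dir``. A minimal sketch of that idea follows; this is an illustration, not the actual ``DirWorkspace`` implementation, and ``snapshot_workspace`` is a hypothetical helper name:

    import os
    import shutil

    def snapshot_workspace(workspace: str, job_dir: str) -> None:
        # The job_dir runopt below states the directory "must not exist
        # and will be created", hence no exist_ok here.
        os.makedirs(job_dir)
        # Copy the workspace contents so later local edits cannot change
        # what the queued job runs.
        for entry in os.listdir(workspace):
            src = os.path.join(workspace, entry)
            dst = os.path.join(job_dir, entry)
            if os.path.isdir(src):
                shutil.copytree(src, dst)
            else:
                shutil.copy2(src, dst)
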
@@ -276,7 +276,9 @@ def run_opts(self) -> runopts:
             "time",
             type_=str,
             default=None,
-            help="The maximum time the job is allowed to run for.",
+            help='The maximum time the job is allowed to run for. Formats: \
+            "minutes", "minutes:seconds", "hours:minutes:seconds", "days-hours", \
+            "days-hours:minutes" or "days-hours:minutes:seconds"',
         )
         opts.add(
             "nomem",
@@ -304,25 +306,45 @@ def run_opts(self) -> runopts:
             type_=str,
             help="What events to mail users on.",
         )
+        opts.add(
+            "job_dir",
+            type_=str,
+            help="""The directory to place the job code and outputs. The
+            directory must not exist and will be created. To enable log
+            iteration, jobs will be tracked in ``.torchxslurmjobdirs``.
+            """,
+        )
         return opts
 
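
How the new runopt might be exercised end to end (a sketch; the app definition and paths are made up, and ``submit_dryrun`` is the public wrapper around the ``_submit_dryrun`` changed below):

    from torchx.schedulers.slurm_scheduler import create_scheduler
    from torchx.specs import AppDef, Role

    # Hypothetical single-role app; image and entrypoint are made up.
    app = AppDef(
        name="trainer",
        roles=[Role(name="worker", image="/home/me/proj", entrypoint="train.py")],
    )
    scheduler = create_scheduler(session_name="demo")
    # job_dir must not exist yet; after scheduling it will hold
    # torchx-sbatch.sh plus the slurm-<jobid>-<role>-<replica> log files.
    info = scheduler.submit_dryrun(app, cfg={"job_dir": "/mnt/shared/torchx/job1"})
    print(info)  # renders the materialized batch request for inspection
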
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
         req = dryrun_info.request
+        job_dir = req.job_dir
         with tempfile.TemporaryDirectory() as tmpdir:
             script = req.materialize()
-            path = os.path.join(tmpdir, "job.sh")
+            path = os.path.join(job_dir or tmpdir, "torchx-sbatch.sh")
 
             with open(path, "w") as f:
                 f.write(script)
 
-            cmd = req.cmd + [path]
+            cmd = req.cmd
+            if job_dir is not None:
+                cmd += [f"--chdir={job_dir}"]
+            cmd += [path]
 
             p = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
-            return p.stdout.decode("utf-8").strip()
+            job_id = p.stdout.decode("utf-8").strip()
+
+        if job_dir is not None:
+            _save_job_dir(job_id, job_dir)
+
+        return job_id
 
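
To make the command assembly above concrete, here is a minimal, self-contained sketch; the base command and paths are made up for illustration:

    from typing import List, Optional

    def build_cmd(base: List[str], script_path: str, job_dir: Optional[str]) -> List[str]:
        cmd = list(base)  # copy: avoids mutating the caller's list in place
        if job_dir is not None:
            # --chdir makes slurm resolve its relative output paths (the
            # slurm-<jobid>-*.out files) inside job_dir.
            cmd += [f"--chdir={job_dir}"]
        return cmd + [script_path]

    assert build_cmd(["sbatch", "--parsable"], "/mnt/j1/torchx-sbatch.sh", "/mnt/j1") == [
        "sbatch",
        "--parsable",
        "--chdir=/mnt/j1",
        "/mnt/j1/torchx-sbatch.sh",
    ]

Note that in the hunk above, ``cmd = req.cmd`` followed by ``cmd += [...]`` extends ``req.cmd`` in place; copying first, as in the sketch, would keep the request reusable across calls.
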
     def _submit_dryrun(
         self, app: AppDef, cfg: Mapping[str, CfgVal]
     ) -> AppDryRunInfo[SlurmBatchRequest]:
+        job_dir = cfg.get("job_dir")
+        assert job_dir is None or isinstance(job_dir, str), "job_dir must be str"
+
         replicas = {}
         for role in app.roles:
             for replica_id in range(role.num_replicas):
@@ -344,6 +366,7 @@ def _submit_dryrun(
         req = SlurmBatchRequest(
             cmd=cmd,
             replicas=replicas,
+            job_dir=job_dir,
         )
         return AppDryRunInfo(req, repr)
 
@@ -435,6 +458,9 @@ def log_iter(
         )
 
         log_file = f"slurm-{app_id}-{role_name}-{k}.{extension}"
+        job_dirs = _get_job_dirs()
+        if app_id in job_dirs:
+            log_file = os.path.join(job_dirs[app_id], log_file)
 
         return LogIterator(
             app_id, regex or ".*", log_file, self, should_tail=should_tail
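
The effect of the lookup, sketched with made-up values: when a job was submitted with a ``job_dir``, its logs are read from that directory rather than from the submit-time working directory.

    import os

    # A job submitted with job_dir="/mnt/j1" that received slurm id "42".
    job_dirs = {"42": "/mnt/j1"}
    log_file = "slurm-42-worker-0.out"
    if "42" in job_dirs:
        log_file = os.path.join(job_dirs["42"], log_file)
    assert log_file == "/mnt/j1/slurm-42-worker-0.out"
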
@@ -445,3 +472,24 @@ def create_scheduler(session_name: str, **kwargs: Any) -> SlurmScheduler:
     return SlurmScheduler(
         session_name=session_name,
     )
+
+
+def _save_job_dir(job_id: str, job_dir: str) -> None:
+    with open(SLURM_JOB_DIRS, "at") as f:
+        f.write(f"{job_id} = {job_dir}\n")
+
+
+def _get_job_dirs() -> Mapping[str, str]:
+    try:
+        with open(SLURM_JOB_DIRS, "rt") as f:
+            lines = f.readlines()
+    except FileNotFoundError:
+        return {}
+
+    out = {}
+    for line in lines:
+        first, _, second = line.partition("=")
+        if not first or not second:
+            continue
+        out[first.strip()] = second.strip()
+    return out
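
The tracking file is one ``job_id = job_dir`` line per submission, appended to ``.torchxslurmjobdirs`` in the current working directory, so ``log_iter`` has to run from the same directory to find it. A round-trip sketch of the two helpers above (ids and paths made up):

    # Appends to .torchxslurmjobdirs in the CWD:
    #   1234 = /mnt/shared/job1
    #   1235 = /mnt/shared/job2
    _save_job_dir("1234", "/mnt/shared/job1")
    _save_job_dir("1235", "/mnt/shared/job2")

    assert _get_job_dirs() == {
        "1234": "/mnt/shared/job1",
        "1235": "/mnt/shared/job2",
    }
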