5151 "TIMEOUT" : AppState .FAILED ,
5252}
5353
54+ SBATCH_OPTIONS = {
55+ "partition" ,
56+ "time" ,
57+ }
5458
5559def _apply_app_id_env (s : str ) -> str :
5660 """
@@ -68,7 +72,6 @@ class SlurmReplicaRequest:
6872 """
6973
7074 name : str
71- dir : str
7275 entrypoint : str
7376 args : List [str ]
7477 srun_opts : Dict [str , str ]
@@ -79,21 +82,25 @@ class SlurmReplicaRequest:
    def from_role(
        cls, name: str, role: Role, cfg: Mapping[str, CfgVal]
    ) -> "SlurmReplicaRequest":
-        sbatch_opts = {k: str(v) for k, v in cfg.items() if v is not None}
+        sbatch_opts = {}
+        for k, v in cfg.items():
+            if v is None:
+                continue
+            if k in SBATCH_OPTIONS:
+                sbatch_opts[k] = str(v)
        sbatch_opts.setdefault("ntasks-per-node", "1")
        resource = role.resource

        if resource != NONE:
            if resource.cpu > 0:
                sbatch_opts.setdefault("cpus-per-task", str(resource.cpu))
-            if resource.memMB > 0:
+            if not cfg.get("nomem") and resource.memMB > 0:
                sbatch_opts.setdefault("mem", str(resource.memMB))
            if resource.gpu > 0:
                sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))

        return cls(
            name=name,
-            dir=role.image,
            entrypoint=role.entrypoint,
            args=list(role.args),
            sbatch_opts=sbatch_opts,
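
A minimal standalone sketch of the filtering that from_role now does: only keys listed in SBATCH_OPTIONS are forwarded to sbatch, resource-derived defaults are filled in afterwards, and a truthy "nomem" suppresses the memory request. The helper name and sample values below are illustrative, not part of the diff.

    # Illustrative only: mirrors the cfg filtering and resource defaults above.
    SBATCH_OPTIONS = {"partition", "time"}

    def build_sbatch_opts(cfg, cpu=0, mem_mb=0, gpu=0):
        # keep only whitelisted, non-None cfg keys
        opts = {k: str(v) for k, v in cfg.items() if v is not None and k in SBATCH_OPTIONS}
        opts.setdefault("ntasks-per-node", "1")
        if cpu > 0:
            opts.setdefault("cpus-per-task", str(cpu))
        if not cfg.get("nomem") and mem_mb > 0:
            opts.setdefault("mem", str(mem_mb))
        if gpu > 0:
            opts.setdefault("gpus-per-task", str(gpu))
        return opts

    print(build_sbatch_opts({"partition": "compute", "nomem": True}, cpu=8, mem_mb=16000, gpu=1))
    # {'partition': 'compute', 'ntasks-per-node': '1', 'cpus-per-task': '8', 'gpus-per-task': '1'}
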
@@ -110,8 +117,7 @@ def materialize(self) -> Tuple[List[str], List[str]]:
110117 f"--job-name={ self .name } " ,
111118 ] + [f"--{ key } ={ value } " for key , value in self .sbatch_opts .items ()]
112119 srun_args = (
113- [f"--chdir={ self .dir } " ]
114- + [f"--{ key } ={ value } " for key , value in self .srun_opts .items ()]
120+ [f"--{ key } ={ value } " for key , value in self .srun_opts .items ()]
115121 + [f"--export={ key } ={ value } " for key , value in self .env .items ()]
116122 )
117123
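
For reference, the shape of the argument lists that materialize builds once --chdir is gone (the values below are made up; only the f-string patterns come from the code above):

    name = "trainer-0"
    sbatch_opts = {"partition": "compute", "ntasks-per-node": "1"}
    srun_opts = {}
    env = {"FOO": "bar"}  # hypothetical replica environment

    sbatch_args = [f"--job-name={name}"] + [f"--{k}={v}" for k, v in sbatch_opts.items()]
    srun_args = [f"--{k}={v}" for k, v in srun_opts.items()] + [
        f"--export={k}={v}" for k, v in env.items()
    ]
    print(sbatch_args)  # ['--job-name=trainer-0', '--partition=compute', '--ntasks-per-node=1']
    print(srun_args)    # ['--export=FOO=bar']
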
@@ -172,10 +178,14 @@ class SlurmScheduler(Scheduler):

    Logs are written to the default slurm log file.

-    Any scheduler options passed to it are added as SBATCH arguments to each
+    Some of the config options passed to it are added as SBATCH arguments to each
    replica. See https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS for info
    on the arguments.

+    Slurm jobs inherit the currently active ``conda`` or ``virtualenv`` and run
+    in the current working directory. This matches the behavior of the
+    ``local_cwd`` scheduler.
+
    For more info see:

    * https://slurm.schedmd.com/sbatch.html
@@ -219,6 +229,12 @@ def run_opts(self) -> runopts:
            default=None,
            help="The maximum time the job is allowed to run for.",
        )
+        opts.add(
+            "nomem",
+            type_=bool,
+            default=False,
+            help="disables memory request to workaround https://github.com/aws/aws-parallelcluster/issues/2198",
+        )
        return opts

    def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
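
A hedged usage sketch for the new options: only "partition" and "time" are forwarded to sbatch directly, while "nomem" is consumed by from_role to drop the --mem request. The cfg mapping below is illustrative; how it is supplied (CLI or API) is outside this diff.

    # Illustrative scheduler cfg; keys correspond to run_opts() above.
    cfg = {
        "partition": "compute",
        "time": "2:00:00",  # becomes sbatch --time
        "nomem": True,      # omit --mem (see the linked parallelcluster issue)
    }
    # SlurmReplicaRequest.from_role(name, role, cfg) would then emit
    # --partition and --time as sbatch options and skip --mem entirely.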