Skip to content

Commit 4914e7e

Browse files
authored
Move pickle modification into main rank behind barrier (#1061)
* move pickle modification into main rank behind barrier * update * typo
1 parent 72c708e commit 4914e7e

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

src/fairchem/core/_cli_hydra.py

+13-7
Original file line numberDiff line numberDiff line change
@@ -188,19 +188,25 @@ def __init__(self) -> None:
188188

189189
def __call__(self, dict_config: DictConfig) -> None:
190190
self.config = dict_config
191-
if self.config.job.scheduler.mode == SchedulerType.SLURM:
192-
# modify the config metadata to add slurm info; this should be the only time we intentionally modify the metadata
193-
self.config.job.metadata.slurm_env = get_slurm_env()
194-
remove_runner_state_from_submission(
195-
dict_config.job.metadata.log_dir,
196-
self.config.job.metadata.slurm_env.slurm_id,
197-
)
191+
# modify the config metadata to add slurm info if it exists
192+
self.config.job.metadata.slurm_env = get_slurm_env()
198193

199194
setup_env_vars()
200195
setup_logging()
201196

202197
dist_config = map_job_config_to_dist_config(self.config.job)
203198
distutils.setup(dist_config)
199+
distutils.synchronize()
200+
if (
201+
distutils.is_master()
202+
and self.config.job.scheduler.mode == SchedulerType.SLURM
203+
):
204+
# this pickle file is shared across all processes, so it can only be modified on the main rank
205+
remove_runner_state_from_submission(
206+
dict_config.job.metadata.log_dir,
207+
self.config.job.metadata.slurm_env.slurm_id,
208+
)
209+
204210
if self.config.job.graph_parallel_group_size is not None:
205211
gp_utils.setup_graph_parallel_groups(
206212
self.config.job.graph_parallel_group_size,

0 commit comments

Comments
 (0)