From 5e74d5618ef1256411369296b6f363ddd3b55b05 Mon Sep 17 00:00:00 2001 From: trajepl Date: Mon, 1 Aug 2022 17:04:49 +0800 Subject: [PATCH 1/8] generate empty files for nebula load --- .../nebula_checkpoint_engine.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index b7ff5cdf9829..f95ad11a90d6 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -37,11 +37,14 @@ def create(self, tag): self.checkpoint = torch_nebula.Checkpoint(tag, -2) def save(self, state_dict, path: str): + log_dist(f"[Nebula] Create dummy files for loading.") + torch.save("", path) + tag = _get_tag_from_path(path) partititon_name = os.path.basename(path) - logger.info(f"[Nebula] Saving {partititon_name} under tag{tag}...") + logger.info(f"[Nebula] Saving {partititon_name} under tag {tag}...") self.checkpoint.save(partititon_name, state_dict) - logger.info(f"[Nebula] Saved {partititon_name} under tag{tag}.") + logger.info(f"[Nebula] Saved {partititon_name} under tag {tag}.") return None def load(self, path: str, map_location=None): @@ -58,21 +61,24 @@ def load(self, path: str, map_location=None): partititon_name = os.path.basename(path) logger.info( - f"[Nebula] Loading {path} under tag{tag} from {self.nebula_load_path}...") + f"[Nebula] Loading {path} under tag {tag} from nebula path {self.nebula_load_path}..." + ) checkpoint = None if tag is None: checkpoint = torch_nebula.get_latest_checkpoint( persist_path=self.nebula_load_path) - if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): - logger.warning(f"Unable to find latest valid checkpoint from Nebula!") - return None else: checkpoint = torch_nebula.get_checkpoint(tag=tag, persist_path=self.nebula_load_path) + + if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): + logger.warning(f"Unable to find latest valid checkpoint from Nebula!") + return None + partition = checkpoint.load(partititon_name, map_location=map_location) logger.info( - f"[Nebula] Loaded {path} under tag{tag} from {self.nebula_load_path}.") + f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") return partition def commit(self, tag): From abf04ba15cf3e35c7a967451884e551d45e01f12 Mon Sep 17 00:00:00 2001 From: trajepl Date: Fri, 5 Aug 2022 19:01:35 +0800 Subject: [PATCH 2/8] latest tag fix --- deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index f95ad11a90d6..b96900269433 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -65,7 +65,7 @@ def load(self, path: str, map_location=None): ) checkpoint = None - if tag is None: + if tag is None or tag == 'latest' or tag == 'latest_universal': checkpoint = torch_nebula.get_latest_checkpoint( persist_path=self.nebula_load_path) else: From a0ec298ab7eb6a58fc25d3a0183159e22d78c296 Mon Sep 17 00:00:00 2001 From: trajepl Date: Mon, 8 Aug 2022 12:26:01 +0800 Subject: [PATCH 3/8] update condition to 'in' --- deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index b96900269433..628030b776f7 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -65,7 +65,7 @@ def load(self, path: str, map_location=None): ) checkpoint = None - if tag is None or tag == 'latest' or tag == 'latest_universal': + if tag in (None, 'latest', 'latest_universal'): checkpoint = torch_nebula.get_latest_checkpoint( persist_path=self.nebula_load_path) else: From 2f288af5b6f033a3691523d711895db8a976eac2 Mon Sep 17 00:00:00 2001 From: chenguo Date: Tue, 9 Aug 2022 11:27:12 +0800 Subject: [PATCH 4/8] add some export envs for more platform --- deepspeed/nebula/constants.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/deepspeed/nebula/constants.py b/deepspeed/nebula/constants.py index 0e66fa8d1536..e21c8ce60303 100644 --- a/deepspeed/nebula/constants.py +++ b/deepspeed/nebula/constants.py @@ -79,7 +79,15 @@ 'MASTER_HOST', 'LOCAL_HOST', 'AZUREML_BLOB_ACCOUNT_NAME', - 'AZUREML_BLOB_ACCOUNT_KEY' + 'AZUREML_BLOB_ACCOUNT_KEY', + 'AZUREML_CR_AZUREML_CONTEXT', + 'AZUREML_CR_COMPUTE_CONTEXT', + 'AZUREML_COMMON_RUNTIME_USE_NEBULA_CAPABILITY', + 'DLTS_ROLE_IDX', + 'AZUREML_CR_NEBULA_CAPABILITY_CONFIG_OVERRIDE', + 'SINGULARITY_JOB', + 'AZUREML_CR_DISTRIBUTED_CONFIG', + 'AZUREML_NODE_COUNT' ] # ITP env files From 97e46bd86072fe2dae5eb7ceae59ce784a567e4c Mon Sep 17 00:00:00 2001 From: trajepl Date: Thu, 11 Aug 2022 19:04:17 +0800 Subject: [PATCH 5/8] fix: diff on ds_latest and nebula latest --- .../nebula_checkpoint_engine.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index 628030b776f7..2d739fdc513c 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -53,10 +53,9 @@ def load(self, path: str, map_location=None): if not self.enable_nebula_load and first_load_flag: self.tag_flag = tag logger.info( - f"[Nebula] Disable nebula load. Loading checkpoint from {path}...") + f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") partition = torch.load(path, map_location=map_location) - logger.info( - f"[Nebula] Disable nebula load. Loaded checkpoint from {path}...") + logger.info(f"[Nebula] Disable nebula load. Loaded checkpoint from {path} .") return partition partititon_name = os.path.basename(path) @@ -73,9 +72,18 @@ def load(self, path: str, map_location=None): persist_path=self.nebula_load_path) if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): - logger.warning(f"Unable to find latest valid checkpoint from Nebula!") - return None + logger.info( + f"Unable to find valid checkpoint tag:{tag} from Nebula, try to get latest checkpoint again!" + ) + checkpoint = torch_nebula.get_latest_checkpoint( + persist_path=self.nebula_load_path) + if checkpoint is None: + logger.warning( + f"Unable to find valid checkpoint from Nebula under tag:{tag}.") + return None + tag = checkpoint.tag + self.tag_flag = -1 partition = checkpoint.load(partititon_name, map_location=map_location) logger.info( f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.") From cbc0cda6004d43891384905dac31bfdb6038c666 Mon Sep 17 00:00:00 2001 From: trajepl Date: Tue, 16 Aug 2022 11:25:02 +0800 Subject: [PATCH 6/8] comments for loading back --- .../checkpoint_engine/nebula_checkpoint_engine.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index 2d739fdc513c..4cdd0856e7b7 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -65,6 +65,11 @@ def load(self, path: str, map_location=None): checkpoint = None if tag in (None, 'latest', 'latest_universal'): + # In some cases, there is the inconsistent tag between deepspeed metadata (latest file) + # and nebula metadata, will lead to the failure on loading with deepspeed tag. Then we + # will try to load the valid latest checkpoint from nebula(tier3 > tier1). So, in summary + # when met failure loading for given tag, the loading priority would be like: + # nebula tier3 latest > nebula tier1 latest. checkpoint = torch_nebula.get_latest_checkpoint( persist_path=self.nebula_load_path) else: @@ -73,11 +78,17 @@ def load(self, path: str, map_location=None): if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): logger.info( - f"Unable to find valid checkpoint tag:{tag} from Nebula, try to get latest checkpoint again!" + f"Unable to find valid checkpoint tag:{tag} from Nebula, try to get latest checkpoint again from nebula {self.nebula_load_path} path!" ) + # nebula tier3 latest checkpoint = torch_nebula.get_latest_checkpoint( persist_path=self.nebula_load_path) - if checkpoint is None: + if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''): + logger.info( + f"Unable to find latest checkpoint from Nebula tier3, try to get latest checkpoint again from nebula tier1 path!" + ) + # nebula tier1 latest + checkpoint = torch_nebula.get_latest_checkpoint() logger.warning( f"Unable to find valid checkpoint from Nebula under tag:{tag}.") return None From 3b04b7cb481f69f504f3cdac96af9278cc253a60 Mon Sep 17 00:00:00 2001 From: chenguo Date: Thu, 18 Aug 2022 17:51:36 +0800 Subject: [PATCH 7/8] revert to export more env vars --- deepspeed/nebula/constants.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/deepspeed/nebula/constants.py b/deepspeed/nebula/constants.py index e21c8ce60303..37fb681ac713 100644 --- a/deepspeed/nebula/constants.py +++ b/deepspeed/nebula/constants.py @@ -79,15 +79,6 @@ 'MASTER_HOST', 'LOCAL_HOST', 'AZUREML_BLOB_ACCOUNT_NAME', - 'AZUREML_BLOB_ACCOUNT_KEY', - 'AZUREML_CR_AZUREML_CONTEXT', - 'AZUREML_CR_COMPUTE_CONTEXT', - 'AZUREML_COMMON_RUNTIME_USE_NEBULA_CAPABILITY', - 'DLTS_ROLE_IDX', - 'AZUREML_CR_NEBULA_CAPABILITY_CONFIG_OVERRIDE', - 'SINGULARITY_JOB', - 'AZUREML_CR_DISTRIBUTED_CONFIG', - 'AZUREML_NODE_COUNT' ] # ITP env files From fc15dbdc81c001ba404cc77e126e68b4af203663 Mon Sep 17 00:00:00 2001 From: chenguo Date: Thu, 18 Aug 2022 17:54:01 +0800 Subject: [PATCH 8/8] revert to export more env vars --- deepspeed/nebula/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/nebula/constants.py b/deepspeed/nebula/constants.py index 37fb681ac713..0e66fa8d1536 100644 --- a/deepspeed/nebula/constants.py +++ b/deepspeed/nebula/constants.py @@ -79,6 +79,7 @@ 'MASTER_HOST', 'LOCAL_HOST', 'AZUREML_BLOB_ACCOUNT_NAME', + 'AZUREML_BLOB_ACCOUNT_KEY' ] # ITP env files