Merged

Commits (42)
25e04b3
Merge pull request #1 from microsoft/master
trajepl Jun 23, 2022
d88e591
enable checkpoint engine
trajepl Jun 30, 2022
07e59d6
separated nebula config
trajepl Jul 11, 2022
4cbdfe6
add __init__.py for nebula importing
trajepl Jul 11, 2022
1f2f40c
linter fix
trajepl Jul 11, 2022
d900145
fix: ds_config is None
trajepl Jul 11, 2022
b44832b
fix: ds config
trajepl Jul 11, 2022
e4a57bd
fix: get sd loader fix
trajepl Jul 11, 2022
85e52f5
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 11, 2022
d70bcd1
align the API with torch raw code
trajepl Jul 12, 2022
4c50308
Merge branch 'trajepl/nebula_ckpt_engine' of github.com:trajepl/DeepS…
trajepl Jul 12, 2022
5d987a0
linter fix
trajepl Jul 12, 2022
a04a81a
remove duplicate tag params
trajepl Jul 12, 2022
21b70bd
Merge branch 'master' into trajepl/nebula_ckpt_engine
mrwyattii Jul 18, 2022
81ccd07
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 19, 2022
4b42bc2
make checkpoint_engine as required args
trajepl Jul 21, 2022
22f8c2a
Merge branch 'trajepl/nebula_ckpt_engine' of github.com:trajepl/DeepS…
trajepl Jul 21, 2022
bbd2bde
Merge pull request #2 from microsoft/master
trajepl Jul 21, 2022
d9298cf
Merge branch 'master' of github.com:trajepl/DeepSpeed into trajepl/ne…
trajepl Jul 21, 2022
1906398
fix args
trajepl Jul 21, 2022
432e7c6
extract parameters out to config
trajepl Jul 21, 2022
7dbb6d8
fix: load state dict
trajepl Jul 21, 2022
e912e31
separate load engine
trajepl Jul 21, 2022
7fc279b
linter fix
trajepl Jul 22, 2022
5ebacc6
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 22, 2022
c70c818
extract checkpoint engine to abstract class
trajepl Jul 22, 2022
e6dd794
linter fix
trajepl Jul 22, 2022
3788ada
Merge branch 'trajepl/nebula_ckpt_engine' of github.com:trajepl/DeepS…
trajepl Jul 22, 2022
1efd2ce
construct function args fix
trajepl Jul 22, 2022
dce0fb5
add docs for dev/customers
trajepl Jul 25, 2022
bb5bb7c
linter fix
trajepl Jul 25, 2022
0c21dc2
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 25, 2022
3e8c238
remove load engine
trajepl Jul 26, 2022
a5c8897
print->log_dist
trajepl Jul 26, 2022
44d687b
linter fix
trajepl Jul 26, 2022
82ad297
add tag flag to distinguish the loading order
trajepl Jul 26, 2022
cf12a8d
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 26, 2022
422221b
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 27, 2022
340de11
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 27, 2022
7f3f14c
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 27, 2022
5071091
Merge branch 'master' into trajepl/nebula_ckpt_engine
jeffra Jul 27, 2022
1b43df5
Merge branch 'master' into trajepl/nebula_ckpt_engine
tjruwase Jul 27, 2022
11 changes: 8 additions & 3 deletions deepspeed/inference/engine.py
@@ -9,6 +9,7 @@

 from torch.nn.modules import Module
 from packaging import version as pkg_version
+from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine

 from ..runtime.state_dict_factory import SDLoaderFactory
 from ..runtime.weight_quantizer import WeightQuantization
@@ -92,6 +93,7 @@ def __init__(self,
         self.expert_mp_group = expert_mp_group
         self.enable_cuda_graph = enable_cuda_graph
         self.cuda_graph_created = False
+        self.checkpoint_engine = TorchCheckpointEngine()
         self._init_quantization_setting(quantization_setting)

         if enable_cuda_graph:
@@ -376,9 +378,10 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None):
                     tag = fd.read().strip()

             ckpt_list = self._get_all_ckpt_names(load_dir, tag)
-            sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list)
+            sd_loader = SDLoaderFactory.get_sd_loader(ckpt_list, self.checkpoint_engine)
         else:
-            sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir)
+            sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir,
+                                                           self.checkpoint_engine)

         if type(sd_loader) is list:
             self.sd = torch.load(sd_loader[0], map_location='cpu')
@@ -416,10 +419,12 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None):
                 state_dict=checkpoint[self._choose_module_key(checkpoint)],
                 old_moe_load=old_moe_load,
                 model=self.module,
-                mpu=self.mpu)
+                mpu=self.mpu,
+                checkpoint_engine=self.checkpoint_engine)

             self.module.load_state_dict(
                 state_dict=checkpoint[self._choose_module_key(checkpoint)],
+                checkpoint_engine=self.checkpoint_engine,
                 strict=load_module_strict)

     def _choose_module_key(self, sd):
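The `TorchCheckpointEngine` imported above is not shown in this diff; the sketch below is only an illustration of what a torch-backed engine implementing the `CheckpointEngine` interface might look like (delegating to `torch.save`/`torch.load`, with `create`/`commit` as logging no-ops), not the actual file added by the PR.

```python
# Illustrative sketch only -- the real torch_checkpoint_engine.py is not part of the diff shown here.
import torch
from deepspeed.runtime.checkpoint_engine.checkpoint_engine import CheckpointEngine
from deepspeed.utils import logger


class TorchCheckpointEngine(CheckpointEngine):
    def __init__(self, config_params=None):
        super().__init__(config_params)

    def create(self, tag):
        # Nothing to prepare for plain torch; just log the tag.
        logger.info(f"[Torch] Checkpoint {tag} is about to be saved!")

    def save(self, state_dict, path: str):
        # Plain torch persists the file as soon as save() returns.
        torch.save(state_dict, path)

    def load(self, path: str, map_location=None):
        return torch.load(path, map_location=map_location)

    def commit(self, tag):
        # Persistence is implicit for torch, so commit is only a signal/log.
        logger.info(f"[Torch] Checkpoint {tag} is ready now!")
        return True
```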
12 changes: 12 additions & 0 deletions deepspeed/launcher/launch.py
@@ -19,6 +19,7 @@
 from argparse import ArgumentParser, REMAINDER

 from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
+from ..nebula.constants import DLTS_POD_ENV_PATH
 from ..utils import logger

 PID_FILE_BASEPATH = "/tmp"
@@ -143,6 +144,17 @@ def main():
         with open(pid_file, 'w') as fd:
             fd.write(f"{launcher_pid}")

+    if os.path.exists(DLTS_POD_ENV_PATH):
+        with open(DLTS_POD_ENV_PATH) as file:
+            lines = file.readlines()
+            lines = [line.rstrip() for line in lines]
+            for line in lines:
+                if line.startswith('export FC_TASKROLE_NAME') or line.startswith(
+                        'export FC_TASK_INDEX'):
+                    key_val = line.split()[1]
+                    key, val = key_val.split('=')
+                    current_env[key] = val
+
     processes = []
     cmd = []
     for local_rank in range(0, num_local_procs):
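For context, the added block simply promotes selected `export` lines from the ITP pod env file into the launcher's environment. A minimal sketch of that parsing, using a hypothetical pod.env line (the value `worker` is made up):

```python
# Minimal sketch of the parsing above; the real file lives at DLTS_POD_ENV_PATH
# and its contents depend on the ITP cluster.
current_env = {}
line = "export FC_TASKROLE_NAME=worker"   # hypothetical pod.env line

if line.startswith('export FC_TASKROLE_NAME') or line.startswith('export FC_TASK_INDEX'):
    key_val = line.split()[1]        # -> "FC_TASKROLE_NAME=worker"
    key, val = key_val.split('=')    # -> ("FC_TASKROLE_NAME", "worker")
    current_env[key] = val           # forwarded into each launched rank's environment
```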
4 changes: 3 additions & 1 deletion deepspeed/launcher/runner.py
@@ -20,12 +20,14 @@
 from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner
 from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER
 from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
+from ..nebula.constants import NEBULA_EXPORT_ENVS
 from ..utils import logger

 from ..autotuning import Autotuner

 DLTS_HOSTFILE = "/job/hostfile"
-EXPORT_ENVS = ["NCCL", "PYTHON", "MV2", "UCX"]
+EXPORT_ENVS = ['NCCL', 'PYTHON', 'MV2', 'UCX']
+EXPORT_ENVS += NEBULA_EXPORT_ENVS
 DEEPSPEED_ENVIRONMENT_NAME = ".deepspeed_env"
 DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
 PDSH_MAX_FAN_OUT = 1024
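A rough illustration of how the extended list would typically be used, assuming the runner treats `EXPORT_ENVS` entries as prefixes when deciding which environment variables to forward to worker nodes (the actual matching code lives elsewhere in runner.py and is not part of this diff):

```python
# Hedged sketch: forwarding environment variables whose names start with an
# EXPORT_ENVS entry. NEBULA_EXPORT_ENVS below is a small illustrative subset.
import os

EXPORT_ENVS = ['NCCL', 'PYTHON', 'MV2', 'UCX']
NEBULA_EXPORT_ENVS = ['DLTS_JOB_ID', 'NEBULA_PERSISTENT_STORAGE_PATH']
EXPORT_ENVS += NEBULA_EXPORT_ENVS

exports = {
    var: val
    for var, val in os.environ.items()
    if any(var.startswith(name) for name in EXPORT_ENVS)
}
# 'exports' is what a multi-node launcher would propagate to every node in the job.
```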
Empty file added deepspeed/nebula/__init__.py
53 changes: 53 additions & 0 deletions deepspeed/nebula/config.py
@@ -0,0 +1,53 @@
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""

from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
from deepspeed.nebula.constants import *


class DeepSpeedNebulaConfig(DeepSpeedConfigObject):
    def __init__(self, param_dict):
        super(DeepSpeedNebulaConfig, self).__init__()

        self.enabled = None
        self.persistent_storage_path = None
        self.persistent_time_interval = None
        self.num_of_version_in_retention = None
        self.enable_nebula_load = None

        if NEBULA in param_dict.keys():
            nebula_dict = param_dict[NEBULA]
        else:
            nebula_dict = {}

        self._initialize(nebula_dict)

    def _initialize(self, nebula_dict):
        self.enabled = get_scalar_param(nebula_dict,
                                        NEBULA_ENABLED,
                                        NEBULA_ENABLED_DEFAULT)

        self.load_path = get_scalar_param(nebula_dict,
                                          NEBULA_LOAD_PATH,
                                          NEBULA_LOAD_PATH_DEFAULT)

        self.enable_nebula_load = get_scalar_param(nebula_dict,
                                                   NEBULA_ENABLE_NEBULA_LOAD,
                                                   NEBULA_ENABLE_NEBULA_LOAD_DEFAULT)

        self.persistent_storage_path = get_scalar_param(
            nebula_dict,
            NEBULA_PERSISTENT_STORAGE_PATH,
            NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT)

        self.persistent_time_interval = get_scalar_param(
            nebula_dict,
            NEBULA_PERSISTENT_TIME_INTERVAL,
            NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT)

        self.num_of_version_in_retention = get_scalar_param(
            nebula_dict,
            NEBULA_NUM_OF_VERSION_IN_RETENTION,
            NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT)
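A small usage sketch for this config object; the dictionary mirrors the `NEBULA_FORMAT` example from constants.py, and where the `nebula` block is nested inside a real ds_config is not shown in this diff:

```python
from deepspeed.nebula.config import DeepSpeedNebulaConfig

# Hypothetical param_dict containing a "nebula" section.
param_dict = {
    "nebula": {
        "enabled": True,
        "persistent_storage_path": "/foo/bar",
        "persistent_time_interval": 100,
        "num_of_version_in_retention": 2,
        "enable_nebula_load": True,
    }
}

nebula_config = DeepSpeedNebulaConfig(param_dict)
print(nebula_config.enabled)                      # True
print(nebula_config.persistent_storage_path)      # /foo/bar
print(nebula_config.num_of_version_in_retention)  # 2
```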
86 changes: 86 additions & 0 deletions deepspeed/nebula/constants.py
@@ -0,0 +1,86 @@
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""

#########################################
# nebula
#########################################
# Nebula. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
NEBULA_FORMAT = '''
nebula should be enabled as:
"session_params": {
  "nebula": {
    "enabled": true,
    "persistent_storage_path": "/foo/bar",
    "persistent_time_interval": 100,
    "num_of_version_in_retention": 2,
    "enable_nebula_load": true
  }
}
'''

NEBULA = "nebula"

NEBULA_ENABLED = "enabled"
NEBULA_ENABLED_DEFAULT = False

# There is a case where the customer wants to load a checkpoint saved
# by raw torch. Nebula cannot load a torch checkpoint directly, because
# the two use different folder structures, which creates a gap for
# loading (the saved data are exactly the same in bytes for torch and
# nebula).
# In this case, we must disable nebula load and use raw torch load.
# The customer can just set NEBULA_ENABLE_NEBULA_LOAD to False, and then
# use the original deepspeed way to load, i.e. set the value of "--load".
NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load"
NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True

# When you want to resume the previous checkpoint saved by nebula,
# you can set NEBULA_LOAD_PATH as the parent folder of checkpoint.
# If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH
# will be the default path to load.
NEBULA_LOAD_PATH = "nebula_load_path"
NEBULA_LOAD_PATH_DEFAULT = None

# Nebula will save the checkpoint under NEBULA_LOAD_PATH in an
# asynchronous way.
NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path"
NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None

# Time interval to trigger the nebula persistence.
NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval"
NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100

# Number of checkpoint versions to keep in memory. For example, if the
# value is 2 and checkpoints 1 and 2 are currently held, then when
# checkpoint 3 arrives, checkpoint 1 will be removed once it has been
# persisted to disk.
NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2

# Nebula envs
NEBULA_EXPORT_ENVS = [
'DLTS_JOB_ID',
'DLTS_NUM_WORKER',
'NEBULA_PERSISTENT_STORAGE_PATH',
'NEBULA_PERSISTENT_TIME_INTERVAL',
'AML_RUN_ID',
'AZUREML_RUN_TOKEN',
'AZUREML_WORKSPACE_SCOPE',
'AZUREML_EXPERIMENT_SCOPE',
'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT',
'AZUREML_RUN_ID',
'NEBULA_MEMORY_BUFFER_SIZE',
'AZUREML_PARAMETER_ITPJOB_NAME',
'FC_TASKROLE_NAME',
'FC_TASK_INDEX',
'MASTER_HOST',
'LOCAL_HOST',
'AZUREML_BLOB_ACCOUNT_NAME',
'AZUREML_BLOB_ACCOUNT_KEY'
]

# ITP env files
DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env'
37 changes: 37 additions & 0 deletions deepspeed/runtime/checkpoint_engine/README.md
@@ -0,0 +1,37 @@
# Checkpoint Engine


The `CheckpointEngine` is designed to modularize checkpoint serialization. In this way, we can simply replace or refine the checkpoint serialization methods.

### Interface for `CheckpointEngine`

Basically, for checkpoint management (save/load by deepspeed with a given tag), the `CheckpointEngine` will:

1. Make preliminaries ready by calling `create(tag)`. For `torch`, we can just log some extra info, as `torch` can directly call `save/load` without any other preparation.

2. After `create(tag)`, deepspeed can call `save/load` to persist files to disk/memory/etc.

3. When all the files for a tag are ready, the deepspeed engine will call `commit()` to tell the checkpoint engine that the current checkpoint is complete. For original torch, it also plays the role of a logger.


```python
class CheckpointEngine(object):
    # init checkpoint engine for save/load
    def __init__(self, config_params=None):
        pass

    def create(self, tag):
        # create checkpoint on the given tag for save/load.
        pass

    def save(self, state_dict, path: str):
        pass

    def load(self, path: str, map_location=None):
        pass

    def commit(self, tag):
        # to tell checkpoint services whether all files are ready.
        pass
```
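As a rough illustration of the three-step flow above (the tag, directory, and state dictionaries below are placeholders, and `TorchCheckpointEngine` is assumed to follow the interface exactly):

```python
# Sketch of the save-side call sequence; real orchestration happens inside
# the deepspeed engine's save_checkpoint path.
import os
from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine

engine = TorchCheckpointEngine()
tag = "global_step100"                        # placeholder tag
save_dir = f"/tmp/checkpoints/{tag}"          # placeholder directory
os.makedirs(save_dir, exist_ok=True)
states = {"model_states.pt": {"weights": [1, 2, 3]}, "optim_states.pt": {"step": 100}}

engine.create(tag)                                        # 1. make preliminaries ready
for name, state in states.items():
    engine.save(state, os.path.join(save_dir, name))      # 2. persist each file for this tag
engine.commit(tag)                                        # 3. signal the checkpoint is complete
```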
Empty file added deepspeed/runtime/checkpoint_engine/__init__.py
19 changes: 19 additions & 0 deletions deepspeed/runtime/checkpoint_engine/checkpoint_engine.py
@@ -0,0 +1,19 @@
class CheckpointEngine(object):

    # init checkpoint engine for save/load
    def __init__(self, config_params=None):
        pass

    def create(self, tag):
        # create checkpoint on the given tag for save/load.
        pass

    def save(self, state_dict, path: str):
        pass

    def load(self, path: str, map_location=None):
        pass

    def commit(self, tag):
Contributor:

Is this a query function? What does the return value mean?

Contributor Author:

It tells the main process that the current checkpoint has been saved completely. For raw torch, it would be the same as the following code.

        if save_latest and self.global_rank == 0:
            with open(os.path.join(save_dir, 'latest'), 'w') as fd:
                fd.write(tag)

Nebula needs this signal to start the persistence processing for the checkpoint.

Contributor @tjruwase · Jul 20, 2022:

This snippet of code is independent of torch.save() as it is not writing checkpoint state to disk but rather writing a metadata file for managing checkpoints. So, I am not sure how it is similar to commit, which signals the start of persistence processing.

However, this two-step persistence process of nebula is different from torch, since the checkpoint is persisted by the time torch.save() returns.

With ZeRO training, multiple checkpoint files are created for a single deepspeed.save_checkpoint() using multiple torch.save() calls with different file paths. For nebula, do we need a single commit for each checkpoint file or just one for the save_checkpoint() call?

Some documentation on this commit and two-step persistence behavior would be very helpful. We need to think about whether this should be exposed to the client script or managed completely by DeepSpeed and hidden from clients.

Also, what does the return value mean?

Contributor Author @trajepl · Jul 20, 2022:

[image]
Actually, for nebula.save, we have the following steps to persist a checkpoint to disk:

  1. tier1: we pin shared memory and write the checkpoint files into shared memory (say, under /dev/shm/). This is a synchronous step that blocks training.
  2. tier3: when the nebula service receives the commit signal from the SDK, it starts a persistence process that moves the checkpoints from tier1 to tier3 (equal to nebula_load_path_tier3 where you commented). Once tier3 is done, the nebula meta info is written to the tier3 path.

So for your questions:

> For nebula, do we need a single commit for each checkpoint file or just one for the save_checkpoint() call?

Nebula needs every training process to call commit to tell the service that the current checkpoint can be moved to tier3. But original torch only writes the meta file for checkpoint management.

> what does the return value mean?

Hmm, it is a dummy return for torch, I think. If needed, maybe we can move the torch meta-file operation into this method, only for rank 0? What do you think? :)

Contributor Author:

One thing I want to call out: the file structures are totally different between the original torch path and the nebula tier3 path.

Contributor:

It is fine for the checkpoint file structure to be different between torch and nebula tier3; it simply means that torch.load() cannot be used for nebula checkpoints. This is not a concern for deepspeed.

I think the fd.write(tag) should remain inside deepspeed and outside nebula for a number of reasons: (1) it is not currently created by torch, since it is not checkpoint state but metadata, (2) it is used by deepspeed and existing clients for checkpoint management, and (3) it is very small data and fast to write. Typically, it is used to identify the latest checkpoint file in a folder of many checkpoint files, but clients can also use tags in other ways. So, can you explain why this part has to change for nebula?

Contributor Author:

It makes sense to keep fd.write(tag) out of nebula. Actually, we did that in the code as well.

# Save latest checkpoint tag
dist.barrier()
self.checkpoint_engine.commit(tag)
if save_latest and self.global_rank == 0:
    with open(os.path.join(save_dir, 'latest'), 'w') as fd:
        fd.write(tag)

Then for:

  1. torch: commit is only there to align the API, so no extra conditional (if nebula ... else ...) is needed.
  2. nebula: commit sends the signal to the nebula service to start tier3.

The fd.write(tag) works for torch, but it cannot replace commit() to tell the nebula service that the current checkpoint is saved.

So for your question:

> why does this part have to change for nebula?

We need commit() for nebula when all files under a tag are saved. For torch it just returns a dummy true.

Contributor:

> It makes sense to keep fd.write(tag) out of nebula. Actually, we did that in the code as well.

Sorry, I misread the PR. You are correct that you did not change this code.

Contributor:

> Then for:
>
>   1. torch: commit is only there to align the API, so no extra conditional (if nebula ... else ...) is needed.
>   2. nebula: commit sends the signal to the nebula service to start tier3.
>
> The fd.write(tag) works for torch, but it cannot replace commit() to tell the nebula service that the current checkpoint is saved.
>
> So for your question:
>
> why does this part have to change for nebula?
> We need commit() for nebula when all files under a tag are saved. For torch it just returns a dummy true.

So, this brings up the importance of clarifying the semantics of nebula and other checkpoint engines. I don't know if there is standard terminology for these concepts. But it seems to me that one key question is whether persistence is implicit or explicit. Persistence with torch.save() is implicit because checkpoint files are persisted on disk on return, while nebula is explicit because an additional call is required to make it persist the files. I think this is an important consideration for designing the general CheckpointEngine abstraction. We can discuss this further offline.

        # to tell checkpoint services whether all files are ready.
        pass
88 changes: 88 additions & 0 deletions deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py
@@ -0,0 +1,88 @@
import os
import torch
import torch_nebula

from deepspeed.runtime.checkpoint_engine.checkpoint_engine import \
    CheckpointEngine
from deepspeed.utils import logger, log_dist
from deepspeed.nebula.constants import *


def _get_tag_from_path(path):
    return os.path.basename(os.path.dirname(path))


class NebulaCheckpointEngine(CheckpointEngine):
    def __init__(self, config_params=None):
        super().__init__(config_params)
        self.checkpoint = None
        self.tag_flag = None
        self.enable_nebula_load = config_params.enable_nebula_load
        self.nebula_load_path = config_params.load_path
        if self.nebula_load_path is None:
            self.nebula_load_path = config_params.persistent_storage_path

        nebula_config_params = {
            NEBULA_PERSISTENT_STORAGE_PATH: config_params.persistent_storage_path,
            NEBULA_PERSISTENT_TIME_INTERVAL: config_params.persistent_time_interval,
            NEBULA_NUM_OF_VERSION_IN_RETENTION:
            config_params.num_of_version_in_retention,
        }
        torch_nebula.init(**nebula_config_params)

    def create(self, tag):
        log_dist(f"[Nebula] Start Checkpoint for tag:{tag}", ranks=[0])
        # -2 means: the customer needs to explicitly tell nebula that the
        # current checkpoint is complete via the commit method.
        self.checkpoint = torch_nebula.Checkpoint(tag, -2)

    def save(self, state_dict, path: str):
        tag = _get_tag_from_path(path)
        partititon_name = os.path.basename(path)
        logger.info(f"[Nebula] Saving {partititon_name} under tag {tag}...")
        self.checkpoint.save(partititon_name, state_dict)
        logger.info(f"[Nebula] Saved {partititon_name} under tag {tag}.")
        return None

    def load(self, path: str, map_location=None):
        tag = _get_tag_from_path(path)
        first_load_flag = self.tag_flag is None or self.tag_flag == tag
        if not self.enable_nebula_load and first_load_flag:
            self.tag_flag = tag
            logger.info(
                f"[Nebula] Nebula load is disabled. Loading checkpoint from {path}...")
            partition = torch.load(path, map_location=map_location)
            logger.info(
                f"[Nebula] Nebula load is disabled. Loaded checkpoint from {path}.")
            return partition

        partititon_name = os.path.basename(path)
        logger.info(
            f"[Nebula] Loading {path} under tag {tag} from {self.nebula_load_path}...")

        checkpoint = None
        if tag is None:
            checkpoint = torch_nebula.get_latest_checkpoint(
                persist_path=self.nebula_load_path)
            if checkpoint is None or (checkpoint is not None and checkpoint.tag == ''):
                logger.warning(f"Unable to find latest valid checkpoint from Nebula!")
                return None
        else:
            checkpoint = torch_nebula.get_checkpoint(tag=tag,
                                                     persist_path=self.nebula_load_path)
        partition = checkpoint.load(partititon_name, map_location=map_location)
        logger.info(
            f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}.")
        return partition

    def commit(self, tag):
        # nebula commit is called when all files under the given tag are ready
        # to be persisted in an asynchronous way.
        logger.info(
            f"[Nebula] All files for {tag} are saved in tier1. It is ready to start persisting."
        )
        commit_rls = self.checkpoint.commit()
        if not commit_rls:
            logger.error(
                f"[Nebula] Failed to commit the checkpoint, please check the log.")
            return False
        return commit_rls
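For reference, `_get_tag_from_path` derives the nebula tag from the parent directory of each checkpoint file, so the `<save_dir>/<tag>/<file>` layout deepspeed already uses maps directly onto nebula tags. A tiny illustration with a made-up path:

```python
import os

def _get_tag_from_path(path):
    return os.path.basename(os.path.dirname(path))

# Hypothetical checkpoint layout: <save_dir>/<tag>/<partition file>
path = "/checkpoints/global_step100/mp_rank_00_model_states.pt"
print(_get_tag_from_path(path))  # -> "global_step100" (the nebula tag)
print(os.path.basename(path))    # -> the partition name passed to checkpoint.save/load
```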