Merged

Changes from all commits (31 commits)
18a26f3
[WarmupDecayLR] fix log(0) & 1/log(1) bugs (#772)
stas00 Mar 12, 2021
35fd7cc
bump to v0.3.12
jeffra Mar 12, 2021
458ff02
Bug fix: Remove client optimizer param_group list item that does not …
cli99 Mar 12, 2021
73d762c
[doc] pipeline doc typos/improvements (#659)
stas00 Mar 14, 2021
4601885
Samyamr/inference hook fix (#851)
samyam Mar 15, 2021
a75d971
ZeRO Stage 2: Clear reduced gradients (#856)
tjruwase Mar 15, 2021
24335d4
[runner/launch] propagate the error (#854)
stas00 Mar 16, 2021
547d1c5
docs: minor spelling tweaks (#858)
brettkoonce Mar 16, 2021
871f304
Allow args to be optional in deepspeed.initialize (#825)
jeffra Mar 16, 2021
fa87a73
Fix ZeRO3 save_checkpoint (#857)
tjruwase Mar 16, 2021
7bcd72a
Make config objects json serializable (#862)
tjruwase Mar 16, 2021
12a53b4
bump version 0.3.13
jeffra Mar 16, 2021
68c8481
1-bit Adam v2 (#817)
conglongli Mar 16, 2021
10c0bea
consistent checkpoint filenaming (#865)
stas00 Mar 18, 2021
9e9f8cb
[doc] launcher (#868)
stas00 Mar 18, 2021
22d5a1f
[doc] pipeline (#888)
stas00 Mar 24, 2021
7f03282
[debug utils] see_memory_usage fixes (#890)
stas00 Mar 25, 2021
7531c6b
full fp32 weights reconstruction for zero 2+3 (#892)
stas00 Mar 26, 2021
39013dd
save_fp16_model consolidated for zero3 (#893)
stas00 Mar 27, 2021
7fcc891
Fix zero stage2 cpu_offload when some model trainable parameters skip…
ghosthamlet Mar 27, 2021
af2d8fc
update kramdown (#901)
jeffra Mar 30, 2021
23ff6cb
update backward api doc (#903)
jeffra Mar 30, 2021
c042264
Bump kramdown from 2.3.0 to 2.3.1 in /docs (#905)
dependabot[bot] Mar 30, 2021
8c9e16e
We're hiring! + integration posts
jeffra Mar 31, 2021
c6b497d
[website] We're hiring! + integration posts
jeffra Mar 31, 2021
c814abd
[website] we're hiring!
jeffra Mar 31, 2021
5d721e0
zero.Init() clarification (#880)
stas00 Apr 1, 2021
8db4fdf
disable pipe test (#915)
jeffra Apr 2, 2021
ab5534f
Add link to AML examples. (#916)
awan-10 Apr 2, 2021
c574788
Merge branch 'master' of https://github.com/microsoft/DeepSpeed into …
Apr 6, 2021
b58a8fa
Merge branch 'microsoft-master' into stella
Apr 6, 2021
README.md: 7 changes (6 additions, 1 deletion)
@@ -4,6 +4,8 @@
[![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
[![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed)

### 03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher)

[DeepSpeed](https://www.deepspeed.ai/) is a deep learning optimization
library that makes distributed training easy, efficient, and effective.

@@ -31,15 +33,18 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)


# News
* [2021/04/01] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed)
* [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59)
* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/)
* [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html)
* [2021/01/19] [[🤗Hugging Face Blog] Fit More and Train Faster With ZeRO via DeepSpeed and FairScale](https://huggingface.co/blog/zero-deepspeed-fairscale)
* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/)
* [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html)
* [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html)
* [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html)
* [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html)
* [2020/08/07] [DeepSpeed Microsoft Research Webinar](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html) is now available on-demand


# Table of Contents
deepspeed/__init__.py: 12 changes (8 additions, 4 deletions)
@@ -49,8 +49,8 @@ def _parse_version(version_str):
sys.modules['deepspeed.pt.loss_scaler'] = deepspeed.runtime.fp16.loss_scaler


def initialize(args,
model,
def initialize(args=None,
model=None,
optimizer=None,
model_parameters=None,
training_data=None,
@@ -62,8 +62,7 @@ def initialize(args,
"""Initialize the DeepSpeed Engine.

Arguments:
args: a dictionary containing local_rank and deepspeed_config
file location
args: an object containing local_rank and deepspeed_config fields. This is optional if `config_params` is passed.

model: Required: nn.module class before apply any wrappers

@@ -88,6 +87,9 @@
mini-batch of Tensor(s). Used when using batched loading from a
map-style dataset.

config_params: Optional: Instead of requiring args.deepspeed_config you can pass your deepspeed config
as a dictionary instead.

Returns:
A tuple of ``engine``, ``optimizer``, ``training_dataloader``, ``lr_scheduler``

@@ -108,6 +110,8 @@
__git_branch__),
ranks=[0])

assert model is not None, "deepspeed.initialize requires a model"

if not isinstance(model, PipelineModule):
engine = DeepSpeedEngine(args=args,
model=model,
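The signature change above makes `args` optional when the configuration is supplied directly. A minimal sketch of the new calling convention (the model and config values below are hypothetical, not from this PR):

```python
import torch
import deepspeed

# Toy model; any nn.Module works.
model = torch.nn.Linear(10, 2)

# Hypothetical config dict, standing in for an args.deepspeed_config file.
ds_config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

# args is now optional; model is still required (enforced by the new assert).
engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config_params=ds_config,
)
```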
deepspeed/launcher/runner.py: 6 changes (6 additions, 0 deletions)
@@ -365,6 +365,12 @@ def main(args=None):
result = subprocess.Popen(cmd, env=env)
result.wait()

# In case of failure must propagate the error-condition back to the caller (usually shell). The
# actual error and traceback should have been printed in the subprocess, so in order to avoid
# unnecessary noise we just quietly exit here with the same code as the subprocess
if result.returncode > 0:
sys.exit(result.returncode)


if __name__ == "__main__":
main()
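This fix matters because `subprocess.Popen` does not raise when the child process fails, so without the explicit exit the launcher would return 0 to the shell even after a crashed run. The pattern in isolation, as a minimal sketch (the command line is hypothetical):

```python
import subprocess
import sys

# Launch the child process and wait for it to finish.
result = subprocess.Popen(["python", "train.py"])
result.wait()

# Propagate a failure exit code so callers (shells, CI) can detect it;
# the child already printed its own traceback, so exit quietly here.
if result.returncode > 0:
    sys.exit(result.returncode)
```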
deepspeed/profiling/config.py: 10 changes (8 additions, 2 deletions)
@@ -3,12 +3,15 @@
Licensed under the MIT license.
"""

from deepspeed.runtime.config_utils import get_scalar_param
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
from deepspeed.profiling.constants import *


class DeepSpeedFlopsProfilerConfig(object):
class DeepSpeedFlopsProfilerConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
"""
docstring
"""
super(DeepSpeedFlopsProfilerConfig, self).__init__()

self.enabled = None
@@ -24,6 +27,9 @@ def __init__(self, param_dict):
self._initialize(flops_profiler_dict)

def _initialize(self, flops_profiler_dict):
"""
docstring
"""
self.enabled = get_scalar_param(flops_profiler_dict,
FLOPS_PROFILER_ENABLED,
FLOPS_PROFILER_ENABLED_DEFAULT)
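`get_scalar_param` is a thin wrapper over `dict.get`; its definition appears in the `config_utils.py` diff below. A self-contained sketch of the lookup pattern used here, with assumed stand-in values for the constants from `deepspeed.profiling.constants`:

```python
# Assumed constant values, for illustration only.
FLOPS_PROFILER_ENABLED = "enabled"
FLOPS_PROFILER_ENABLED_DEFAULT = False

def get_scalar_param(param_dict, param_name, param_default_value):
    # Fall back to the default when the key is absent.
    return param_dict.get(param_name, param_default_value)

flops_profiler_dict = {"enabled": True}
enabled = get_scalar_param(flops_profiler_dict,
                           FLOPS_PROFILER_ENABLED,
                           FLOPS_PROFILER_ENABLED_DEFAULT)
assert enabled is True

# Missing key: the default is returned.
assert get_scalar_param({},
                        FLOPS_PROFILER_ENABLED,
                        FLOPS_PROFILER_ENABLED_DEFAULT) is False
```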
deepspeed/profiling/flops_profiler/profiler.py: 2 changes (1 addition, 1 deletion)
@@ -265,7 +265,7 @@ def del_extra_repr(module):
"Each module profile is listed after its name in the following order: \nnumber of parameters, percentage of total parameters, number of multiply-accumulate operations (MACs), percentage of total MACs, latency, percentage of total latency, number of floating point operations per second (FLOPS, computed as 2 * MACs / latency)."
)
print(
"Note: \n1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'.\n2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throught.\n"
"Note: \n1. A module can have torch.nn.functional (e.g. to compute logits) along with submodules, thus making the difference between the parent's MACs(or latency) and the sum of its submodules'.\n2. Number of floating point operations is a theoretical estimation, thus FLOPS computed using that could be larger than the maximum system throughput.\n"
)
print(self.model)

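The corrected note defines FLOPS as `2 * MACs / latency`, since each multiply-accumulate counts as two floating point operations. A worked example with hypothetical numbers:

```python
# Hypothetical profile numbers for one forward pass.
macs = 1.5e9       # multiply-accumulate operations
latency_s = 0.012  # measured latency in seconds

# Each MAC is one multiply plus one add, hence the factor of 2.
flops = 2 * macs / latency_s
print(f"{flops / 1e9:.1f} GFLOPS")  # -> 250.0 GFLOPS
```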
deepspeed/runtime/activation_checkpointing/config.py: 11 changes (2 additions, 9 deletions)
@@ -3,7 +3,7 @@
Licensed under the MIT license.
"""

from deepspeed.runtime.config_utils import get_scalar_param
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject

#########################################
# DeepSpeed Activation Checkpointing
@@ -56,7 +56,7 @@
}


class DeepSpeedActivationCheckpointingConfig(object):
class DeepSpeedActivationCheckpointingConfig(DeepSpeedConfigObject):
def __init__(self, param_dict):
super(DeepSpeedActivationCheckpointingConfig, self).__init__()

@@ -74,13 +74,6 @@ def __init__(self, param_dict):

self._initialize(act_chkpt_config_dict)

"""
For json serialization
"""

def repr(self):
return self.__dict__

def _initialize(self, act_chkpt_config_dict):
self.partition_activations = get_scalar_param(
act_chkpt_config_dict,
deepspeed/runtime/comm/nccl.py: 8 changes (4 additions, 4 deletions)
@@ -86,7 +86,7 @@ def compressed_allreduce(self,

# worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale)
recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign)
# recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
#recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
recvbuf_scale = [
torch.zeros(1,
dtype=worker_scale.dtype,
@@ -106,13 +106,13 @@
cupy_sign_list_packed = None

cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign)
# cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))
#cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))

compensated_server_m = self.compression_backend.cupy2torch(
(cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
self.size,
-1)).float().add_(-0.5).mul_(2.0).mul_(
torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
compensated_server_m.add_(server_error)
server_scale = torch.norm(compensated_server_m) / np.sqrt(
compensated_server_m.numel())
@@ -172,7 +172,7 @@ def compressed_allreduce(self,
(cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
self.size,
-1)).float().add_(-0.5).mul_(2.0).mul_(
self.compression_backend.cupy2torch(
self.compression_backend.cupy2torch(
cupy_recvbuf_scale_server)).flatten().data)
if original_size != worker_error_size:
buffer_m = buffer_m[0:original_size]
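Most of this hunk is whitespace cleanup around the 1-bit decode step: packed sign bits are unpacked to {0, 1}, shifted and doubled to {-1, +1}, then rescaled. A small sketch of that decode on the CPU, using NumPy and PyTorch in place of CuPy (the inputs are hypothetical):

```python
import numpy as np
import torch

# Hypothetical sign bits from one worker, packed 8-per-byte as in the real code.
packed = np.packbits(np.array([1, 0, 1, 1, 0, 0, 1, 0], dtype=np.uint8))

# unpackbits -> {0, 1}; add(-0.5).mul(2.0) -> {-1, +1}, as in compressed_allreduce.
signs = torch.from_numpy(np.unpackbits(packed)).float().add_(-0.5).mul_(2.0)

# Per-worker scale; the real code computes norm(tensor) / sqrt(numel).
scale = 0.25
decoded = signs * scale
print(decoded)  # tensor([ 0.25, -0.25,  0.25,  0.25, -0.25, -0.25,  0.25, -0.25])
```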
deepspeed/runtime/config_utils.py: 13 changes (12 additions, 1 deletion)
@@ -5,10 +5,21 @@
"""
Collection of DeepSpeed configuration utilities
"""

import json
from collections import Counter


class DeepSpeedConfigObject(object):
"""
For json serialization
"""
def repr(self):
return self.__dict__

def __repr__(self):
return json.dumps(self.__dict__, sort_keys=True, indent=4)


def get_scalar_param(param_dict, param_name, param_default_value):
return param_dict.get(param_name, param_default_value)

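The new base class gives every config object two serialization hooks: `repr()` returns the raw attribute dict (usable with `json.dumps` at a higher level), while `__repr__` pretty-prints the object as JSON. A quick sketch with a hypothetical subclass, mirroring how the profiler and activation checkpointing configs inherit from it in this PR:

```python
import json

class DeepSpeedConfigObject(object):
    """For json serialization."""
    def repr(self):
        return self.__dict__

    def __repr__(self):
        return json.dumps(self.__dict__, sort_keys=True, indent=4)

# Hypothetical subclass, standing in for the real config classes.
class ExampleConfig(DeepSpeedConfigObject):
    def __init__(self):
        super(ExampleConfig, self).__init__()
        self.enabled = True
        self.profile_step = 1

cfg = ExampleConfig()
print(cfg)         # pretty-printed JSON, via the inherited __repr__
print(cfg.repr())  # {'enabled': True, 'profile_step': 1}
```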