Merged
Changes from all 98 commits
0518252
add manual workflow to run tests with precompiled ops
jeffra Dec 11, 2020
8a184b6
[build] fix computer capability arch flags, add PTX, handle PTX (#591)
stas00 Dec 11, 2020
66268bd
add DeepSpeedZeroConfig repr method (#596)
stas00 Dec 11, 2020
a4763f5
Supported customizing kwargs for lr_scheduler (#584)
carefree0910 Dec 11, 2020
c5a449f
Update launcher to set local rank environ variable (#597)
jeffra Dec 11, 2020
9f8e8f3
implement missing get_last_lr (#595)
stas00 Dec 14, 2020
007466e
[doc] xref to hostfile discussion (#604)
stas00 Dec 15, 2020
6380ee3
Fixes for RTD build errors (#606)
jeffra Dec 15, 2020
fd2f970
Transformer-kernel - supporting any arbitrary sequence-length (#587)
RezaYazdaniAminabadi Dec 17, 2020
7435b2f
Ability to initialize distributed backend outside deepspeed runtime (…
jeffra Dec 18, 2020
81aeea3
Elastic training support (#602)
jeffra Dec 23, 2020
24e0739
update SA comp check to fix torch-cpu issue (#631)
jeffra Jan 4, 2021
e6ac731
Support initialization with dict configuration (#632)
tjruwase Jan 4, 2021
a9a83a6
Allow DeepSpeed models to be initialized with optimizer=None (#469)
gcooper-isi Jan 5, 2021
d38ad6a
change dist to torch.distributed to fix bug in assert. (#638)
awan-10 Jan 5, 2021
46d2e28
docs: minor spelling tweaks (#623)
brettkoonce Jan 5, 2021
5ab1279
Fix docstring format (#640)
tjruwase Jan 5, 2021
44bd538
Module replacement support (#586)
jeffra Jan 6, 2021
64461da
Update builder.py (#642)
sxjscience Jan 7, 2021
8cea96d
Bump nokogiri from 1.10.10 to 1.11.0 in /docs (#630)
dependabot[bot] Jan 7, 2021
4e2dc4e
Add deepspeed.init_distributed to RTD page (#645)
jeffra Jan 7, 2021
828d75b
document deepspeed.initialize() (#644)
stas00 Jan 8, 2021
bc046dc
add additional validation checks in elastic config (#646)
jeffra Jan 8, 2021
af212f6
Remove a very verbose print statement. (#649)
awan-10 Jan 8, 2021
c14b839
version bump to 0.3.10
jeffra Jan 8, 2021
da5563a
LR scheduler unit tests (#429)
tjruwase Jan 8, 2021
adcfd26
Handle actvitation checkpointing args that are None or non-tensors (#…
Jan 12, 2021
e2fbe4d
squash latest flops profiling changes (#1) (#664)
cli99 Jan 13, 2021
981bc7d
Move workspace memory-allocation to PyTorch (#661)
RezaYazdaniAminabadi Jan 13, 2021
f032e56
Validate consistent ckpt tags across ranks (#667)
jeffra Jan 14, 2021
865104b
Support optimizer AdamW type (#670)
tjruwase Jan 15, 2021
6217a6c
skip empty lines in hostfile (#669)
jeffra Jan 15, 2021
c5e4264
Add AdamW to the supported optimizers (#672)
stas00 Jan 15, 2021
e729a3f
add missing config menu entries (#652)
stas00 Jan 15, 2021
7b07e12
doc fix (#651)
stas00 Jan 15, 2021
82cecf6
add zero-offload paper (#680)
jeffra Jan 19, 2021
7b0bee0
[tutorials] typos (#676)
stas00 Jan 20, 2021
e59ba12
make test_pipe more stable (#683)
Jan 20, 2021
34c83a5
Fix ZeRO 2 + Pipelining (#677)
leogao2 Jan 20, 2021
852c524
Add optional timeout parameter to deepspeed.init_distributed (#637)
sdtblck Jan 25, 2021
5221832
Fix wrong idx bug in invertible LayerNormBackward1 (#692)
Taka152 Jan 26, 2021
7833aed
Create torch16.yml (#699)
jeffra Jan 27, 2021
cd29f8b
Update torch16.yml
jeffra Jan 27, 2021
91b1b7f
[transformer-kernel] turn off unit test printing (#701)
jeffra Jan 27, 2021
2e2dd86
Dist testing backend fixes, etc. (#708)
jeffra Jan 29, 2021
5e522ef
set_batch_fn and remove old sanity check (#712)
Jan 29, 2021
3cecbc1
properly set engine.local_rank if it's set to -1
jeffra Feb 1, 2021
6332e31
Add executable permission to `ds_elastic` and `ds_report` in `bin`. (…
joneyolfson Feb 1, 2021
45c33ee
local rank of -1 means not set (#720)
jeffra Feb 1, 2021
72b23ea
bump to 0.3.11
jeffra Feb 2, 2021
4f1d827
[launcher] look ma, no more zombies (#714)
stas00 Feb 4, 2021
b08aa6f
Improve starred expressions (#696)
joneyolfson Feb 8, 2021
c5b3f40
Fixed typo in Readme. (#737)
TheDudeFromCI Feb 9, 2021
6beca3c
1bit_adam dependencies (#742)
stas00 Feb 10, 2021
6ee3b29
Clickable screenshots (#746)
tjruwase Feb 10, 2021
e2dfe0d
Add flops profiler tutorial (#682)
cli99 Feb 11, 2021
59eed17
Only initialize distributed if required (#734)
Feb 11, 2021
1b8ca8e
fix spelling mistake (#749)
sdtblck Feb 11, 2021
248f638
1-bit Adam documentation fix (#747)
conglongli Feb 11, 2021
6fb1610
Replace timer print rank 0 with logging (#732)
Feb 12, 2021
78e776a
[install] fixes/improvements/docs (#752)
stas00 Feb 12, 2021
7bf1b83
[install] add -e/--examples flag to checkout submodules (#755)
jeffra Feb 12, 2021
ec8b1cb
Activation checkpointing for non-tensor arguments and return values (…
tjruwase Feb 12, 2021
7cab55c
Checks for None tensors and skip them when splitting the buckets in z…
cli99 Feb 16, 2021
c28a71f
Minor doc tweaks (#761)
tjruwase Feb 16, 2021
8067efa
Fix NameError: name 'dist' is not defined (#763)
tma15 Feb 17, 2021
68e138b
[dist] set args.local_rank to LOCAL_RANK (#764)
jeffra Feb 17, 2021
1fcc5f7
Fix transformer kernel CUDA illegal memory access error (#765)
conglongli Feb 18, 2021
ee1ffe2
CPU-Adam fix for scalar mode (#735)
RezaYazdaniAminabadi Feb 18, 2021
29fa4b2
Update engine.py (#767)
jeffra Feb 19, 2021
e60e92e
[doc] fix incorrect param name (#773)
stas00 Feb 20, 2021
48065c0
Fixing the module-inject Api (#786)
RezaYazdaniAminabadi Feb 24, 2021
e2dfcad
Fix the bias-add and add the layer-norm-eps parameter (#791)
RezaYazdaniAminabadi Feb 24, 2021
62396b7
Delete out2 (#798)
vfdev-5 Feb 26, 2021
490e6f7
fixing the compiling issue for the AMD architecture (#796)
RezaYazdaniAminabadi Feb 26, 2021
7eb083c
document the requirement to call for all ranks (#801)
stas00 Feb 26, 2021
db987cf
fixed typo (#802)
vfdev-5 Feb 27, 2021
937c5ce
issue with the implementation of column_sum_reduce (#804)
zmxdream Feb 28, 2021
8295d7a
Fixing gelu_checkpointing memory issue (#812)
RezaYazdaniAminabadi Mar 3, 2021
ba33e86
Update ZeRO-Offload tutorials (#824)
tjruwase Mar 8, 2021
599258f
ZeRO 3 Offload (#834)
samyam Mar 8, 2021
d7de916
update tutorial/doc links for zero3 (#835)
jeffra Mar 8, 2021
75ffdaf
Fix zero3 tutorial link
jeffra Mar 8, 2021
9c5eee3
bump DSE to include ZeRO-3
jeffra Mar 8, 2021
af54897
Fix for RTD
jeffra Mar 8, 2021
6adc19a
Model scale changing 5x to 3x
samyam Mar 8, 2021
4949636
replace home env with ~
jeffra Mar 9, 2021
2e6692c
Fix regression in runner (#843)
jeffra Mar 9, 2021
564eb4b
bumping DSE pointer (#847)
Mar 10, 2021
dd03cff
set adamw_mode default true (follows FusedAdam and < 0.3.11 logic) (#…
jeffra Mar 11, 2021
29853c3
less scary overflow notice (#833)
stas00 Mar 11, 2021
e0f36ed
Add optimizers and schedules to RTD and updated the corresponding par…
cli99 Mar 11, 2021
7925d0c
small tweaks (#839)
stas00 Mar 11, 2021
311795d
Control ZeRO wall clock timers (#849)
tjruwase Mar 11, 2021
18a26f3
[WarmupDecayLR] fix log(0) & 1/log(1) bugs (#772)
stas00 Mar 12, 2021
35fd7cc
bump to v0.3.12
jeffra Mar 12, 2021
458ff02
Bug fix: Remove client optimizer param_group list item that does not …
cli99 Mar 12, 2021
73d762c
[doc] pipeline doc typos/improvements (#659)
stas00 Mar 14, 2021
12 changes: 5 additions & 7 deletions .github/workflows/main.yml
@@ -4,14 +4,12 @@ name: Build

# Controls when the action will run.
on:
# Triggers the workflow on push or pull request events but only for the master branch
push:
branches: [ master ]
paths-ignore:
- 'docs/**'
pull_request:
branches: [ master ]

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
paths-ignore:
- 'docs/**'

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
@@ -50,4 +48,4 @@ jobs:
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose tests/unit/
47 changes: 47 additions & 0 deletions .github/workflows/pre-compile-ops.yml
@@ -0,0 +1,47 @@
# This is a basic workflow to help you get started with Actions

name: Tests-w-precompiled-ops

# Controls when the action will run.
on:
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "build"
build:
# The type of runner that the job will run on
runs-on: self-hosted

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2

# Runs a single command using the runners shell
- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

# Runs a set of commands using the runners shell
- name: Install deepspeed
run: |
DS_BUILD_OPS=1 pip install .[dev]
ds_report

- name: Formatting checks
run: |
pre-commit run --all-files

# Runs a set of commands using the runners shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
44 changes: 44 additions & 0 deletions .github/workflows/torch16.yml
@@ -0,0 +1,44 @@
# Unit test config for manual use on torch1.6 runners

name: Torch16

# Controls when the action will run.
on:
#pull_request:
# paths-ignore:
# - 'docs/**'
# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

# A workflow run is made up of one or more jobs that can run sequentially or in parallel
jobs:
# This workflow contains a single job called "build"
build:
# The type of runner that the job will run on
runs-on: [self-hosted, torch1.6]

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
- uses: actions/checkout@v2

# Runs a single command using the runners shell
- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
# Runs a set of commands using the runners shell
- name: Install deepspeed
run: |
pip install .[dev]
ds_report
# Runs a set of commands using the runners shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
2 changes: 1 addition & 1 deletion DeepSpeedExamples
5 changes: 4 additions & 1 deletion README.md
@@ -31,6 +31,7 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)


# News
* [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html)
* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/)
@@ -113,7 +114,7 @@ overview](https://www.deepspeed.ai/features/) for descriptions and usage.
* [Ultra-fast dense transformer kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html)
* [Sparse attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html)
* Memory- and compute-efficient sparse kernels
* Support 10x long sequences than dense
* Support 10x longer sequences than dense
* Flexible support to different sparse structures
* [1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html)
* Custom communication collective
@@ -185,6 +186,8 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727).
2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703).
3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html).
4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840).
5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888).

# Videos
1. DeepSpeed KDD 2020 Tutorial
42 changes: 42 additions & 0 deletions bin/ds_elastic
@@ -0,0 +1,42 @@
#!/usr/bin/env python

import argparse
import json

import deepspeed
from deepspeed.elasticity import compute_elastic_config

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, help="DeepSpeed config json")
parser.add_argument('-w',
'--world-size',
type=int,
default=0,
help="Intended/current world size")
args = parser.parse_args()
ds_config = json.load(open(args.config, 'r'))

ds_version = deepspeed.__version__

elastic_config = ds_config['elasticity']
print('------------------------------------------')
print("Elasticity config:")
print('------------------------------------------')
print(json.dumps(elastic_config, indent=4, sort_keys=True))

if args.world_size > 0:
final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version, world_size=args.world_size)
print('------------------------------------------')
print(f"Calculated results for world size {args.world_size}:")
print('------------------------------------------')
print(f'final_batch_size .... {final_batch_size}')
print(f'valid_gpus .......... {valid_gpus}')
print(f'micro_batch_size .... {micro_batch_size}')
else:
final_batch_size, valid_gpus = compute_elastic_config(ds_config=ds_config, target_deepspeed_version=ds_version)
print('------------------------------------------')
print("Calculated results:")
print('------------------------------------------')
print(f'final_batch_size .... {final_batch_size}')
print(f'valid_gpus .......... {valid_gpus}')
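
Aside: the script above just pretty-prints what deepspeed.elasticity.compute_elastic_config returns. Below is a minimal sketch of calling that helper directly from Python; the "elasticity" field names in the config dict are illustrative assumptions about the schema introduced with elastic training (#602), not a verified spec, so check them against the validation added in #646 before relying on them.

import deepspeed
from deepspeed.elasticity import compute_elastic_config

# Illustrative DeepSpeed config with an elasticity section (field names assumed).
ds_config = {
    "elasticity": {
        "enabled": True,
        "max_train_batch_size": 2048,
        "micro_batch_sizes": [2, 4, 8],
        "min_gpus": 1,
        "max_gpus": 256,
        "version": 0.1,
    },
}

# Without a world size: the negotiated batch size and the set of valid GPU counts.
final_batch_size, valid_gpus = compute_elastic_config(
    ds_config=ds_config, target_deepspeed_version=deepspeed.__version__)

# With a world size: additionally the micro-batch size chosen for that many GPUs.
final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(
    ds_config=ds_config, target_deepspeed_version=deepspeed.__version__, world_size=8)
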
Empty file modified: bin/ds_report (mode 100644 → 100755)
87 changes: 46 additions & 41 deletions csrc/adam/cpu_adam.cpp (mode 100755 → 100644)
@@ -62,6 +62,8 @@ void Adam_Optimizer::Step(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }

#pragma omp parallel for
for (size_t i = t; i < offset; i += SIMD_WIDTH) {
AVX_Data grad_4;
@@ -101,47 +103,50 @@ void Adam_Optimizer::Step(float* _params,
SIMD_STORE(_exp_avg_sq + i, variance_4.data);
}
if (dev_params) {
launch_param_update(_doubled_buffer[_buf_index],
dev_params + t,
copy_size,
Context::Instance().GetCurrentStream());
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}

#endif

if (_param_size > rounded_size) {
for (size_t t = rounded_size; t < _param_size; t += TILE) {
size_t copy_size = TILE;
if ((t + TILE) > _param_size) copy_size = _param_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t k = rounded_size; k < _param_size; k++) {
float grad = grads[k];
float param = _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;

variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;

grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - rounded_size] = (__half)param;

_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(_doubled_buffer[_buf_index],
dev_params + rounded_size,
(_param_size - rounded_size),
Context::Instance().GetCurrentStream());
for (size_t k = t; k < offset; k++) {
float grad = grads[k];
float param = _params[k];
float momentum = _exp_avg[k];
float variance = _exp_avg_sq[k];
if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; }
momentum = momentum * _betta1;
momentum = grad * betta1_minus1 + momentum;

variance = variance * _betta2;
grad = grad * grad;
variance = grad * betta2_minus1 + variance;

grad = sqrt(variance);
grad = grad * _bias_correction2 + _eps;
grad = momentum / grad;
if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; }
param = grad * step_size + param;
if (dev_params) _doubled_buffer[_buf_index][k - t] = param;

_params[k] = param;
_exp_avg[k] = momentum;
_exp_avg_sq[k] = variance;
}
if (dev_params) {
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
}
}
@@ -189,6 +194,7 @@ void Adam_Optimizer::Step_4(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += (SIMD_WIDTH << 2)) {
AVX_Data grad_4[4];
@@ -295,10 +301,8 @@ void Adam_Optimizer::Step_4(float* _params,
}

if (dev_params) {
launch_param_update(_doubled_buffer[_buf_index],
dev_params + t,
copy_size,
Context::Instance().GetCurrentStream());
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
@@ -400,6 +404,7 @@ void Adam_Optimizer::Step_8(float* _params,
size_t copy_size = TILE;
if ((t + TILE) > rounded_size) copy_size = rounded_size - t;
size_t offset = copy_size + t;
if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); }
#pragma omp parallel for
for (size_t i = t; i < offset; i += (SIMD_WIDTH << 3)) {
AVX_Data grad_4[8];
@@ -582,10 +587,8 @@ void Adam_Optimizer::Step_8(float* _params,
SIMD_STORE(_exp_avg_sq + i + SIMD_WIDTH * 7, variance_4[7].data);
}
if (dev_params) {
launch_param_update(_doubled_buffer[_buf_index],
dev_params + t,
copy_size,
Context::Instance().GetCurrentStream());
launch_param_update(
_doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]);
_buf_index = !_buf_index;
}
}
@@ -628,6 +631,7 @@ int ds_adam_step(int optimizer_id,
opt->update_state(lr, epsilon, weight_decay, bias_correction);
opt->Step_8(params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0));

opt->SynchronizeStreams();
return 0;
}

@@ -664,6 +668,7 @@ int ds_adam_step_plus_copy(int optimizer_id,
opt->Step_8(
params_ptr, grads_ptr, exp_avg_ptr, exp_avg_sq_ptr, params_c.size(0), gpu_params_ptr);

opt->SynchronizeStreams();
return 0;
}

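The cpu_adam.cpp changes above give each half of the double buffer its own CUDA stream and add a cudaStreamSynchronize before a buffer is reused, so the asynchronous parameter copy to the GPU can overlap with the CPU Adam update of the next tile. A rough PyTorch-level sketch of that double-buffering pattern follows (illustration only, assumes a CUDA device; it mirrors the structure of the change, not the actual kernel code).

import torch

def copy_tiles_double_buffered(src_cpu, dst_gpu, tile=1 << 20):
    # Two pinned host buffers and two streams: while one tile is in flight on its
    # stream, the CPU fills the other buffer. Before reusing a buffer, wait on its
    # stream -- the analogue of the cudaStreamSynchronize(_streams[_buf_index]) above.
    bufs = [torch.empty(tile, pin_memory=True) for _ in range(2)]
    streams = [torch.cuda.Stream() for _ in range(2)]
    idx = 0
    for t in range(0, src_cpu.numel(), tile):
        n = min(tile, src_cpu.numel() - t)
        streams[idx].synchronize()                # buffer idx is free to overwrite again
        bufs[idx][:n].copy_(src_cpu[t:t + n])     # CPU-side work into the pinned buffer
        with torch.cuda.stream(streams[idx]):
            dst_gpu[t:t + n].copy_(bufs[idx][:n], non_blocking=True)
        idx ^= 1                                  # flip buffers, like _buf_index = !_buf_index
    for s in streams:
        s.synchronize()
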
16 changes: 5 additions & 11 deletions csrc/includes/context.h (mode 100755 → 100644)
@@ -64,17 +64,10 @@ class Context {
return _ctx;
}

void GenWorkSpace(size_t size)
void SetWorkSpace(void* workspace)
{
if (!_workspace) {
assert(_workspace == nullptr);
cudaMalloc(&_workspace, size);
} else if (_workSpaceSize < size) {
cudaFree(_workspace);
cudaMalloc(&_workspace, size);
}

_workSpaceSize = size;
if (!workspace) { throw std::runtime_error("Workspace is null."); }
_workspace = workspace;
}

void* GetWorkSpace() { return _workspace; }
@@ -88,6 +81,8 @@ class Context {
return stream;
}

cudaStream_t GetNewStream() { return at::cuda::getStreamFromPool(); }

cublasHandle_t GetCublasHandle() { return _cublasHandle; }

std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
@@ -172,6 +167,5 @@ class Context {
void* _workspace;
uint64_t _seed;
uint64_t _curr_offset;
size_t _workSpaceSize;
std::vector<std::array<int, 3>> _gemm_algos;
};
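
The context.h change above drops Context's own cudaMalloc-based GenWorkSpace and _workSpaceSize bookkeeping in favor of SetWorkSpace(void*): per commit 981bc7d ("Move workspace memory-allocation to PyTorch"), the buffer is now allocated on the Python side through PyTorch's caching allocator and only its pointer is handed to the C++ context. A minimal sketch of that hand-off is below; the set_workspace binding name is a hypothetical stand-in, and in the real code the kernels presumably wire this up internally.

import torch

# Allocate the workspace through PyTorch's caching allocator instead of a raw
# cudaMalloc inside Context; the caller must keep this tensor alive for as long
# as the kernels use the pointer.
workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")

raw_ptr = workspace.data_ptr()            # the void* that SetWorkSpace() receives
# kernel_module.set_workspace(workspace)  # hypothetical binding; the real hand-off happens in C++
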