Merged

48 commits
bcd731c
add fill masks tests
jeffra Jun 3, 2022
1915826
add a temporary fix for the Bert-Type models; TODO: Fix this for the …
Jun 3, 2022
965d109
add cuda graph to tests
jeffra Jun 3, 2022
05a9ca7
add fp16/32
jeffra Jun 3, 2022
2acba69
add q&a tests and set hf framework to pt
jeffra Jun 7, 2022
7822770
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 7, 2022
5724f7e
fixed assignment of DS pipeline model
mrwyattii Jun 7, 2022
ef350e9
generalized the model/task testing and added pytest fixtures
mrwyattii Jun 7, 2022
9a934fe
fixed syntax error
mrwyattii Jun 7, 2022
fbc05fe
keyerror with text-gen assert_fn
mrwyattii Jun 7, 2022
fc569e9
added back gptj model
mrwyattii Jun 7, 2022
fbdd6b8
Merge branch 'master' into jeffra/fill-mask-tests
RezaYazdaniAminabadi Jun 8, 2022
9c06f04
added temporary fix to get gpt models passing
mrwyattii Jun 9, 2022
c10acaf
fixing some issue for the Bert-Postln model
Jun 9, 2022
4440aed
Merge branch 'jeffra/fill-mask-tests' of github.com:microsoft/DeepSpe…
Jun 9, 2022
0519a72
formatting
mrwyattii Jun 9, 2022
3175430
added fp16 tests
mrwyattii Jun 9, 2022
9cdd981
fixed bad comparison
mrwyattii Jun 9, 2022
3d93c07
avoid CUDA init when getting GPU memory
mrwyattii Jun 9, 2022
d864d2b
revert to not checking GPU memory
mrwyattii Jun 9, 2022
15ed0a2
extra import
mrwyattii Jun 9, 2022
2c88ab1
refactored validation of test config, add skip for AMD w/CUDA Graph
mrwyattii Jun 10, 2022
0872f9f
cuda graph requires 1.10+
jeffra Jun 10, 2022
1d08254
ensure torch > 1.10 if cuda graph enabled, also fix typo
jeffra Jun 10, 2022
816e746
prune incompatible model+task pairs
jeffra Jun 10, 2022
42841c6
add pretty print names for test fixtures
jeffra Jun 10, 2022
2677d20
use fp16 revision of gpt-j
jeffra Jun 10, 2022
3ec3f9f
convert pipe model to half for non-DS case
jeffra Jun 10, 2022
d37f9d9
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 10, 2022
8b88eef
remove dep on hf-accelerate
jeffra Jun 11, 2022
50cbfce
manual convert gpt-j fp32->fp16, seeing issues with fp16 revision
jeffra Jun 11, 2022
f3e40ae
force inference tests to be seq
jeffra Jun 13, 2022
d3f6dfe
added correctness tests for text-generation gpt models
mrwyattii Jun 13, 2022
71a5840
fixed device string for lm tests
mrwyattii Jun 13, 2022
f99fc0d
during injection reshape data on cpu to avoid extra GPU memory
jeffra Jun 13, 2022
1d21343
increased acceptable ppl diff val
mrwyattii Jun 13, 2022
2777c23
added perf test and fix for gpt-j-6B OOM on V100 16GB
mrwyattii Jun 13, 2022
eea4c1e
remove sequential and move nv-nightly to nightly
jeffra Jun 13, 2022
c0d695c
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 13, 2022
b6b6682
Merge branch 'master' into jeffra/fill-mask-tests
jeffra Jun 15, 2022
c525af2
skip inference tests on rocm for now
jeffra Jun 15, 2022
e4573b2
add missing import
jeffra Jun 15, 2022
f01df0f
make performance asserts less strict
mrwyattii Jun 15, 2022
368427b
fix for datasets error
mrwyattii Jun 15, 2022
3a4e12c
seperate inference tests from regular unit tests
mrwyattii Jun 15, 2022
0ea845d
added new req file for inference workflow
mrwyattii Jun 15, 2022
2043d57
skip test if deps aren't installed
jeffra Jun 15, 2022
21d4beb
fix for inf reqs install
mrwyattii Jun 15, 2022
2 changes: 1 addition & 1 deletion .github/workflows/amd.yml
@@ -63,5 +63,5 @@ jobs:
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -x -m 'sequential' unit/
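The workflow changes in this PR hinge on pytest markers: the default runs drop `-m 'not sequential'`, while `sequential`, `inference`, and `nightly` selections move to dedicated invocations and workflows. A minimal sketch of how such markers are typically registered so pytest does not warn about unknown marks (the actual DeepSpeed conftest is not part of this diff; marker names are taken from the `-m` filters in these workflows):

```python
# conftest.py (illustrative sketch, not the repo's actual conftest)
def pytest_configure(config):
    config.addinivalue_line("markers", "sequential: tests that must not run in parallel")
    config.addinivalue_line("markers", "inference: inference unit tests (nv-inference.yml)")
    config.addinivalue_line("markers", "nightly: long-running tests (nv-nightly.yml)")
```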
63 changes: 63 additions & 0 deletions .github/workflows/nv-inference.yml
@@ -0,0 +1,63 @@
name: nv-inference

on:
push:
branches:
- 'master'
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Python environment
run: |
pip list

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn,inf]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'inference' unit/
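This job installs the `inf` extra (`pip install .[dev,1bit,autotuning,sparse_attn,inf]`) and then runs only tests selected by `-m 'inference'`. A hedged sketch of how an individual test might opt in to that marker and skip itself when optional dependencies are missing; the test and package names here are illustrative, following the commits "add fill masks tests" and "skip test if deps aren't installed":

```python
import pytest

# Skip the module cleanly when optional inference deps are absent instead of erroring out.
transformers = pytest.importorskip("transformers")

@pytest.mark.inference
def test_fill_mask_smoke():
    # Stand-in body; per the commit history, the real tests build HF pipelines and
    # compare baseline outputs against DeepSpeed-injected ones (e.g. via deepspeed.init_inference).
    assert transformers.__version__
```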
52 changes: 52 additions & 0 deletions .github/workflows/nv-nightly.yml
@@ -0,0 +1,52 @@
name: nv-nightly

on:
schedule:
- cron: "0 0 * * *"

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]

steps:
- uses: actions/checkout@v2

- name: environment
run: |
nvidia-smi
which python
python --version
which nvcc
nvcc --version
pip install --upgrade pip
pip uninstall --yes torch torchvision
pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

- name: Install transformers
run: |
git clone https://github.com/huggingface/transformers
cd transformers
# if needed switch to the last known good SHA until transformers@master is fixed
# git checkout 1cc453d33
git rev-parse --short HEAD
pip uninstall --yes transformers
pip install .

- name: Install deepspeed
run: |
pip uninstall --yes deepspeed
pip install .[dev,1bit,autotuning,sparse_attn]
ds_report

- name: Unit tests
run: |
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'nightly' unit/
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
@@ -60,5 +60,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-nightly-v100.yml
@@ -53,5 +53,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch18-v100.yml
@@ -60,5 +60,5 @@ jobs:
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
2 changes: 2 additions & 0 deletions .github/workflows/nv-transformers-v100.yml
@@ -57,6 +57,8 @@ jobs:
pip install .[testing]
# find reqs used in ds integration tests
find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec grep -v 'torch' {} \; | xargs -I {} pip install --upgrade {}
# force datasets version due to issues
pip install datasets==2.2.2
# force protobuf version due to issues
pip install "protobuf<4.21.0"
pip list
1 change: 1 addition & 0 deletions bin/dsr
65 changes: 36 additions & 29 deletions csrc/transformer/inference/csrc/gelu.cu
@@ -174,7 +174,8 @@ __global__ void fused_bias_residual(float* input,
float* attnbias,
int total_count,
int intermediate_size,
int mp_size)
int mp_size,
bool preln)
{
float4* input_cast = reinterpret_cast<float4*>(input);
float4* output_cast = reinterpret_cast<float4*>(output);
@@ -189,12 +190,17 @@
float4 res_vec = attn_cast[offset];
float4 bias_data = bias_cast[offset % intermediate_size];
float4 attn_bias = attnbias_cast[offset % intermediate_size];

data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x);
data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y);
data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z);
data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w);

if (preln) {
data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x);
data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y);
data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z);
data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w);
} else {
data.x = data.x + out.x + bias_data.x;
data.y = data.y + out.y + bias_data.y;
data.z = data.z + out.z + bias_data.z;
data.w = data.w + out.w + bias_data.w;
}
output_cast[offset] = data;
}
}
@@ -206,7 +212,8 @@ __global__ void fused_bias_residual(__half* input,
__half* attn_bias,
int total_count,
int intermediate_size,
int mp_size)
int mp_size,
bool preln)
{
#ifdef HALF_PRECISION_AVAILABLE

@@ -248,15 +255,21 @@ __global__ void fused_bias_residual(__half* input,
float2 attn_low_bias = __half22float2(attnbias_half[0]);
float2 attn_high_bias = __half22float2(attnbias_half[1]);

low_data.x =
(low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x));
low_data.y =
(low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y));
high_data.x =
(high_data.x + high_res.x) * mp_size + (high_out.x + (high_bias.x + attn_high_bias.x));
high_data.y =
(high_data.y + high_res.y) * mp_size + (high_out.y + (high_bias.y + attn_high_bias.y));

if (preln) {
low_data.x =
(low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x));
low_data.y =
(low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y));
high_data.x = (high_data.x + high_res.x) * mp_size +
(high_out.x + (high_bias.x + attn_high_bias.x));
high_data.y = (high_data.y + high_res.y) * mp_size +
(high_out.y + (high_bias.y + attn_high_bias.y));
} else {
low_data.x = (low_data.x + low_out.x + low_bias.x);
low_data.y = (low_data.y + low_out.y + low_bias.y);
high_data.x = (high_data.x + high_out.x + high_bias.x);
high_data.y = (high_data.y + high_out.y + high_bias.y);
}
vals_half[0] = __float22half2_rn(low_data);
vals_half[1] = __float22half2_rn(high_data);

@@ -274,27 +287,21 @@ void launch_bias_residual(T* input,
int batch,
int hidden_dim,
int mp_size,
bool preln,
cudaStream_t stream)
{
int total_count = batch * hidden_dim / 4;
dim3 block_dims(1024);
dim3 grid_dims((total_count - 1) / 1024 + 1); // (batch_size);

fused_bias_residual<<<grid_dims, block_dims, 0, stream>>>(
input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size);
input, output, attn, bias, attn_bias, total_count, hidden_dim / 4, 1.0 / mp_size, preln);
}

template void
launch_bias_residual<float>(float*, float*, float*, float*, float*, int, int, int, cudaStream_t);
template void launch_bias_residual<__half>(__half*,
__half*,
__half*,
__half*,
__half*,
int,
int,
int,
cudaStream_t);
template void launch_bias_residual<
float>(float*, float*, float*, float*, float*, int, int, int, bool, cudaStream_t);
template void launch_bias_residual<
__half>(__half*, __half*, __half*, __half*, __half*, int, int, int, bool, cudaStream_t);

__global__ void gptj_residual_add(float* input,
float* output,
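In summary, `fused_bias_residual` (both the float and __half variants) now takes a `preln` flag and only applies the scaled residual-plus-attention-bias update on the pre-LayerNorm path. A plain-Python reference of the per-element math, with names mirroring the kernel arguments; note the launcher passes `1.0 / mp_size` as the scale argument. This is an illustrative sketch, not a DeepSpeed API:

```python
def fused_bias_residual_ref(data, out, res_vec, bias, attn_bias, scale, preln):
    """Per-element update performed by the revised kernel.

    data     -- value from `input` (updated in place by the kernel)
    out      -- value from `output`
    res_vec  -- attention residual (`attn`)
    scale    -- receives 1.0 / mp_size from launch_bias_residual
    """
    if preln:
        # pre-LN path: rescale (hidden + attention residual), then add output and both biases
        return (data + res_vec) * scale + (out + bias + attn_bias)
    # post-LN path: neither the attention residual nor the attention bias is folded in here
    return data + out + bias

# one element, mp_size == 2
assert fused_bias_residual_ref(1.0, 0.5, 0.25, 0.1, 0.2, 0.5, preln=False) == 1.6
```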
74 changes: 39 additions & 35 deletions csrc/transformer/inference/csrc/pt_binding.cpp
@@ -787,17 +787,17 @@ at::Tensor ds_vector_matmul_int8(at::Tensor& input,
}

template <typename T>
void mlp_unfused_cublas(at::Tensor& output,
at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
at::Tensor mlp_unfused_cublas(at::Tensor& output,
at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
{
int bsz = input.size(0) * input.size(1);
auto inp_norm = at::empty_like(input);
@@ -840,18 +840,19 @@ void mlp_unfused_cublas(at::Tensor& output,
weight.size(1),
bsz,
Context::Instance().GetCurrentStream());
return inp_norm;
}
template <typename T>
at::Tensor ds_mlp_gemm(at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
std::vector<at::Tensor> ds_mlp_gemm(at::Tensor& input,
at::Tensor& residual,
at::Tensor& input_bias,
at::Tensor& weight,
at::Tensor& bias,
at::Tensor& gamma,
at::Tensor& beta,
const float epsilon,
bool preLayerNorm,
bool mlp_after_attn)
{
auto input_cont = input.contiguous();
auto options = at::TensorOptions()
@@ -863,19 +864,19 @@ at::Tensor ds_mlp_gemm(at::Tensor& input,
auto output = at::empty({input_cont.size(0), input_cont.size(1), weight.size(1)}, options);
int bsz = input_cont.size(0) * input_cont.size(1);

mlp_unfused_cublas<T>(output,
mlp_after_attn ? input : residual,
residual,
input_bias,
weight,
bias,
gamma,
beta,
epsilon,
preLayerNorm,
mlp_after_attn);

return output;
auto res_add = mlp_unfused_cublas<T>(output,
mlp_after_attn ? input : residual,
residual,
input_bias,
weight,
bias,
gamma,
beta,
epsilon,
preLayerNorm,
mlp_after_attn);

return {output, res_add};
}

template <typename T>
@@ -1001,7 +1002,8 @@ void residual_add_bias(at::Tensor& output,
at::Tensor& attention_b,
int mp_size,
bool mlp_after_attn,
bool add_bias)
bool add_bias,
bool preln)
{
int bsz = input.size(0) * input.size(1);
int hidden_size = input.size(2);
@@ -1017,6 +1019,7 @@
bsz,
hidden_size,
mp_size,
preln,
Context::Instance().GetCurrentStream());
else
launch_gptj_residual_add<float>((float*)input.data_ptr(),
@@ -1037,6 +1040,7 @@
bsz,
hidden_size,
mp_size,
preln,
Context::Instance().GetCurrentStream());
else
launch_gptj_residual_add<__half>((__half*)input.data_ptr(),
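Note the knock-on effect of the binding changes above: `ds_mlp_gemm` now returns two tensors (`{output, res_add}`) instead of one, and `residual_add_bias` gained a trailing `preln` argument, so Python-side callers have to unpack the pair and pass the extra flag. A small, hypothetical shim illustrating the unpacking; the compiled op is stubbed out, and the real op names and calling module are not shown in this diff:

```python
import torch

def call_mlp_gemm(mlp_gemm_op, *args):
    # New contract: the op returns [output, res_add]; older callers expected a single tensor.
    output, residual_add = mlp_gemm_op(*args)
    return output, residual_add

# Stand-in op so the sketch runs without the compiled inference extension.
fake_op = lambda *args: (torch.zeros(1, 4, 8), torch.zeros(1, 4, 8))
out, res_add = call_mlp_gemm(fake_op)
assert out.shape == res_add.shape
```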
1 change: 1 addition & 0 deletions csrc/transformer/inference/includes/custom_cuda_layers.h
@@ -58,6 +58,7 @@ void launch_bias_residual(T* input,
int batch,
int hidden_dim,
int mp_size,
bool preln,
cudaStream_t stream);

template <typename T>