From 7cf7c89fcd2648513a08ffa02fefdd93fe6fcd08 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sun, 10 Jul 2022 04:50:35 +0500 Subject: [PATCH 01/22] Add Inference support for running the BigScience-BLOOM Architecture --- csrc/transformer/inference/csrc/gelu.cu | 24 +- csrc/transformer/inference/csrc/normalize.cu | 3 +- .../transformer/inference/csrc/pt_binding.cpp | 27 +- csrc/transformer/inference/csrc/softmax.cu | 36 +- csrc/transformer/inference/includes/context.h | 1 + .../inference/includes/custom_cuda_layers.h | 4 +- deepspeed/inference/engine.py | 142 ++++-- deepspeed/module_inject/__init__.py | 1 + deepspeed/module_inject/replace_module.py | 444 ++++++++++++------ deepspeed/module_inject/replace_policy.py | 40 ++ .../inference/transformer_inference.py | 272 +++++++++-- deepspeed/runtime/engine.py | 2 + deepspeed/runtime/state_dict_factory.py | 2 + 13 files changed, 758 insertions(+), 240 deletions(-) diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu index acb35325dd9e..c8736d563623 100644 --- a/csrc/transformer/inference/csrc/gelu.cu +++ b/csrc/transformer/inference/csrc/gelu.cu @@ -174,7 +174,7 @@ __global__ void fused_bias_residual(float* input, float* attnbias, int total_count, int intermediate_size, - int mp_size, + float mp_size, bool preln) { float4* input_cast = reinterpret_cast(input); @@ -191,10 +191,10 @@ __global__ void fused_bias_residual(float* input, float4 bias_data = bias_cast[offset % intermediate_size]; float4 attn_bias = attnbias_cast[offset % intermediate_size]; if (preln) { - data.x = (data.x + res_vec.x) * mp_size + (out.x + bias_data.x + attn_bias.x); - data.y = (data.y + res_vec.y) * mp_size + (out.y + bias_data.y + attn_bias.y); - data.z = (data.z + res_vec.z) * mp_size + (out.z + bias_data.z + attn_bias.z); - data.w = (data.w + res_vec.w) * mp_size + (out.w + bias_data.w + attn_bias.w); + data.x = (data.x + res_vec.x + bias_data.x + attn_bias.x) * mp_size + (out.x); + data.y = (data.y + res_vec.y + bias_data.y + attn_bias.y) * mp_size + (out.y); + data.z = (data.z + res_vec.z + bias_data.z + attn_bias.z) * mp_size + (out.z); + data.w = (data.w + res_vec.w + bias_data.w + attn_bias.w) * mp_size + (out.w); } else { data.x = data.x + out.x + bias_data.x; data.y = data.y + out.y + bias_data.y; @@ -212,7 +212,7 @@ __global__ void fused_bias_residual(__half* input, __half* attn_bias, int total_count, int intermediate_size, - int mp_size, + float mp_size, bool preln) { #ifdef HALF_PRECISION_AVAILABLE @@ -257,13 +257,13 @@ __global__ void fused_bias_residual(__half* input, if (preln) { low_data.x = - (low_data.x + low_res.x) * mp_size + (low_out.x + (low_bias.x + attn_low_bias.x)); + (low_data.x + low_res.x + (low_bias.x + attn_low_bias.x)) * mp_size + low_out.x; low_data.y = - (low_data.y + low_res.y) * mp_size + (low_out.y + (low_bias.y + attn_low_bias.y)); - high_data.x = (high_data.x + high_res.x) * mp_size + - (high_out.x + (high_bias.x + attn_high_bias.x)); - high_data.y = (high_data.y + high_res.y) * mp_size + - (high_out.y + (high_bias.y + attn_high_bias.y)); + (low_data.y + low_res.y + (low_bias.y + attn_low_bias.y)) * mp_size + low_out.y; + high_data.x = (high_data.x + high_res.x + (high_bias.x + attn_high_bias.x)) * mp_size + + high_out.x; + high_data.y = (high_data.y + high_res.y + (high_bias.y + attn_high_bias.y)) * mp_size + + high_out.y; } else { low_data.x = (low_data.x + low_out.x + low_bias.x); low_data.y = (low_data.y + low_out.y + low_bias.y); diff --git 
a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu index 7f3cfc118631..1569467e83aa 100644 --- a/csrc/transformer/inference/csrc/normalize.cu +++ b/csrc/transformer/inference/csrc/normalize.cu @@ -88,6 +88,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output, int row_stride) { #ifdef HALF_PRECISION_AVAILABLE + int iteration_stride = blockDim.x; int iterations = row_stride / iteration_stride; @@ -99,7 +100,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output, int gid = id >> 5; int warp_num = iteration_stride >> 5; - __half2 inp_reg[NORM_REG]; + __half2 inp_reg[8]; const __half2* vals_cast = reinterpret_cast(vals); __half2* out_cast = reinterpret_cast<__half2*>(output); diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 2c300c0b6c92..87bb214731ed 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -15,7 +15,8 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, bool recompute, bool local_attention, int window_size, - bool async_op) + bool async_op, + float layer_scale) { auto attn_scores_c = attn_scores.contiguous(); int bsz = attn_scores_c.size(0); @@ -32,6 +33,8 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), + (T*)attn_mask.data_ptr(), + layer_scale, triangular, recompute, local_attention, @@ -145,7 +148,7 @@ void attention_unfused(at::Tensor& prev_key_cont, CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false); + attn_score, attn_mask, triangular, recompute, local_attention, window_size, false, 1.0); alpha = 1.0; cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), k, @@ -225,6 +228,8 @@ std::vector ds_softmax_context1(at::Tensor& query, template void ds_softmax_internal(T* attn_scores, at::Tensor& attn_mask, + at::Tensor& alibi, + float& layer_scale, bool triangular, bool recompute, bool local_attention, @@ -236,6 +241,8 @@ void ds_softmax_internal(T* attn_scores, { launch_attn_softmax_v2((T*)attn_scores, (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), + (alibi.sizes().size() > 1 ? 
(T*)alibi.data_ptr() : nullptr), + layer_scale, triangular, recompute, local_attention, @@ -263,9 +270,12 @@ void attention_unfused(T* prev_key_cont, bool triangular, bool recompute, bool local_attention, - int window_size) + int window_size, + at::Tensor& alibi, + int layer_id) { - float alpha = norm_factor * norm_factor; + float layer_scale = std::max(1, layer_id); + float alpha = norm_factor * norm_factor / layer_scale; float gemm_beta = 0.0; T* workspace = (T*)output + bsz * seq_len * heads * k; @@ -292,6 +302,8 @@ void attention_unfused(T* prev_key_cont, #endif ds_softmax_internal(workspace, attn_mask, + alibi, + layer_scale, triangular, recompute, local_attention, @@ -336,7 +348,8 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, int window_size, bool no_masking, unsigned layer_id, - unsigned num_layers) + unsigned num_layers, + at::Tensor& alibi) { unsigned bsz = query_key_value.size(0); unsigned seq_len = query_key_value.size(1); @@ -410,7 +423,9 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, (triangular && is_prompt), is_prompt, local_attention, - window_size); + window_size, + alibi, + layer_id); launch_transform4d_0213((T*)output.data_ptr(), temp_buf, bsz, diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index bf3c8bc90049..7f6af09c650b 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -28,6 +28,8 @@ namespace cg = cooperative_groups; __global__ void attn_softmax_v2(__half* vals, __half* mask, + __half* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -64,7 +66,7 @@ __global__ void attn_softmax_v2(__half* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); + int mask_offset = (iter_offset / (num_seq)) * (sequence_length); int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -76,7 +78,7 @@ __global__ void attn_softmax_v2(__half* vals, (local_attention && real_seq_id >= window_size) ? real_seq_id - window_size : -1; float max_val = minus_infinity; - + // if (lane == 0) printf("%d, %d: %d \n", wid, blockIdx.x, mask_offset); for (int i = 0; i < iterations; i++) { int data_id = i * (reduceWidth << 2) + (seq_lane << 2); if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && @@ -102,6 +104,16 @@ __global__ void attn_softmax_v2(__half* vals, high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); } + if (alibi) { + low_data[i].x = low_data[i].x * layer_scale + + __half2float(alibi[data_id + mask_offset]); + low_data[i].y = low_data[i].y * layer_scale + + __half2float(alibi[data_id + mask_offset + 1]); + high_data[i].x = high_data[i].x * layer_scale + + __half2float(alibi[data_id + mask_offset + 2]); + high_data[i].y = high_data[i].y * layer_scale + + __half2float(alibi[data_id + mask_offset + 3]); + } } else { low_data[i].x = data_id > window_stride ? 
__half2float(vals[data_id]) : minus_infinity; @@ -123,6 +135,16 @@ __global__ void attn_softmax_v2(__half* vals, if ((data_id + 2) < sequence_length) high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); } + if (alibi) { + low_data[i].x = low_data[i].x * layer_scale + + __half2float(alibi[data_id + mask_offset]); + if ((data_id + 1) < sequence_length) + low_data[i].y = low_data[i].y * layer_scale + + __half2float(alibi[data_id + mask_offset + 1]); + if ((data_id + 2) < sequence_length) + high_data[i].x = high_data[i].x * layer_scale + + __half2float(alibi[data_id + mask_offset + 2]); + } } // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); max_val = (low_data[i].x > max_val ? low_data[i].x : max_val); @@ -204,6 +226,8 @@ __global__ void attn_softmax_v2(__half* vals, __global__ void attn_softmax_v2(float* vals, float* attn_mask, + float* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -371,6 +395,8 @@ __global__ void attn_softmax_v2(float* vals, template void launch_attn_softmax_v2(T* vals, T* mask, + T* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -393,6 +419,8 @@ void launch_attn_softmax_v2(T* vals, attn_softmax_v2<<>>( vals, mask, + alibi, + layer_scale, triangular, recompute, local_attention, @@ -410,6 +438,8 @@ void launch_attn_softmax_v2(T* vals, template void launch_attn_softmax_v2(float* vals, float* mask, + float* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, @@ -422,6 +452,8 @@ template void launch_attn_softmax_v2(float* vals, cudaStream_t stream); template void launch_attn_softmax_v2(__half* vals, __half* mask, + __half* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, diff --git a/csrc/transformer/inference/includes/context.h b/csrc/transformer/inference/includes/context.h index 21f0b3cfe07b..75dd625eab8f 100644 --- a/csrc/transformer/inference/includes/context.h +++ b/csrc/transformer/inference/includes/context.h @@ -89,6 +89,7 @@ class Context { cudaMalloc(&_workspace, size); } + if (!_workspace) { throw std::runtime_error("Workspace is null."); } _workSpaceSize = size; } diff --git a/csrc/transformer/inference/includes/custom_cuda_layers.h b/csrc/transformer/inference/includes/custom_cuda_layers.h index a48b2d7f06cc..cfcceee9a3ba 100644 --- a/csrc/transformer/inference/includes/custom_cuda_layers.h +++ b/csrc/transformer/inference/includes/custom_cuda_layers.h @@ -17,7 +17,7 @@ #include #include -#define MAX_OUT_TOKES 1024 +#define MAX_OUT_TOKES 128 #define MAX_WARP_NUM 32 #define WARP_SIZE 32 @@ -28,6 +28,8 @@ template void launch_attn_softmax_v2(T* vals, T* mask, + T* alibi, + float layer_scale, bool triangular, bool recompute, bool local_attention, diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index b5e68aaf297f..06c5e9a23342 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -18,8 +18,11 @@ from ..pipe import PipelineModule from ..moe.utils import has_moe_layers from ..moe.layer import MoE +from ..runtime.zero import GatheredParameters +from ..module_inject import LinearAllreduce, LinearLayer, GatherEmbedding, Normalize, ReplaceWithTensorSlicing DS_INFERENCE_ENABLED = False +from torch import nn class InferenceEngine(Module): @@ -124,7 +127,8 @@ def __init__(self, moe, moe_experts, moe_type, - training_mp_size) + training_mp_size, + self.checkpoint) elif replace_method == 'auto': self._apply_injection_policy( 
return_tuple=return_tuple, @@ -132,7 +136,8 @@ def __init__(self, moe=moe, moe_experts=moe_experts, moe_type=moe_type, - training_mp_size=training_mp_size) + training_mp_size=training_mp_size, + checkpoint=self.checkpoint) device = torch.cuda.current_device() logger.info(f"Place model to device: {device}") @@ -230,6 +235,73 @@ def _validate_args(self, mpu): raise ValueError( f"injection_dict must be None or a dict, got: {self.injection_dict}") + def load_model_with_checkpoint(self, r_module): + self.mp_replace = ReplaceWithTensorSlicing( + mp_group=self.mp_group, + mp_size=self.mp_world_size) #, out_dim=0, in_dim=1) + error_msgs = [] + + def load(module, state_dict, prefix): + args = (state_dict, prefix, {}, True, [], [], error_msgs) + if len(list(module.parameters())) > 0 and list( + module.parameters())[0].numel() == 0: + with GatheredParameters(list(module.parameters(recurse=False)), + modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + if hasattr(module, 'weight'): + if 'query_key_value' in prefix: + module.weight = self.mp_replace.qkv_copy( + module.weight.data, + state_dict[prefix + 'weight']) + else: + module.weight = self.mp_replace.copy( + module.weight.data, + state_dict[prefix + 'weight']) + else: + module.norm.weight = self.mp_replace.copy( + module.norm.weight.data, + state_dict[prefix + 'weight']) + if prefix + 'bias' in self.key_list: + if hasattr(module, 'norm'): + module.norm.bias = self.mp_replace.copy( + module.norm.bias, + state_dict[prefix + 'bias']) + else: + data = state_dict[prefix + 'bias'] + data = data.to(torch.cuda.current_device()) + module.bias = self.mp_replace.copy(module.bias, data) + + layer_policies = { + nn.Linear: load, + nn.Embedding: load, + nn.LayerNorm: load, + LinearLayer: load, + LinearAllreduce: load + } + + def load_module_recursive(module, prefix='', level=0): + for name, child in module.named_children(): + if child.__class__ in layer_policies: + checking_key = prefix + name + '.' 
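# This recursive walk visits every nn.Linear / nn.Embedding / nn.LayerNorm /
# LinearLayer / LinearAllreduce child and copies the matching 'prefix.weight'
# and 'prefix.bias' entries from the currently loaded checkpoint shard
# (self.sd) via the load() closure above: fused query_key_value weights go
# through mp_replace.qkv_copy() so each tensor-parallel rank keeps only its
# q/k/v slice, everything else goes through mp_replace.copy().  The key check
# right below skips children whose parameters are not present in this shard,
# which lets the shards of a sharded (e.g. BLOOM) checkpoint be loaded one
# file at a time.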
+ if not any(checking_key in item for item in self.key_list): + continue + if len(list(child.parameters())) > 0 and list( + child.parameters())[0].numel() == 0: + if len(child.weight.ds_shape) == 1: + child = Normalize(dim=child.weight.ds_shape[-1], + dtype=child.weight.dtype, + eps=child.eps) + setattr(module, name, child) + load(child, self.sd, prefix + name + '.') + else: + load_module_recursive(child, + prefix if level == 0 else prefix + name + '.', + level + 1) + + load_module_recursive(r_module) + def _apply_injection_policy(self, client_module=None, injection_policy=None, @@ -238,8 +310,8 @@ def _apply_injection_policy(self, moe=False, moe_experts=1, moe_type='standard', - training_mp_size=1): - + training_mp_size=1, + checkpoint=None): replace_transformer_layer(client_module, self.module, triangular_masking=self.triangular_masking, @@ -303,34 +375,48 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): else: sd_loader = SDLoaderFactory.get_sd_loader_json(load_dir) - mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() + if type(sd_loader) is list: + self.sd = torch.load(sd_loader[0], map_location='cpu') + self.key_list = list(self.sd.keys()) - load_path, checkpoint, quantize_config = sd_loader.load(self.mp_world_size, - mp_rank, - is_pipe_parallel=is_pipe_parallel, - quantize=(self.dtype is torch.int8), - quantize_groups=self.quantize_groups, - mlp_extra_grouping=self.mlp_extra_grouping) + self.load_model_with_checkpoint(self.module) - self.quantization_scales, self.quantize_merge_count = quantize_config + for i in range(1, len(sd_loader)): + if not torch.distributed.is_initialized() or torch.distributed.get_rank( + ) == 0: + print(f"loading checkpoint ({i})") + self.sd = torch.load(sd_loader[i], map_location='cuda') + self.key_list = list(self.sd.keys()) + self.load_model_with_checkpoint(self.module) + else: + mp_rank = 0 if self.mpu is None else self.mpu.get_model_parallel_rank() - moe, _ = has_moe_layers(self.module) - if moe: - from deepspeed.runtime.engine import DeepSpeedEngine - old_moe_load = False - if not isinstance(checkpoint['num_experts'], list): - old_moe_load = True - DeepSpeedEngine.load_moe_state_dict( - load_dir, - tag, + load_path, checkpoint, quantize_config = sd_loader.load(self.mp_world_size, + mp_rank, + is_pipe_parallel=is_pipe_parallel, + quantize=(self.dtype is torch.int8), + quantize_groups=self.quantize_groups, + mlp_extra_grouping=self.mlp_extra_grouping) + + self.quantization_scales, self.quantize_merge_count = quantize_config + + moe, _ = has_moe_layers(self.module) + if moe: + from deepspeed.runtime.engine import DeepSpeedEngine + old_moe_load = False + if not isinstance(checkpoint['num_experts'], list): + old_moe_load = True + DeepSpeedEngine.load_moe_state_dict( + load_dir, + tag, + state_dict=checkpoint[self._choose_module_key(checkpoint)], + old_moe_load=old_moe_load, + model=self.module, + mpu=self.mpu) + + self.module.load_state_dict( state_dict=checkpoint[self._choose_module_key(checkpoint)], - old_moe_load=old_moe_load, - model=self.module, - mpu=self.mpu) - - self.module.load_state_dict( - state_dict=checkpoint[self._choose_module_key(checkpoint)], - strict=load_module_strict) + strict=load_module_strict) def _choose_module_key(self, sd): assert not ('module' in sd and 'model' in sd), "checkpoint has both 'model' and 'module' keys, not sure how to proceed" diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py index 7fe1a3b36b2e..c7d652a31db9 100755 --- 
a/deepspeed/module_inject/__init__.py +++ b/deepspeed/module_inject/__init__.py @@ -1,3 +1,4 @@ from .replace_module import replace_transformer_layer, revert_transformer_layer from .module_quantize import quantize_transformer_layer from .replace_policy import DSPolicy, HFBertLayerPolicy +from .replace_module import LinearAllreduce, LinearLayer, GatherEmbedding, Normalize, ReplaceWithTensorSlicing diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 16afaefab495..00677ddb76bf 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -2,12 +2,15 @@ import torch import deepspeed import deepspeed.ops.transformer as transformer_inference -from .replace_policy import HFBertLayerPolicy, HFGPT2LayerPolicy, HFGPTJLayerPolicy +from .replace_policy import HFBertLayerPolicy, HFGPT2LayerPolicy, HFGPTJLayerPolicy, BLOOMLayerPolicy from .replace_policy import replace_policies from ..constants import INFERENCE_GENERIC_MODE, INFERENCE_SPECIALIZED_MODE from ..runtime.weight_quantizer import WeightQuantization from torch import nn from deepspeed import comm as dist +from torch import nn +from torch.nn import functional as F +#from transformers.deepspeed import is_deepspeed_zero3_enabled class LinearAllreduce(nn.Module): @@ -18,7 +21,7 @@ def __init__(self, weight, bias=None, mp_group=None): self.mp_group = mp_group def forward(self, input): - output = torch.matmul(input, self.weight) + output = torch.matmul(input, self.weight.transpose(-1, -2)) if self.mp_group is not None: dist.all_reduce(output, group=self.mp_group) if self.bias is not None: @@ -33,18 +36,46 @@ def __init__(self, weight, bias=None): self.bias = bias def forward(self, input): - output = torch.matmul(input, self.weight) + output = torch.matmul(input, self.weight.transpose(-1, -2)) if self.bias is not None: output += self.bias return output +class Normalize(nn.Module): + def __init__(self, dim, dtype=torch.float, eps=1e-5): + super(Normalize, self).__init__() + self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(torch.cuda.current_device()) + self.weight = self.norm.weight + self.bias = self.norm.bias + + def forward(self, input): + return self.norm(input) + + +class GatherEmbedding(nn.Module): + def __init__(self, weight_shape, dtype=torch.float, mp_size=1, mp_group=None): + super(GatherEmbedding, self).__init__() + self.weight = torch.empty(weight_shape).to(dtype).to(torch.cuda.current_device()) + self.mp_size = mp_size + self.mp_group = mp_group + + def forward(self, input): + output = F.embedding(input, self.weight) + out_list = [torch.empty_like(output) for _ in range(self.mp_size)] + torch.distributed.all_gather(out_list, output, group=self.mp_group) + return torch.cat(out_list, dim=-1) + + class ReplaceWithTensorSlicing: - def __init__(self, mp_group=None): + def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0): if mp_group is not None: self.gpu_index = dist.get_rank(group=mp_group) else: self.gpu_index = 0 + self.out_dim = out_dim + self.in_dim = in_dim + self.mp_size = mp_size def merge_assert(self, dim1, dim2): assert dim1 > dim2, \ @@ -57,68 +88,86 @@ def qkv_copy(self, dst, src): return src src_shape = src.shape dst_shape = dst.shape - - src_split = torch.split(src.data, src.shape[-1] // 3, dim=-1) + if self.out_dim == 0: + src_split = torch.split(src.data, + src_shape[self.out_dim] // self.mp_size, + dim=0) + else: + src_split = torch.split(src.data, src.shape[-1] // 3, dim=-1) if (len(src_shape) == 2 and 
len(dst_shape) == 2): - if src_shape[1] == dst_shape[1]: - return torch.nn.Parameter(src) - - self.merge_assert(src_shape[1], dst_shape[1]) - qkv_size = dst_shape[1] // 3 - qkv_split = [torch.split(src_s, qkv_size, dim=1) for src_s in src_split] - - weight_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=1) for i in range(len(qkv_split[0])) - ] - dst.data.copy_(weight_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + if src_shape[self.out_dim] == dst_shape[self.out_dim]: + return torch.nn.parameter.Parameter(src) + if self.out_dim == 1: + self.merge_assert(src_shape[self.out_dim], dst_shape[self.out_dim]) + qkv_size = dst_shape[self.out_dim] // 3 + qkv_split = [ + torch.split(src_s, + qkv_size, + dim=self.out_dim) for src_s in src_split + ] + + weight_split = [ + torch.cat([qkv_s[i] for qkv_s in qkv_split], + axis=self.out_dim) for i in range(len(qkv_split[0])) + ] + dst.data.copy_(weight_split[self.gpu_index].to( + torch.cuda.current_device()).contiguous()) + else: + dst.data.copy_(src_split[self.gpu_index].to( + torch.cuda.current_device()).contiguous()) else: if src_shape[0] == dst_shape[0]: - return torch.nn.Parameter(src) - - qkv_size = dst_shape[0] // 3 - qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] - bias_split = [ - torch.cat([qkv_s[i] for qkv_s in qkv_split], - axis=0) for i in range(len(qkv_split[0])) - ] - dst.data.copy_(bias_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + return torch.nn.parameter.Parameter(src) + if self.out_dim == 1: + qkv_size = dst_shape[0] // 3 + qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split] + bias_split = [ + torch.cat([qkv_s[i] for qkv_s in qkv_split], + axis=0) for i in range(len(qkv_split[0])) + ] + dst.data.copy_(bias_split[self.gpu_index].to( + torch.cuda.current_device()).contiguous()) + else: + dst.data.copy_(src_split[self.gpu_index].to( + torch.cuda.current_device()).contiguous()) - return torch.nn.Parameter(dst) + return torch.nn.parameter.Parameter(dst) def copy(self, dst, src): if src is None: return src - + #src = src.to(dst.dtype) src_shape = src.shape dst_shape = dst.shape - if (len(src_shape) == 2 and len(dst_shape) == 2): if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1]: - return torch.nn.Parameter(src) - - if src_shape[0] != dst_shape[0]: - self.merge_assert(src_shape[0], dst_shape[0]) - weight_split = torch.split(src, dst_shape[0]) + dst.data.copy_(src) else: - self.merge_assert(src_shape[1], dst_shape[1]) - weight_split = torch.split(src.data, dst_shape[1], dim=1) - - dst.data.copy_(weight_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + if src_shape[self.in_dim] != dst_shape[self.in_dim]: + self.merge_assert(src_shape[self.in_dim], dst_shape[self.in_dim]) + weight_split = torch.split(src, + dst_shape[self.in_dim], + dim=self.in_dim) + else: + self.merge_assert(src_shape[self.out_dim], dst_shape[self.out_dim]) + weight_split = torch.split(src.data, + dst_shape[self.out_dim], + dim=self.out_dim) + dst.data.copy_(weight_split[self.gpu_index].to( + torch.cuda.current_device()).contiguous()) else: if src_shape[0] == dst_shape[0]: - return torch.nn.Parameter(src) - - bias_split = torch.split(src.data, dst_shape[-1]) - dst.data.copy_(bias_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + dst.data.copy_(src) + else: + #print(f'{torch.distributed.get_rank()}:{src.data.shape}, {dst_shape}\n') + #exit() + bias_split = torch.split(src.data, dst_shape[-1]) + 
dst.data.copy_(bias_split[self.gpu_index].to( + torch.cuda.current_device()).contiguous()) - return torch.nn.Parameter(dst) + return torch.nn.parameter.Parameter(dst, requires_grad=False) def replace_transformer_layer(orig_layer_impl, @@ -206,6 +255,7 @@ def replace_with_policy(child, moe = True attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention, megatron_v2 = policy.attention() + #print(f"********************* {torch.distributed.get_rank()}: {dense_b}") if not moe or moe_type == 'standard': mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp() else: @@ -243,7 +293,8 @@ def replace_with_policy(child, _res_4hh_w = _res_4hh_w.half() _res_coef = _res_coef.half() - mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) + mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group, + mp_size=mp_size) #, out_dim=0, in_dim=1) #expert_mp_replace = ReplaceWithTensorSlicing(mp_group=expert_mp_group) if inference: @@ -267,6 +318,7 @@ def replace_with_policy(child, else: rotary_dim = config.rotary_dim if hasattr(config, 'rotary_dim') else child.attention.rotary_ndims \ if hasattr(child, 'attention') and hasattr(child.attention,'rotary_ndims') else -1 + bigscience_bloom = policy_cls is BLOOMLayerPolicy transformer_config = transformer_inference.DeepSpeedInferenceConfig( hidden_size=hidden_size, heads=num_attention_heads, @@ -291,7 +343,8 @@ def replace_with_policy(child, 'window_size') else 1), rotary_dim=rotary_dim, mlp_after_attn=(rotary_dim is None or rotary_dim < 0), - training_mp_size=training_mp_size) + training_mp_size=training_mp_size, + bigscience_bloom=bigscience_bloom) if quantize and quantize_settings is not None: (quantization_scales, @@ -353,58 +406,78 @@ def replace_with_policy(child, # transpose it here to reduce inference cost! 
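# (the policies hand back the original [out_features, in_features] weights;
#  they are transposed once here, at injection time, so the inference GEMM
#  kernels can consume them directly instead of paying for a transpose on
#  every forward pass)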
def transpose(data): # temp move to cpu to avoid requiring extra GPU memory during the reshape - data = data.to('cpu') data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) data = data.reshape(data.shape[-1], data.shape[-2]) - data.to(torch.cuda.current_device()) return data + from ..runtime.zero import GatheredParameters if attn_linear_layer: - qkvw.data = transpose(qkvw.data) - dense_w.data = transpose(dense_w.data) + if qkvw.numel() == 0: + with GatheredParameters([qkvw, + dense_w, + qkvb, + dense_b], + modifier_rank=0): + qkvw = transpose(qkvw.data) + dense_w = transpose(dense_w.data) + qkvb = qkvb.data + dense_b = dense_b.data + else: + qkvw.data = transpose(qkvw.data) + dense_w.data = transpose(dense_w.data) + + def _transpose(x): + num_attention_heads_per_partition = transformer_config.heads // transformer_config.mp_size + attention_head_size = x.shape[-1] // num_attention_heads_per_partition + new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, + attention_head_size) + x_1 = x.view(*new_x_shape) + (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1)) + if len(q.shape) > 2: + return torch.cat((q.reshape(q.shape[0], + -1), + k.reshape(q.shape[0], + -1), + v.reshape(q.shape[0], + -1)), + dim=-1).reshape(x.shape) + else: + return torch.cat((q.reshape(-1), + k.reshape(-1), + v.reshape(-1)), + dim=-1).reshape(x.shape) if megatron_v2: new_module.config.rotate_half = True new_module.config.rotate_every_two = False - def _transpose(x): - num_attention_heads_per_partition = transformer_config.heads // transformer_config.mp_size - attention_head_size = x.shape[-1] // num_attention_heads_per_partition - new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, - attention_head_size) - x_1 = x.view(*new_x_shape) - (q, - k, - v) = torch.split(x_1, - (x_1.shape[-1] // 3), - dim=(x_1.dim() - 1)) - if len(q.shape) > 2: - return torch.cat((q.reshape(q.shape[0], - -1), - k.reshape(q.shape[0], - -1), - v.reshape(q.shape[0], - -1)), - dim=-1).reshape(x.shape) - else: - return torch.cat((q.reshape(-1), - k.reshape(-1), - v.reshape(-1)), - dim=-1).reshape(x.shape) - - qkvw = torch.nn.Parameter(_transpose(qkvw).contiguous()) - qkvb = torch.nn.Parameter(_transpose(qkvb).contiguous()) + # Note: this part needs to be added for BLOOM architecture + qkvw = torch.nn.parameter.Parameter(_transpose(qkvw).contiguous()) + qkvb = torch.nn.parameter.Parameter(_transpose(qkvb).contiguous()) - dense_b = dense_b if dense_b is None else dense_b * ( - transformer_config.training_mp_size / transformer_config.mp_size) - _4hh_b = _4hh_b * (transformer_config.training_mp_size / - transformer_config.mp_size) + # NOTE: This part caused instability in the multi-GPU inference! + # TODO: This needs to be incorporated in the kernels. 
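# (the disabled lines below rescaled the attention-output and MLP-output
#  biases by training_mp_size / mp_size, presumably to compensate for the
#  bias contribution being accumulated across model-parallel ranks; per the
#  NOTE above this rescaling destabilized multi-GPU inference, so it is left
#  commented out until the kernels account for it)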
+ #dense_b = dense_b if dense_b is None else dense_b * ( + # transformer_config.training_mp_size / transformer_config.mp_size) + #_4hh_b = _4hh_b * (transformer_config.training_mp_size / + # transformer_config.mp_size) if mlp_linear_layer: - _h4h_w = [transpose(moe_w1.data) - for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data) - _4hh_w = [transpose(moe_w1.data) - for moe_w1 in _4hh_w] if moe else transpose(_4hh_w.data) + if not moe and _4hh_w.numel() == 0: + with GatheredParameters([_h4h_w, + _4hh_w, + _4hh_b, + _h4h_b], + modifier_rank=0): + _h4h_w = transpose(_h4h_w.data) + _4hh_w = transpose(_4hh_w.data) + _h4h_b = _h4h_b.data + _4hh_b = _4hh_b.data + else: + _h4h_w = [transpose(moe_w1.data) + for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data) + _4hh_w = [transpose(moe_w1.data) + for moe_w1 in _4hh_w] if moe else transpose(_4hh_w.data) if moe and moe_type == 'residual': _res_h4h_w.data = transpose(_res_h4h_w.data) @@ -412,11 +485,24 @@ def _transpose(x): _res_coef.data = transpose(_res_coef.data) attn_block = new_module.attention - attn_block.attn_qkvw = mp_replace.qkv_copy(attn_block.attn_qkvw, qkvw) - attn_block.attn_qkvb = mp_replace.qkv_copy(attn_block.attn_qkvb, qkvb) - attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) - attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b) + if qkvw.numel() == 0: + with GatheredParameters([attn_qkvw, + attn_qkvb, + attn_ow, + attn_ob], + modifier_rank=0): + attn_block.attn_qkvw = mp_replace.copy(attn_block.attn_qkvw, qkvw) + attn_block.attn_qkvb = mp_replace.copy(attn_block.attn_qkvb, qkvb) + + attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) + attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b) + else: + attn_block.attn_qkvw = mp_replace.copy(attn_block.attn_qkvw, qkvw) + attn_block.attn_qkvb = mp_replace.copy(attn_block.attn_qkvb, qkvb) + + attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) + attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b) mpl_block = new_module.mlp if moe: @@ -448,20 +534,54 @@ def _transpose(x): torch.cuda.current_device()) new_module.res_coef.data = _res_coef.to(torch.cuda.current_device()) else: - mpl_block.inter_w.data = mp_replace.copy(mpl_block.inter_w, _h4h_w) - mpl_block.inter_b.data = mp_replace.copy(mpl_block.inter_b, _h4h_b) - mpl_block.output_w.data = mp_replace.copy(mpl_block.output_w, _4hh_w) - mpl_block.output_b.data = mp_replace.copy(mpl_block.output_b, _4hh_b) - if attn_nw is None: - new_module.mlp.attn_nw = attn_nw + + if _4hh_w.numel() == 0: + with GatheredParameters([_h4h_w, + _4hh_w, + _4hh_w, + _4hh_b], + modifier_rank=0): + mpl_block.inter_w = mp_replace.copy(mpl_block.inter_w, _h4h_w) + mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b) + mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w) + mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b) else: - new_module.mlp.attn_nw.data = attn_nw.to(torch.cuda.current_device()) - if attn_nb is None: - new_module.mlp.attn_nb = attn_nb + mpl_block.inter_w = mp_replace.copy(mpl_block.inter_w, _h4h_w) + mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b) + mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w) + mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b) + if attn_nw.numel() == 0: + with GatheredParameters([attn_nw, attn_nb], modifier_rank=0): + if attn_nw is None: + new_module.mlp.attn_nw = attn_nw + else: + new_module.mlp.attn_nw.data.copy_( + attn_nw.to(torch.cuda.current_device())) 
+ if attn_nb is None: + new_module.mlp.attn_nb = attn_nb + else: + new_module.mlp.attn_nb.data.copy_( + attn_nb.to(torch.cuda.current_device())) else: - new_module.mlp.attn_nb.data = attn_nb.to(torch.cuda.current_device()) - new_module.norm_w.data = input_nw.to(torch.cuda.current_device()) - new_module.norm_b.data = input_nb.to(torch.cuda.current_device()) + if attn_nw is None: + new_module.mlp.attn_nw = attn_nw + else: + new_module.mlp.attn_nw.data.copy_( + attn_nw.to(torch.cuda.current_device())) + if attn_nb is None: + new_module.mlp.attn_nb = attn_nb + else: + new_module.mlp.attn_nb.data.copy_( + attn_nb.to(torch.cuda.current_device())) + if input_nw.numel() == 0: + with GatheredParameters([input_nw, input_nb], modifier_rank=0): + new_module.norm_w.data.copy_(input_nw.to( + torch.cuda.current_device())) + new_module.norm_b.data.copy_(input_nb.to( + torch.cuda.current_device())) + else: + new_module.norm_w.data.copy_(input_nw.to(torch.cuda.current_device())) + new_module.norm_b.data.copy_(input_nb.to(torch.cuda.current_device())) else: transformer_config = deepspeed.DeepSpeedTransformerConfig( batch_size=micro_batch_size if micro_batch_size > 0 else 1, @@ -502,58 +622,86 @@ def _transpose(x): def replace_wo_policy(module, all_reduce_linears): def _replace(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) + z_inference = (len(list(child.parameters())) > 0) and (list( + child.parameters())[0].numel() == 0) + if z_inference: + weight_shape = child.weight.ds_shape + else: + weight_shape = child.weight.shape if name in all_reduce_linears: - new_weight = torch.empty( - (child.weight.shape[0] - if conv_linear_layer else child.weight.shape[1] // mp_size, - child.weight.shape[1] - if conv_linear_layer else child.weight.shape[0]), - device=child.weight.device, - dtype=torch.half if fp16 else torch.float) - if not conv_linear_layer: - child.weight.data.view(-1).copy_( - child.weight.data.transpose(-1, - -2).contiguous().view(-1)) - child.weight.data = child.weight.data.reshape( - child.weight.data.shape[-1], - child.weight.data.shape[-2]) - data = mp_replace.copy(new_weight, - child.weight.data).to(torch.cuda.current_device()) + new_weight = torch.empty(( + weight_shape[1] if conv_linear_layer else weight_shape[0], + (weight_shape[0] if conv_linear_layer else weight_shape[1]) // + mp_size, + ), + device=child.weight.device, + dtype=child.weight.dtype) + if z_inference: + with deepspeed.zero.GatheredParameters(child.weight, + modifier_rank=0): + data = child.weight.data.to(new_weight.device) + if conv_linear_layer: + data = data.transpose(-1, -2).contiguous() + data = mp_replace.copy(new_weight, data) + else: + if conv_linear_layer: + child.weight.data = child.weight.data.transpose(-1, + -2).contiguous() + data = mp_replace.copy(new_weight, child.weight.data) + new_bias = torch.empty((weight_shape[0]), + device=child.weight.device, + dtype=child.weight.dtype) + + if z_inference: + with deepspeed.zero.GatheredParameters(child.bias, modifier_rank=0): + new_bias.data.copy_(child.bias.data) + else: + new_bias.data.copy_(child.bias.data) return LinearAllreduce(data, child.bias if child.bias is None else \ - child.bias.to(torch.cuda.current_device()), mp_group) + torch.nn.parameter.Parameter(new_bias.to(torch.cuda.current_device())), mp_group) else: - new_weight = torch.empty( - (child.weight.shape[0] // - mp_size if conv_linear_layer else child.weight.shape[1], - child.weight.shape[1] - if conv_linear_layer else child.weight.shape[0] // mp_size), - 
device=child.weight.device, - dtype=torch.half if fp16 else torch.float) - if not conv_linear_layer: - child.weight.data.view(-1).copy_( - child.weight.data.transpose(-1, - -2).contiguous().view(-1)) - child.weight.data = child.weight.data.reshape( - child.weight.data.shape[-1], - child.weight.data.shape[-2]) - data = mp_replace.copy(new_weight, child.weight.data) - new_bias = torch.empty((child.weight.shape[1] // mp_size), + new_weight = torch.empty(( + (weight_shape[1] if conv_linear_layer else weight_shape[0]) // + mp_size, + weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1], + ), + device=child.weight.device, + dtype=child.weight.dtype) + if z_inference: + with deepspeed.zero.GatheredParameters(child.weight, + modifier_rank=0): + data = child.weight.data.to(new_weight.device) + if conv_linear_layer: + data = data.transpose(-1, -2).contiguous() + data = mp_replace.copy(new_weight, data) + else: + if conv_linear_layer: + child.weight.data = child.weight.data.transpose(-1, + -2).contiguous() + data = mp_replace.copy(new_weight, child.weight.data) + + new_bias = torch.empty((weight_shape[0] // mp_size), device=child.weight.device, - dtype=torch.half if fp16 else torch.float) - bias_data = None if child.bias is None else mp_replace.copy( - new_bias, - child.bias.data).to(torch.cuda.current_device()) + dtype=child.weight.dtype) + if z_inference: + with deepspeed.zero.GatheredParameters(child.bias, modifier_rank=0): + bias_data = None if child.bias is None else mp_replace.copy( + new_bias, + child.bias.data).to(torch.cuda.current_device()) + else: + bias_data = None if child.bias is None else mp_replace.copy( + new_bias, + child.bias.data).to(torch.cuda.current_device()) return LinearLayer(data.to(torch.cuda.current_device()), bias_data) def _slice_embedding(child, name, conv_linear_layer): mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group) - new_weight = torch.empty((child.weight.shape[0], - child.weight.shape[1] // mp_size), + new_weight = torch.empty((weight_shape[0], + weight_shape[1] // mp_size), device=child.weight.device, dtype=child.weight.dtype) - data = mp_replace.copy(new_weight, child.weight.data) - new_embedding = nn.Embedding(child.weight.shape[0], - child.weight.shape[1] // mp_size) + data = mp_replace.copy(new_weight, child.weight.ds_tensor.data) + new_embedding = nn.Embedding(weight_shape[0], weight_shape[1] // mp_size) new_embedding.weight.data.copy_(data) return new_embedding @@ -570,6 +718,8 @@ def update_mp_params(child): child.all_head_size = child.all_head_size // mp_size if hasattr(child, 'embed_dim'): child.embed_dim = child.embed_dim // mp_size + if hasattr(child, 'hidden_size'): + child.hidden_size = child.hidden_size // mp_size conv_linear_layer = False if linear_layer_setting is not None: diff --git a/deepspeed/module_inject/replace_policy.py b/deepspeed/module_inject/replace_policy.py index f4054c633810..59393b1d8477 100755 --- a/deepspeed/module_inject/replace_policy.py +++ b/deepspeed/module_inject/replace_policy.py @@ -321,6 +321,45 @@ def layerNorm(self): self.client_module.ln_1.bias +class BLOOMLayerPolicy(DSPolicy): + _orig_layer_class = None + + def __init__(self, client_module, inference=True): + super().__init__(inference, linear_layer=True) + self.client_module = client_module + try: + import transformers + BLOOMLayerPolicy._orig_layer_class = transformers.models.bloom.modeling_bloom.BloomBlock + except: + BLOOMLayerPolicy._orig_layer_class = None + + def get_hidden_heads(self): + return 
self.client_module.self_attention.hidden_size, \ + self.client_module.self_attention.num_heads + + def attention(self): + return self.linear_layer, \ + self.client_module.self_attention.query_key_value.weight, \ + self.client_module.self_attention.query_key_value.bias, \ + self.client_module.self_attention.dense.weight, \ + self.client_module.self_attention.dense.bias, \ + self.scale_attention, \ + self.is_megatron_v2 + + def mlp(self): + return self.linear_layer, \ + self.client_module.mlp.dense_h_to_4h.weight, \ + self.client_module.mlp.dense_h_to_4h.bias, \ + self.client_module.mlp.dense_4h_to_h.weight, \ + self.client_module.mlp.dense_4h_to_h.bias + + def layerNorm(self): + return self.client_module.post_attention_layernorm.weight, \ + self.client_module.post_attention_layernorm.bias, \ + self.client_module.input_layernorm.weight, \ + self.client_module.input_layernorm.bias + + class GPTNEOXLayerPolicy(DSPolicy): _orig_layer_class = None version = 0 @@ -383,4 +422,5 @@ def layerNorm(self): HFGPTJLayerPolicy, MegatronLayerPolicy, HFGPT2LayerPolicy, + BLOOMLayerPolicy, ] diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 4fa162bc3491..cefe318ceb67 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -48,6 +48,7 @@ class DeepSpeedInferenceConfig(TransformerConfig): scale_attention: If true, both q and k are scaled by 1/sqrt(attention_heads) before attention computation. return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor + bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture. """ def __init__(self, hidden_size=-1, @@ -70,7 +71,8 @@ def __init__(self, rotate_every_two=True, return_tuple=True, mlp_after_attn=True, - training_mp_size=1): + training_mp_size=1, + bigscience_bloom=False): super(DeepSpeedInferenceConfig, self).__init__( hidden_size, @@ -95,6 +97,7 @@ def __init__(self, self.mlp_after_attn = mlp_after_attn self.specialized_mode = False self.training_mp_size = training_mp_size + self.bigscience_bloom = bigscience_bloom @classmethod def from_dict(cls, json_object): @@ -136,7 +139,8 @@ def forward(ctx, q_groups, merge_count, qkv_merging, - score_context_func): + score_context_func, + alibi): def _transpose_for_scores(x, key=False, reshape=False): attention_head_size = x.shape[-1] // num_attention_heads_per_partition new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition, @@ -156,6 +160,143 @@ def _transpose_for_context(x): (hidden_size_per_partition,) return x.view(*new_x_layer_shape).contiguous() + ########### This part is taken/modified form the HF modeling_bloom.py ################ + # Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py + + def split_tensor_along_last_dim(tensor, + num_partitions, + contiguous_split_chunks=True): + """Split a tensor along its last dimension. + + Args: + tensor: ([`torch.tensor`], *required*): + input tensor to split + num_partitions ([`int`], *required*): + number of partitions to split the tensor + contiguous_split_chunks ([`bool`], *optional*, default=`False`):: + If True, make each chunk contiguous in memory. + """ + # Get the size and dimension. 
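# (backup_attention below calls this helper with num_partitions=3 on the
#  [batch, seq_len, num_heads, 3 * head_dim] QKV tensor to recover the
#  per-head query, key and value chunks)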
+ last_dim = tensor.dim() - 1 + numerator, denominator = tensor.size()[last_dim], num_partitions + if not (numerator % denominator == 0): + raise ValueError(f"{numerator} is not divisible by {denominator}") + last_dim_size = numerator // denominator + # Split. + tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) + # Note: torch.split does not create contiguous tensors by default. + if contiguous_split_chunks: + return tuple(chunk.contiguous() for chunk in tensor_list) + + return tensor_list + + def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): + # [batch_size, seq_length, 3 x hidden_size] --> [batch_size, seq_length, num_heads, 3 x head_dim] + head_dim = hidden_size_per_partition // num_attention_heads_per_partition + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + num_attention_heads_per_partition, + 3 * head_dim) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [batch_size, seq_length, num_heads, 3 x head_dim] --> 3 [batch_size, seq_length, num_heads, head_dim] + (query_layer, + key_layer, + value_layer) = split_tensor_along_last_dim(mixed_x_layer, + 3) + + if layer_past is not None: + past_key, past_value = layer_past + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=1) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), + dim=1) + + presents = (key_layer, value_layer) + # [batch_size, head_dim, q_length, k_length] + output_size = (query_layer.size(0), + query_layer.size(2), + query_layer.size(1), + key_layer.size(1)) + # [batch_size, q_length, num_heads, head_dim] -> [q_length, batch_size * num_heads, head_dim] + query_layer = query_layer.transpose(1, + 0).reshape( + output_size[2], + output_size[0] * output_size[1], + -1) + # [batch_size, k_length, num_heads, head_dim] -> [k_length, batch_size * num_heads, head_dim] + key_layer = key_layer.transpose(1, + 0).reshape(output_size[3], + output_size[0] * output_size[1], + -1) + + # slice alibi tensor until the query length + batch_heads = output_size[0] * output_size[1] + offset = torch.distributed.get_rank( + ) * batch_heads if torch.distributed.is_initialized() else 0 + sliced_alibi = alibi[offset:batch_heads + offset, :, :output_size[3]] + + # Raw attention scores. 
[batch_size * num_heads, q_length, k_length] + layer_number = max(1, config.layer_id) + norm_factor = (1 / norm_factor) * (1 / norm_factor) / layer_number + matmul_result = torch.matmul(query_layer.transpose(1, + 0), + key_layer.transpose(1, + 0).transpose( + 1, + 2)) * norm_factor + + # change view to [batch_size, num_heads, q_length, k_length] + attention_scores = matmul_result.view(*output_size) + + attention_probs = inference_cuda_module.softmax_fp16( + attention_scores, + sliced_alibi, + (output_size[-2] > 1), + False, + False, + 1, + False, + layer_number) + + # context layer shape: [batch_size, num_heads, q_length, head_dim] + output_size = (value_layer.size(0), + value_layer.size(2), + query_layer.size(0), + value_layer.size(3)) + # change view [k_length, batch_size x num_heads, head_dim] + value_layer = value_layer.transpose(1, + 0).reshape( + value_layer.size(1), + output_size[0] * output_size[1], + -1) + + # change view [batch_size x num_heads, q_length, k_length] + attention_probs_reshaped = attention_probs.view( + output_size[0] * output_size[1], + output_size[2], + -1) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm(attention_probs_reshaped, + value_layer.transpose(0, + 1)) + + # change view [batch_size, num_heads, q_length, head_dim] + context_layer = context_layer.view(*output_size) + + # [batchs_size, num_heads, q_length, head_dim] --> [q_length, batch_size, num_heads, head_dim] + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + + # [q_length, batch_size, num_heads, head_dim] --> [q_length, batch_size, hidden_size] + new_context_layer_shape = context_layer.size()[:-2] + ( + hidden_size_per_partition, + ) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer, presents + + ###################### End of HF modeling_bloom addition ######################## + def compute_attention(qkv_out, input_mask): no_masking = input_mask is None @@ -244,23 +385,33 @@ def compute_attention(qkv_out, input_mask): return context_layer, presents[0], presents[1] # atten_output, key_layer, value_layer else: - attn_key_value = score_context_func( - qkv_out, - input_mask, - config.rotary_dim, - config.rotate_half, - config.rotate_every_two, - num_attention_heads_per_partition, - (1 / norm_factor if config.scale_attention else 1.0), - config.triangular_masking, - config.local_attention, - config.window_size, - no_masking, - config.layer_id, - DeepSpeedTransformerInference.layer_id) - - context_layer, key_layer, value_layer = attn_key_value - return context_layer, key_layer, value_layer + # Note: This modification is added for the BLOOM-176B model and will be removed later! 
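# When config.bigscience_bloom is set, compute_attention falls back to the
# PyTorch backup_attention() path above: the raw scores are computed with the
# 1/layer_number-adjusted norm factor, and softmax_fp16 receives the
# rank-local ALiBi slice plus layer_number as its layer_scale argument (the
# alibi branch added to softmax.cu multiplies the scores back by layer_scale
# before adding the bias, so the fp16 matmul runs at the reduced scale without
# changing the final probabilities).  For other models the fused
# score_context_func kernel is used, with the ALiBi tensor sliced per rank in
# the same way (offset = rank * batch_size * num_heads_per_partition).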
+ if config.bigscience_bloom: + context_layer, presents = backup_attention(qkv_out, layer_past, alibi, input_mask, norm_factor) + return context_layer, presents[0], presents[1] #key_layer, value_layer + else: + if alibi is not None: + batch_heads = qkv_out.shape[0] * num_attention_heads_per_partition + offset = torch.distributed.get_rank( + ) * batch_heads if torch.distributed.is_initialized() else 0 + sliced_alibi = alibi[offset:batch_heads + offset, :, :] + attn_key_value = score_context_func( + qkv_out, + input_mask, + config.rotary_dim, + config.rotate_half, + config.rotate_every_two, + num_attention_heads_per_partition, + (1 / norm_factor if config.scale_attention else 1.0), + config.triangular_masking, + config.local_attention, + config.window_size, + no_masking, + config.layer_id, + DeepSpeedTransformerInference.layer_id, + sliced_alibi if alibi is not None else torch.empty(1)) + context_layer, key_layer, value_layer = attn_key_value + return context_layer, key_layer, value_layer def selfAttention_fp(): vector_matmul_func = inference_cuda_module.vector_matmul_fp16 if config.fp16 else \ @@ -276,7 +427,6 @@ def selfAttention_fp(): else: qkv_func = inference_cuda_module.qkv_gemm_fp16 if config.fp16 else \ inference_cuda_module.qkv_gemm_fp32 - qkv_out = qkv_func(input, attn_qkvw, (attn_qkvb if attn_qkvb is not None else norm_b), @@ -285,7 +435,6 @@ def selfAttention_fp(): config.epsilon, (attn_qkvb is not None), DeepSpeedTransformerInference.layer_id) - context_layer, key_layer, value_layer = compute_attention(qkv_out[0] if isinstance(qkv_out, list) else qkv_out, input_mask) output = vector_matmul_func(context_layer, attn_ow, False) @@ -347,19 +496,29 @@ def __init__(self, qkv_merging=False): super(DeepSpeedSelfAttention, self).__init__() self.config = config + data_type = torch.half if config.fp16 else torch.float self.config.layer_id = DeepSpeedSelfAttention.num_layers DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1 self.attn_qkvw = nn.Parameter( - torch.Tensor(self.config.hidden_size, - (self.config.hidden_size // self.config.mp_size) * 3)) + torch.empty(self.config.hidden_size, + (self.config.hidden_size // self.config.mp_size) * 3, + dtype=data_type, + device=torch.cuda.current_device())) self.attn_qkvb = nn.Parameter( - torch.Tensor((self.config.hidden_size // self.config.mp_size) * 3)) + torch.empty((self.config.hidden_size // self.config.mp_size) * 3, + dtype=data_type, + device=torch.cuda.current_device())) self.attn_ow = nn.Parameter( - torch.Tensor(self.config.hidden_size // self.config.mp_size, - self.config.hidden_size)) + torch.empty(self.config.hidden_size // self.config.mp_size, + self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) - self.attn_ob = nn.Parameter(torch.Tensor(self.config.hidden_size)) + self.attn_ob = nn.Parameter( + torch.empty(self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size @@ -389,7 +548,8 @@ def forward(self, encoder_attention_mask=None, output_attentions=False, norm_w=None, - norm_b=None): + norm_b=None, + alibi=None): output = DeepSpeedSelfAttentionFunction.apply( input, input_mask, @@ -414,7 +574,8 @@ def forward(self, self.q_groups, self.merge_count, self.qkv_merging, - self.score_context_func) + self.score_context_func, + alibi) return output @@ -482,6 +643,7 @@ def forward(ctx, 
config.pre_layer_norm, config.mlp_after_attn) output = vector_matmul_func(intermediate, output_w, False) + inference_cuda_module.residual_add( output, residual if config.pre_layer_norm else residual_add, @@ -513,17 +675,33 @@ def __init__(self, super(DeepSpeedMLP, self).__init__() self.config = config - self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size)) + data_type = torch.half if config.fp16 else torch.float + self.attn_nw = nn.Parameter( + torch.empty(self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) + self.attn_nb = nn.Parameter( + torch.empty(self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) self.inter_w = nn.Parameter( - torch.Tensor(self.config.hidden_size, - self.config.intermediate_size // self.config.mp_size)) + torch.empty(self.config.hidden_size, + self.config.intermediate_size // self.config.mp_size, + dtype=data_type, + device=torch.cuda.current_device())) self.inter_b = nn.Parameter( - torch.Tensor(self.config.intermediate_size // self.config.mp_size)) + torch.empty(self.config.intermediate_size // self.config.mp_size, + dtype=data_type, + device=torch.cuda.current_device())) self.output_w = nn.Parameter( - torch.Tensor((self.config.intermediate_size // self.config.mp_size), - self.config.hidden_size)) - self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) + torch.empty((self.config.intermediate_size // self.config.mp_size), + self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) + self.output_b = nn.Parameter( + torch.empty(self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) # used for quantization self.q_scales = q_scales @@ -595,6 +773,7 @@ def __init__(self, self.config.layer_id = DeepSpeedTransformerInference.layer_id DeepSpeedTransformerInference.layer_id += 1 + data_type = torch.half if config.fp16 else torch.float global inference_cuda_module if inference_cuda_module is None: builder = op_builder.InferenceBuilder() @@ -615,8 +794,14 @@ def __init__(self, merge_count, mlp_extra_grouping) - self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size)) - self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size)) + self.norm_w = nn.Parameter( + torch.empty(self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) + self.norm_b = nn.Parameter( + torch.empty(self.config.hidden_size, + dtype=data_type, + device=torch.cuda.current_device())) self.layer_past = None def forward(self, @@ -632,7 +817,9 @@ def forward(self, encoder_hidden_states=None, encoder_attention_mask=None, use_cache=False, + alibi=None, output_attentions=False): + #import pdb;pdb.set_trace() get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask layer_past = layer_past if layer_past is not None else self.layer_past @@ -658,10 +845,11 @@ def forward(self, encoder_attention_mask, output_attentions, self.norm_w, - self.norm_b) + self.norm_b, + alibi) presents = (key, value) self.layer_past = presents - + #import pdb;pdb.set_trace() output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: @@ -673,8 +861,6 @@ def forward(self, self.config.epsilon) output = output.to(input_type) - #print(f'[{deepspeed.comm.get_rank()}] {self.config.layer_id}: {output.norm()}') - #exit() if get_present: output = (output, presents) diff --git 
a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 892394e049ad..128ad121b2c5 100644 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -443,6 +443,7 @@ def __getattr__(self, name): """ Pass through attributes defined in the model if they are not overridden by ds-engine. """ + _module = {} if "module" in self.__dict__: _module = self.__dict__['module'] @@ -944,6 +945,7 @@ def is_replicated(p): @staticmethod def __check_params(model: Module, dtype: torch.dtype) -> None: + return if not all(param.dtype == dtype for param in model.parameters()) and dist.get_rank() == 0: raise ValueError( diff --git a/deepspeed/runtime/state_dict_factory.py b/deepspeed/runtime/state_dict_factory.py index 09887aaa275c..35ccb8d5fd02 100755 --- a/deepspeed/runtime/state_dict_factory.py +++ b/deepspeed/runtime/state_dict_factory.py @@ -22,6 +22,8 @@ def get_sd_loader_json(json_file): sd_type = data['type'] ckpt_list = data['checkpoints'] version = data['version'] + if 'BLOOM' in sd_type or 'Bloom' in sd_type: + return ckpt_list return SDLoaderFactory.get_sd_loader(ckpt_list, sd_type, version) @staticmethod From 6cc23400ca491d8b850ea9b2202cd382fab63c04 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 11 Jul 2022 08:32:15 -0700 Subject: [PATCH 02/22] formatting --- deepspeed/inference/engine.py | 5 ++--- deepspeed/module_inject/replace_module.py | 5 +---- .../ops/transformer/inference/transformer_inference.py | 7 +++---- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 06c5e9a23342..ad0d53a46b3b 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -247,7 +247,7 @@ def load(module, state_dict, prefix): module.parameters())[0].numel() == 0: with GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): - if torch.distributed.get_rank() == 0: + if dist.get_rank() == 0: module._load_from_state_dict(*args) else: if hasattr(module, 'weight'): @@ -382,8 +382,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): self.load_model_with_checkpoint(self.module) for i in range(1, len(sd_loader)): - if not torch.distributed.is_initialized() or torch.distributed.get_rank( - ) == 0: + if not dist.is_initialized() or dist.get_rank() == 0: print(f"loading checkpoint ({i})") self.sd = torch.load(sd_loader[i], map_location='cuda') self.key_list = list(self.sd.keys()) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 00677ddb76bf..855fb0e77a1b 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -63,7 +63,7 @@ def __init__(self, weight_shape, dtype=torch.float, mp_size=1, mp_group=None): def forward(self, input): output = F.embedding(input, self.weight) out_list = [torch.empty_like(output) for _ in range(self.mp_size)] - torch.distributed.all_gather(out_list, output, group=self.mp_group) + dist.all_gather(out_list, output, group=self.mp_group) return torch.cat(out_list, dim=-1) @@ -161,8 +161,6 @@ def copy(self, dst, src): if src_shape[0] == dst_shape[0]: dst.data.copy_(src) else: - #print(f'{torch.distributed.get_rank()}:{src.data.shape}, {dst_shape}\n') - #exit() bias_split = torch.split(src.data, dst_shape[-1]) dst.data.copy_(bias_split[self.gpu_index].to( torch.cuda.current_device()).contiguous()) @@ -255,7 +253,6 @@ def replace_with_policy(child, moe = True attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention, megatron_v2 = 
policy.attention() - #print(f"********************* {torch.distributed.get_rank()}: {dense_b}") if not moe or moe_type == 'standard': mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp() else: diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index cefe318ceb67..02d985fc5c80 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -231,8 +231,7 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): # slice alibi tensor until the query length batch_heads = output_size[0] * output_size[1] - offset = torch.distributed.get_rank( - ) * batch_heads if torch.distributed.is_initialized() else 0 + offset = dist.get_rank() * batch_heads if dist.is_initialized() else 0 sliced_alibi = alibi[offset:batch_heads + offset, :, :output_size[3]] # Raw attention scores. [batch_size * num_heads, q_length, k_length] @@ -392,8 +391,8 @@ def compute_attention(qkv_out, input_mask): else: if alibi is not None: batch_heads = qkv_out.shape[0] * num_attention_heads_per_partition - offset = torch.distributed.get_rank( - ) * batch_heads if torch.distributed.is_initialized() else 0 + offset = dist.get_rank() * batch_heads if dist.is_initialized( + ) else 0 sliced_alibi = alibi[offset:batch_heads + offset, :, :] attn_key_value = score_context_func( qkv_out, From 2a69756b7c5f90c0becc5fbf8db88af8d4fcc6ad Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Wed, 13 Jul 2022 02:37:08 +0500 Subject: [PATCH 03/22] add the checkpoint loading at the same of kernel-injection --- deepspeed/inference/engine.py | 33 +-- deepspeed/module_inject/__init__.py | 4 +- deepspeed/module_inject/layers.py | 71 ++++++ deepspeed/module_inject/load_checkpoint.py | 112 +++++++++ deepspeed/module_inject/replace_module.py | 256 ++++++++++----------- 5 files changed, 328 insertions(+), 148 deletions(-) create mode 100644 deepspeed/module_inject/layers.py create mode 100644 deepspeed/module_inject/load_checkpoint.py diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 06c5e9a23342..cb5926904b92 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -19,7 +19,7 @@ from ..moe.utils import has_moe_layers from ..moe.layer import MoE from ..runtime.zero import GatheredParameters -from ..module_inject import LinearAllreduce, LinearLayer, GatherEmbedding, Normalize, ReplaceWithTensorSlicing +from ..module_inject import LinearAllreduce, LinearLayer, Normalize, ReplaceWithTensorSlicing DS_INFERENCE_ENABLED = False from torch import nn @@ -99,7 +99,7 @@ def __init__(self, assert pkg_version.parse(torch.__version__) >= pkg_version.parse("1.10"), \ "If you want to use cuda graph, please upgrade torch to at least v1.10" - if self.checkpoint: + if self.checkpoint and not replace_with_kernel_inject: self._load_checkpoint(self.checkpoint) # convert model to intended dtype @@ -120,15 +120,16 @@ def __init__(self, if self.injection_dict: for client_module, injection_policy in self.injection_dict.items(): - self._apply_injection_policy(client_module, - injection_policy, - return_tuple, - replace_with_kernel_inject, - moe, - moe_experts, - moe_type, - training_mp_size, - self.checkpoint) + self._apply_injection_policy( + client_module, + injection_policy, + return_tuple, + replace_with_kernel_inject, + moe, + moe_experts, + moe_type, + training_mp_size, + self.checkpoint if replace_with_kernel_inject else 
None) elif replace_method == 'auto': self._apply_injection_policy( return_tuple=return_tuple, @@ -137,7 +138,7 @@ def __init__(self, moe_experts=moe_experts, moe_type=moe_type, training_mp_size=training_mp_size, - checkpoint=self.checkpoint) + checkpoint_dir=self.checkpoint if replace_with_kernel_inject else None) device = torch.cuda.current_device() logger.info(f"Place model to device: {device}") @@ -311,7 +312,8 @@ def _apply_injection_policy(self, moe_experts=1, moe_type='standard', training_mp_size=1, - checkpoint=None): + checkpoint_dir=None): + checkpoint = SDLoaderFactory.get_sd_loader_json(checkpoint_dir) replace_transformer_layer(client_module, self.module, triangular_masking=self.triangular_masking, @@ -333,7 +335,8 @@ def _apply_injection_policy(self, moe=moe, moe_experts=moe_experts, moe_type=moe_type, - training_mp_size=training_mp_size) + training_mp_size=training_mp_size, + checkpoint=checkpoint) def _get_all_ckpt_names(self, checkpoints_path, tag): ckpt_file_pattern = self._get_ckpt_name(checkpoints_path, @@ -435,6 +438,8 @@ def _convert_to_dtype(self): self.quantize_groups) elif self.dtype == torch.half: self.module.half() + elif self.dtype == torch.bfloat16: + self.module.bfloat16() elif self.dtype == torch.float: self.module.float() diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py index c7d652a31db9..6082d0aff862 100755 --- a/deepspeed/module_inject/__init__.py +++ b/deepspeed/module_inject/__init__.py @@ -1,4 +1,4 @@ -from .replace_module import replace_transformer_layer, revert_transformer_layer +from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing from .module_quantize import quantize_transformer_layer from .replace_policy import DSPolicy, HFBertLayerPolicy -from .replace_module import LinearAllreduce, LinearLayer, GatherEmbedding, Normalize, ReplaceWithTensorSlicing +from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py new file mode 100644 index 000000000000..fa71c8d7d4d0 --- /dev/null +++ b/deepspeed/module_inject/layers.py @@ -0,0 +1,71 @@ +import torch +import deepspeed +import deepspeed.ops.transformer as transformer_inference +from deepspeed import comm as dist +from torch import nn +from torch.nn import functional as F + +from torch.nn.parameter import Parameter + + +class LinearAllreduce(nn.Module): + def __init__(self, weight, bias=None, mp_group=None): + super(LinearAllreduce, self).__init__() + self.weight = weight + self.bias = bias + self.mp_group = mp_group + + def forward(self, input): + output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.mp_group is not None: + dist.all_reduce(output, group=self.mp_group) + if self.bias is not None: + output += self.bias + return output + + +class LinearLayer(nn.Module): + def __init__(self, weight_shape=None, dtype=None, weight=None, bias=None): + super(LinearLayer, self).__init__() + if weight is not None: + self.weight = weight + self.bias = bias + else: + self.weight = Parameter( + torch.empty(weight_shape, + dtype=dtype, + device=torch.cuda.current_device())) + self.bias = Parameter( + torch.empty(weight_shape[0], + dtype=dtype, + device=torch.cuda.current_device())) + + def forward(self, input): + output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.bias is not None: + output += self.bias + return output + + +class Normalize(nn.Module): + def __init__(self, dim, dtype=torch.float, 
eps=1e-5): + super(Normalize, self).__init__() + self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(torch.cuda.current_device()) + self.weight = self.norm.weight + self.bias = self.norm.bias + + def forward(self, input): + return self.norm(input) + + +class EmbeddingLayer(nn.Module): + def __init__(self, weight_shape, dtype=torch.float): + super(EmbeddingLayer, self).__init__() + self.weight = Parameter( + torch.empty(weight_shape[0], + weight_shape[1], + dtype=dtype, + device=torch.cuda.current_device())) + + def forward(self, input): + return F.embedding(input, self.weight) diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py new file mode 100644 index 000000000000..2f7547c4c2f5 --- /dev/null +++ b/deepspeed/module_inject/load_checkpoint.py @@ -0,0 +1,112 @@ +import deepspeed +import torch +from torch import nn +from torch.nn import functional as F +import deepspeed.ops.transformer as transformer_inference +from ..runtime.zero import GatheredParameters +from .layers import LinearAllreduce, LinearLayer, Normalize, EmbeddingLayer + + +def load_model_with_checkpoint(r_module, sd, mp_replace): + error_msgs = [] + + def transpose(data): + data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) + data = data.reshape(data.shape[-1], data.shape[-2]) + return data + + def load(module, prefix): + args = (sd, prefix, {}, True, [], [], error_msgs) + + if len(list(module.parameters())) > 0 and list( + module.parameters())[0].numel() == 0: + with GatheredParameters(list(module.parameters(recurse=False)), + modifier_rank=0): + module._load_from_sd(*args) + else: + if hasattr(module, 'weight'): + weight_data = sd[prefix + 'weight'] + module.weight = mp_replace.copy(module.weight.data, weight_data) + if prefix + 'bias' in sd.keys(): + data = sd[prefix + 'bias'] + data = data.to(torch.cuda.current_device()) + module.bias = mp_replace.copy(module.bias.data, data) + + def load_transformer_layer(module, prefix): + module.norm_w.data.copy_(sd[prefix + 'input_layernorm.' + 'weight']) + module.norm_b.data.copy_(sd[prefix + 'input_layernorm.' + 'bias']) + module.attention.attn_qkvw = mp_replace.copy( + module.attention.attn_qkvw.data, + transpose(sd[prefix + 'self_attention.query_key_value.' + 'weight'])) + module.attention.attn_qkvb = mp_replace.copy( + module.attention.attn_qkvb.data, + sd[prefix + 'self_attention.query_key_value.' + 'bias']) + module.attention.attn_ow = mp_replace.copy( + module.attention.attn_ow.data, + transpose(sd[prefix + 'self_attention.dense.' + 'weight'])) + module.attention.attn_ob = mp_replace.copy( + module.attention.attn_ob.data, + sd[prefix + 'self_attention.dense.' + 'bias']) + module.mlp.attn_nw.data.copy_(sd[prefix + 'post_attention_layernorm.' + + 'weight']) + module.mlp.attn_nb.data.copy_(sd[prefix + 'post_attention_layernorm.' + 'bias']) + module.mlp.inter_w = mp_replace.copy( + module.mlp.inter_w.data, + transpose(sd[prefix + 'mlp.dense_h_to_4h.' + 'weight'])) + module.mlp.inter_b = mp_replace.copy(module.mlp.inter_b.data, + sd[prefix + 'mlp.dense_h_to_4h.' + 'bias']) + module.mlp.output_w = mp_replace.copy( + module.mlp.output_w.data, + transpose(sd[prefix + 'mlp.dense_4h_to_h.' + 'weight'])) + module.mlp.output_b = mp_replace.copy(module.mlp.output_b.data, + sd[prefix + 'mlp.dense_4h_to_h.' 
+ 'bias']) + + layer_policies = { + nn.Linear: load, + nn.Embedding: load, + nn.LayerNorm: load, + EmbeddingLayer: load, + LinearLayer: load, + Normalize: load, + transformer_inference.DeepSpeedTransformerInference: load_transformer_layer + } + + all_ds_ids = {} + + def load_module_recursive(module, prefix='', level=0): + for name, child in module.named_children(): + if child.__class__ in layer_policies: + checking_key = prefix + name + '.' + if not any(checking_key in item for item in sd.keys()): + if hasattr(child, 'weight') and \ + hasattr(child.weight, 'ds_id') and \ + child.weight.ds_id in all_ds_ids: + prefix1 = all_ds_ids[child.weight.ds_id] + if child.__class__ is nn.Linear: + child = LinearLayer(weight=all_ds_ids[child.weight.ds_id]) + setattr(module, name, child) + continue + if len(list(child.parameters())) > 0 and list( + child.parameters())[0].numel() == 0: + if child.__class__ is nn.LayerNorm: + child = Normalize(dim=child.weight.ds_shape[-1], + dtype=child.weight.dtype, + eps=child.eps) + setattr(module, name, child) + else: + ds_id = None + if hasattr(child.weight, 'ds_id'): + ds_id = child.weight.ds_id + child = EmbeddingLayer(weight_shape=child.weight.ds_shape, + dtype=child.weight.dtype) + if ds_id is not None: + all_ds_ids[ds_id] = child.weight + setattr(module, name, child) + + layer_policies[child.__class__](child, prefix + name + '.') + else: + load_module_recursive(child, + prefix if level == 0 else prefix + name + '.', + level + 1) + + load_module_recursive(r_module) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 00677ddb76bf..bd93278a3777 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -10,61 +10,10 @@ from deepspeed import comm as dist from torch import nn from torch.nn import functional as F -#from transformers.deepspeed import is_deepspeed_zero3_enabled - -class LinearAllreduce(nn.Module): - def __init__(self, weight, bias=None, mp_group=None): - super(LinearAllreduce, self).__init__() - self.weight = weight - self.bias = bias - self.mp_group = mp_group - - def forward(self, input): - output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.mp_group is not None: - dist.all_reduce(output, group=self.mp_group) - if self.bias is not None: - output += self.bias - return output - - -class LinearLayer(nn.Module): - def __init__(self, weight, bias=None): - super(LinearLayer, self).__init__() - self.weight = weight - self.bias = bias - - def forward(self, input): - output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.bias is not None: - output += self.bias - return output - - -class Normalize(nn.Module): - def __init__(self, dim, dtype=torch.float, eps=1e-5): - super(Normalize, self).__init__() - self.norm = nn.LayerNorm(dim, eps=eps).to(dtype).to(torch.cuda.current_device()) - self.weight = self.norm.weight - self.bias = self.norm.bias - - def forward(self, input): - return self.norm(input) - - -class GatherEmbedding(nn.Module): - def __init__(self, weight_shape, dtype=torch.float, mp_size=1, mp_group=None): - super(GatherEmbedding, self).__init__() - self.weight = torch.empty(weight_shape).to(dtype).to(torch.cuda.current_device()) - self.mp_size = mp_size - self.mp_group = mp_group - - def forward(self, input): - output = F.embedding(input, self.weight) - out_list = [torch.empty_like(output) for _ in range(self.mp_size)] - torch.distributed.all_gather(out_list, output, group=self.mp_group) - return torch.cat(out_list, 
dim=-1) +from ..runtime.zero import GatheredParameters +from .layers import LinearAllreduce, LinearLayer, Normalize, EmbeddingLayer +from .load_checkpoint import load_model_with_checkpoint class ReplaceWithTensorSlicing: @@ -137,7 +86,6 @@ def qkv_copy(self, dst, src): def copy(self, dst, src): if src is None: return src - #src = src.to(dst.dtype) src_shape = src.shape dst_shape = dst.shape if (len(src_shape) == 2 and len(dst_shape) == 2): @@ -147,25 +95,27 @@ def copy(self, dst, src): else: if src_shape[self.in_dim] != dst_shape[self.in_dim]: self.merge_assert(src_shape[self.in_dim], dst_shape[self.in_dim]) - weight_split = torch.split(src, - dst_shape[self.in_dim], - dim=self.in_dim) + weight_split = torch.split( + src, + dst_shape[self.in_dim], + dim=self.in_dim)[self.gpu_index].to( + torch.cuda.current_device()).contiguous() else: self.merge_assert(src_shape[self.out_dim], dst_shape[self.out_dim]) - weight_split = torch.split(src.data, - dst_shape[self.out_dim], - dim=self.out_dim) - dst.data.copy_(weight_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + weight_split = torch.split( + src.data, + dst_shape[self.out_dim], + dim=self.out_dim)[self.gpu_index].to( + torch.cuda.current_device()).contiguous() + dst.data.copy_(weight_split.contiguous()) else: if src_shape[0] == dst_shape[0]: dst.data.copy_(src) else: - #print(f'{torch.distributed.get_rank()}:{src.data.shape}, {dst_shape}\n') - #exit() - bias_split = torch.split(src.data, dst_shape[-1]) - dst.data.copy_(bias_split[self.gpu_index].to( - torch.cuda.current_device()).contiguous()) + bias_split = torch.split(src.data, + dst_shape[-1])[self.gpu_index].to( + torch.cuda.current_device()).contiguous() + dst.data.copy_(bias_split) return torch.nn.parameter.Parameter(dst, requires_grad=False) @@ -196,7 +146,8 @@ def replace_transformer_layer(orig_layer_impl, linear_layer_setting=None, moe=False, moe_experts=1, - moe_type='standard'): + moe_type='standard', + checkpoint=None): """ Replace bert-style transformer layers with DeepSpeed's transformer layer Arguments: orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for, @@ -231,6 +182,9 @@ def replace_transformer_layer(orig_layer_impl, Returns: Updated nn.module with replaced transformer layers """ + mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group, + mp_size=mp_size) #, out_dim=0, in_dim=1) + def replace_with_policy(child, policy_cls, triangular_masking, @@ -293,8 +247,6 @@ def replace_with_policy(child, _res_4hh_w = _res_4hh_w.half() _res_coef = _res_coef.half() - mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group, - mp_size=mp_size) #, out_dim=0, in_dim=1) #expert_mp_replace = ReplaceWithTensorSlicing(mp_group=expert_mp_group) if inference: @@ -410,18 +362,23 @@ def transpose(data): data = data.reshape(data.shape[-1], data.shape[-2]) return data - from ..runtime.zero import GatheredParameters + attn_block = new_module.attention + mpl_block = new_module.mlp + if attn_linear_layer: if qkvw.numel() == 0: - with GatheredParameters([qkvw, - dense_w, - qkvb, - dense_b], - modifier_rank=0): - qkvw = transpose(qkvw.data) - dense_w = transpose(dense_w.data) - qkvb = qkvb.data - dense_b = dense_b.data + if qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel(): + pass + else: + with GatheredParameters([qkvw, + dense_w, + qkvb, + dense_b], + modifier_rank=0): + qkvw = transpose(qkvw.data) + dense_w = transpose(dense_w.data) + qkvb = qkvb.data + dense_b = dense_b.data else: qkvw.data = transpose(qkvw.data) dense_w.data = 
transpose(dense_w.data) @@ -464,15 +421,18 @@ def _transpose(x): if mlp_linear_layer: if not moe and _4hh_w.numel() == 0: - with GatheredParameters([_h4h_w, - _4hh_w, - _4hh_b, - _h4h_b], - modifier_rank=0): - _h4h_w = transpose(_h4h_w.data) - _4hh_w = transpose(_4hh_w.data) - _h4h_b = _h4h_b.data - _4hh_b = _4hh_b.data + if _4hh_w.ds_tensor.numel() < mpl_block.inter_w.numel(): + pass + else: + with GatheredParameters([_h4h_w, + _4hh_w, + _4hh_b, + _h4h_b], + modifier_rank=0): + _h4h_w = transpose(_h4h_w.data) + _4hh_w = transpose(_4hh_w.data) + _h4h_b = _h4h_b.data + _4hh_b = _4hh_b.data else: _h4h_w = [transpose(moe_w1.data) for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data) @@ -484,19 +444,24 @@ def _transpose(x): _res_4hh_w.data = transpose(_res_4hh_w.data) _res_coef.data = transpose(_res_coef.data) - attn_block = new_module.attention - if qkvw.numel() == 0: - with GatheredParameters([attn_qkvw, - attn_qkvb, - attn_ow, - attn_ob], - modifier_rank=0): - attn_block.attn_qkvw = mp_replace.copy(attn_block.attn_qkvw, qkvw) - attn_block.attn_qkvb = mp_replace.copy(attn_block.attn_qkvb, qkvb) - - attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) - attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b) + if qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel(): + pass + else: + with GatheredParameters([attn_qkvw, + attn_qkvb, + attn_ow, + attn_ob], + modifier_rank=0): + attn_block.attn_qkvw = mp_replace.copy( + attn_block.attn_qkvw, + qkvw) + attn_block.attn_qkvb = mp_replace.copy( + attn_block.attn_qkvb, + qkvb) + + attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) + attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b) else: attn_block.attn_qkvw = mp_replace.copy(attn_block.attn_qkvw, qkvw) attn_block.attn_qkvb = mp_replace.copy(attn_block.attn_qkvb, qkvb) @@ -504,7 +469,6 @@ def _transpose(x): attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w) attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b) - mpl_block = new_module.mlp if moe: gpu_index = dist.get_rank() gpu_index = 0 @@ -536,32 +500,46 @@ def _transpose(x): else: if _4hh_w.numel() == 0: - with GatheredParameters([_h4h_w, - _4hh_w, - _4hh_w, - _4hh_b], - modifier_rank=0): - mpl_block.inter_w = mp_replace.copy(mpl_block.inter_w, _h4h_w) - mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b) - mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w) - mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b) + if _4hh_w.ds_tensor.numel() < mpl_block.inter_w.numel(): + pass + else: + with GatheredParameters([_h4h_w, + _4hh_w, + _4hh_w, + _4hh_b], + modifier_rank=0): + mpl_block.inter_w = mp_replace.copy( + mpl_block.inter_w, + _h4h_w) + mpl_block.inter_b = mp_replace.copy( + mpl_block.inter_b, + _h4h_b) + mpl_block.output_w = mp_replace.copy( + mpl_block.output_w, + _4hh_w) + mpl_block.output_b = mp_replace.copy( + mpl_block.output_b, + _4hh_b) else: mpl_block.inter_w = mp_replace.copy(mpl_block.inter_w, _h4h_w) mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b) mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w) mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b) if attn_nw.numel() == 0: - with GatheredParameters([attn_nw, attn_nb], modifier_rank=0): - if attn_nw is None: - new_module.mlp.attn_nw = attn_nw - else: - new_module.mlp.attn_nw.data.copy_( - attn_nw.to(torch.cuda.current_device())) - if attn_nb is None: - new_module.mlp.attn_nb = attn_nb - else: - 
new_module.mlp.attn_nb.data.copy_( - attn_nb.to(torch.cuda.current_device())) + if attn_nw.ds_tensor.numel() < new_module.mlp.attn_nw.numel(): + pass + else: + with GatheredParameters([attn_nw, attn_nb], modifier_rank=0): + if attn_nw is None: + new_module.mlp.attn_nw = attn_nw + else: + new_module.mlp.attn_nw.data.copy_( + attn_nw.to(torch.cuda.current_device())) + if attn_nb is None: + new_module.mlp.attn_nb = attn_nb + else: + new_module.mlp.attn_nb.data.copy_( + attn_nb.to(torch.cuda.current_device())) else: if attn_nw is None: new_module.mlp.attn_nw = attn_nw @@ -574,11 +552,14 @@ def _transpose(x): new_module.mlp.attn_nb.data.copy_( attn_nb.to(torch.cuda.current_device())) if input_nw.numel() == 0: - with GatheredParameters([input_nw, input_nb], modifier_rank=0): - new_module.norm_w.data.copy_(input_nw.to( - torch.cuda.current_device())) - new_module.norm_b.data.copy_(input_nb.to( - torch.cuda.current_device())) + if input_nw.ds_tensor.numel() < new_module.norm_w.numel(): + pass + else: + with GatheredParameters([input_nw, input_nb], modifier_rank=0): + new_module.norm_w.data.copy_( + input_nw.to(torch.cuda.current_device())) + new_module.norm_b.data.copy_( + input_nb.to(torch.cuda.current_device())) else: new_module.norm_w.data.copy_(input_nw.to(torch.cuda.current_device())) new_module.norm_b.data.copy_(input_nb.to(torch.cuda.current_device())) @@ -643,6 +624,7 @@ def _replace(child, name, conv_linear_layer): if conv_linear_layer: data = data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, data) + child.weight.ds_tensor = torch.empty(1) else: if conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, @@ -674,6 +656,7 @@ def _replace(child, name, conv_linear_layer): if conv_linear_layer: data = data.transpose(-1, -2).contiguous() data = mp_replace.copy(new_weight, data) + child.weight.ds_tensor = torch.empty(1) else: if conv_linear_layer: child.weight.data = child.weight.data.transpose(-1, @@ -776,10 +759,19 @@ def replace_fn(child, _policy, layer_id=0): return new_module - return replace_module(model=model, - orig_class=orig_layer_impl, - replace_fn=replace_fn, - _replace_policy=policy) + replaced_module = replace_module(model=model, + orig_class=orig_layer_impl, + replace_fn=replace_fn, + _replace_policy=policy) + + if checkpoint is not None: + for i in range(len(checkpoint)): + if not torch.distributed.is_initialized() or torch.distributed.get_rank( + ) == 0: + print(f"loading checkpoint ({i})") + sd = torch.load(checkpoint[i], map_location='cpu') + load_model_with_checkpoint(replaced_module, sd, mp_replace) + return replaced_module def revert_transformer_layer(orig_layer_impl, model, config, preln=False): From 6e304aa889eb2e8efec647023e56a5581e3547a4 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Wed, 13 Jul 2022 03:22:40 +0500 Subject: [PATCH 04/22] releasing checkpoint CPU-memory after loading it --- deepspeed/module_inject/load_checkpoint.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index 2f7547c4c2f5..3c6d4e960d3e 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -11,9 +11,10 @@ def load_model_with_checkpoint(r_module, sd, mp_replace): error_msgs = [] def transpose(data): - data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) - data = data.reshape(data.shape[-1], data.shape[-2]) - return data + data1 = data.transpose(-1, -2).reshape(-1) 
+ data.reshape(-1).copy_(data1) + data1 = None + return data.reshape(data.shape[-1], data.shape[-2]) def load(module, prefix): args = (sd, prefix, {}, True, [], [], error_msgs) @@ -25,12 +26,10 @@ def load(module, prefix): module._load_from_sd(*args) else: if hasattr(module, 'weight'): - weight_data = sd[prefix + 'weight'] - module.weight = mp_replace.copy(module.weight.data, weight_data) + module.weight = mp_replace.copy(module.weight.data, + sd[prefix + 'weight']) if prefix + 'bias' in sd.keys(): - data = sd[prefix + 'bias'] - data = data.to(torch.cuda.current_device()) - module.bias = mp_replace.copy(module.bias.data, data) + module.bias = mp_replace.copy(module.bias.data, sd[prefix + 'bias']) def load_transformer_layer(module, prefix): module.norm_w.data.copy_(sd[prefix + 'input_layernorm.' + 'weight']) @@ -110,3 +109,5 @@ def load_module_recursive(module, prefix='', level=0): level + 1) load_module_recursive(r_module) + del sd + sd = None From 16dd1dc02d4cdeffa139cfe2db1bb14ac2ef583e Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Wed, 13 Jul 2022 04:52:29 +0500 Subject: [PATCH 05/22] some fixes plus formatting --- csrc/transformer/inference/csrc/gelu.cu | 40 +++++++++---------- csrc/transformer/inference/csrc/normalize.cu | 2 +- deepspeed/inference/engine.py | 3 +- deepspeed/module_inject/replace_module.py | 3 +- .../inference/transformer_inference.py | 1 - 5 files changed, 24 insertions(+), 25 deletions(-) diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu index c8736d563623..c3d65fa037c2 100644 --- a/csrc/transformer/inference/csrc/gelu.cu +++ b/csrc/transformer/inference/csrc/gelu.cu @@ -174,7 +174,7 @@ __global__ void fused_bias_residual(float* input, float* attnbias, int total_count, int intermediate_size, - float mp_size, + float mp_scale, bool preln) { float4* input_cast = reinterpret_cast(input); @@ -191,10 +191,10 @@ __global__ void fused_bias_residual(float* input, float4 bias_data = bias_cast[offset % intermediate_size]; float4 attn_bias = attnbias_cast[offset % intermediate_size]; if (preln) { - data.x = (data.x + res_vec.x + bias_data.x + attn_bias.x) * mp_size + (out.x); - data.y = (data.y + res_vec.y + bias_data.y + attn_bias.y) * mp_size + (out.y); - data.z = (data.z + res_vec.z + bias_data.z + attn_bias.z) * mp_size + (out.z); - data.w = (data.w + res_vec.w + bias_data.w + attn_bias.w) * mp_size + (out.w); + data.x = (data.x + res_vec.x + bias_data.x + attn_bias.x) * mp_scale + (out.x); + data.y = (data.y + res_vec.y + bias_data.y + attn_bias.y) * mp_scale + (out.y); + data.z = (data.z + res_vec.z + bias_data.z + attn_bias.z) * mp_scale + (out.z); + data.w = (data.w + res_vec.w + bias_data.w + attn_bias.w) * mp_scale + (out.w); } else { data.x = data.x + out.x + bias_data.x; data.y = data.y + out.y + bias_data.y; @@ -212,7 +212,7 @@ __global__ void fused_bias_residual(__half* input, __half* attn_bias, int total_count, int intermediate_size, - float mp_size, + float mp_scale, bool preln) { #ifdef HALF_PRECISION_AVAILABLE @@ -257,12 +257,12 @@ __global__ void fused_bias_residual(__half* input, if (preln) { low_data.x = - (low_data.x + low_res.x + (low_bias.x + attn_low_bias.x)) * mp_size + low_out.x; + (low_data.x + low_res.x + (low_bias.x + attn_low_bias.x)) * mp_scale + low_out.x; low_data.y = - (low_data.y + low_res.y + (low_bias.y + attn_low_bias.y)) * mp_size + low_out.y; - high_data.x = (high_data.x + high_res.x + (high_bias.x + attn_high_bias.x)) * mp_size + + (low_data.y + low_res.y + (low_bias.y + attn_low_bias.y)) 
* mp_scale + low_out.y; + high_data.x = (high_data.x + high_res.x + (high_bias.x + attn_high_bias.x)) * mp_scale + high_out.x; - high_data.y = (high_data.y + high_res.y + (high_bias.y + attn_high_bias.y)) * mp_size + + high_data.y = (high_data.y + high_res.y + (high_bias.y + attn_high_bias.y)) * mp_scale + high_out.y; } else { low_data.x = (low_data.x + low_out.x + low_bias.x); @@ -310,7 +310,7 @@ __global__ void gptj_residual_add(float* input, float* attnbias, int total_count, int intermediate_size, - float mp_size) + float mp_scale) { float4* input_cast = reinterpret_cast(input); float4* output_cast = reinterpret_cast(output); @@ -332,10 +332,10 @@ __global__ void gptj_residual_add(float* input, data.z += attn_bias.z; data.w += attn_bias.w; } - data.x = data.x * mp_size + (out.x + res_vec.x + bias_data.x); - data.y = data.y * mp_size + (out.y + res_vec.y + bias_data.y); - data.z = data.z * mp_size + (out.z + res_vec.z + bias_data.z); - data.w = data.w * mp_size + (out.w + res_vec.w + bias_data.w); + data.x = data.x * mp_scale + (out.x + res_vec.x + bias_data.x); + data.y = data.y * mp_scale + (out.y + res_vec.y + bias_data.y); + data.z = data.z * mp_scale + (out.z + res_vec.z + bias_data.z); + data.w = data.w * mp_scale + (out.w + res_vec.w + bias_data.w); output_cast[offset] = data; } @@ -348,7 +348,7 @@ __global__ void gptj_residual_add(__half* input, __half* attn_bias, int total_count, int intermediate_size, - float mp_size) + float mp_scale) { #if __CUDA_ARCH__ >= 700 || defined(__HIP_PLATFORM_HCC__) @@ -395,10 +395,10 @@ __global__ void gptj_residual_add(__half* input, high_data.y += attn_high_bias.y; } - low_data.x = low_data.x * mp_size + (low_out.x + low_res.x + (low_bias.x)); - low_data.y = low_data.y * mp_size + (low_out.y + low_res.y + (low_bias.y)); - high_data.x = high_data.x * mp_size + (high_out.x + high_res.x + (high_bias.x)); - high_data.y = high_data.y * mp_size + (high_out.y + high_res.y + (high_bias.y)); + low_data.x = low_data.x * mp_scale + (low_out.x + low_res.x + (low_bias.x)); + low_data.y = low_data.y * mp_scale + (low_out.y + low_res.y + (low_bias.y)); + high_data.x = high_data.x * mp_scale + (high_out.x + high_res.x + (high_bias.x)); + high_data.y = high_data.y * mp_scale + (high_out.y + high_res.y + (high_bias.y)); vals_half[0] = __float22half2_rn(low_data); vals_half[1] = __float22half2_rn(high_data); diff --git a/csrc/transformer/inference/csrc/normalize.cu b/csrc/transformer/inference/csrc/normalize.cu index 1569467e83aa..22c23011ede8 100644 --- a/csrc/transformer/inference/csrc/normalize.cu +++ b/csrc/transformer/inference/csrc/normalize.cu @@ -100,7 +100,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output, int gid = id >> 5; int warp_num = iteration_stride >> 5; - __half2 inp_reg[8]; + __half2 inp_reg[NORM_REG]; const __half2* vals_cast = reinterpret_cast(vals); __half2* out_cast = reinterpret_cast<__half2*>(output); diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index eade7cbede71..40b37d53d572 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -313,7 +313,8 @@ def _apply_injection_policy(self, moe_type='standard', training_mp_size=1, checkpoint_dir=None): - checkpoint = SDLoaderFactory.get_sd_loader_json(checkpoint_dir) + checkpoint = SDLoaderFactory.get_sd_loader_json( + checkpoint_dir) if checkpoint_dir is not None else None replace_transformer_layer(client_module, self.module, triangular_masking=self.triangular_masking, diff --git a/deepspeed/module_inject/replace_module.py 
b/deepspeed/module_inject/replace_module.py index 19e34abffa90..b49a8ff043e3 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -765,8 +765,7 @@ def replace_fn(child, _policy, layer_id=0): if checkpoint is not None: for i in range(len(checkpoint)): - if not torch.distributed.is_initialized() or torch.distributed.get_rank( - ) == 0: + if not deepspeed.comm.is_initialized() or deepspeed.comm.get_rank() == 0: print(f"loading checkpoint ({i})") sd = torch.load(checkpoint[i], map_location='cpu') load_model_with_checkpoint(replaced_module, sd, mp_replace) diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 02d985fc5c80..33c466a8f73d 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -818,7 +818,6 @@ def forward(self, use_cache=False, alibi=None, output_attentions=False): - #import pdb;pdb.set_trace() get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask layer_past = layer_past if layer_past is not None else self.layer_past From e0dd488fc742e8e40044f45f7a2ef2ad4bc9f79e Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Wed, 13 Jul 2022 05:55:18 +0500 Subject: [PATCH 06/22] fix layer_past; this caused issues when running inference on several queries --- deepspeed/ops/transformer/inference/transformer_inference.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 33c466a8f73d..bc228cabb245 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -846,8 +846,7 @@ def forward(self, self.norm_b, alibi) presents = (key, value) - self.layer_past = presents - #import pdb;pdb.set_trace() + self.layer_past = presents if layer_past is None else None output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: From 02f994569b817af6add04645f5312f133c2f4432 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 14 Jul 2022 02:27:52 +0500 Subject: [PATCH 07/22] Add support for multi-batch inference --- csrc/transformer/inference/csrc/softmax.cu | 33 +++++++++---------- .../inference/transformer_inference.py | 14 +++----- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index 7f6af09c650b..b210488d56d1 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -66,7 +66,7 @@ __global__ void attn_softmax_v2(__half* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (num_seq)) * (sequence_length); + int mask_offset = ((iter_offset / num_seq) % heads) * (sequence_length); int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -416,22 +416,21 @@ void launch_attn_softmax_v2(T* vals, const int iterations = (sequence_length - 1) / (reduce_width << 2) + 1; if (sequence_length <= 32768) - attn_softmax_v2<<>>( - vals, - mask, - alibi, - layer_scale, - triangular, - recompute, - local_attention, - window_size, - total_count, - (triangular ? 
(heads * batch_size) : heads), - sequence_length, - num_seq, - scale, - iterations, - reduce_width); + attn_softmax_v2<<>>(vals, + mask, + alibi, + layer_scale, + triangular, + recompute, + local_attention, + window_size, + total_count, + heads, + sequence_length, + num_seq, + scale, + iterations, + reduce_width); else throw std::runtime_error("Unsupport Seq_Length!"); } diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index bc228cabb245..cf4c8f5e536c 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -230,19 +230,15 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): -1) # slice alibi tensor until the query length - batch_heads = output_size[0] * output_size[1] - offset = dist.get_rank() * batch_heads if dist.is_initialized() else 0 - sliced_alibi = alibi[offset:batch_heads + offset, :, :output_size[3]] + offset = dist.get_rank() * output_size[1] if dist.is_initialized() else 0 + sliced_alibi = alibi[offset:output_size[1] + offset, :, :output_size[3]] # Raw attention scores. [batch_size * num_heads, q_length, k_length] - layer_number = max(1, config.layer_id) - norm_factor = (1 / norm_factor) * (1 / norm_factor) / layer_number matmul_result = torch.matmul(query_layer.transpose(1, 0), key_layer.transpose(1, - 0).transpose( - 1, - 2)) * norm_factor + 0).transpose(1, + 2)) # change view to [batch_size, num_heads, q_length, k_length] attention_scores = matmul_result.view(*output_size) @@ -255,7 +251,7 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): False, 1, False, - layer_number) + 1 / (norm_factor * norm_factor)) # context layer shape: [batch_size, num_heads, q_length, head_dim] output_size = (value_layer.size(0), From 1cef202d897eac1f895543dccf785980f16be172 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Thu, 14 Jul 2022 07:47:44 +0500 Subject: [PATCH 08/22] fix the padding issue for large bach inference --- .../transformer/inference/csrc/pt_binding.cpp | 21 +++++++++++++++---- csrc/transformer/inference/csrc/softmax.cu | 21 ++++++++++--------- deepspeed/inference/engine.py | 2 +- .../inference/transformer_inference.py | 5 +++++ 4 files changed, 34 insertions(+), 15 deletions(-) diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 87bb214731ed..0568c1f88035 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -11,6 +11,7 @@ std::array gemm_algos = std::array({99, 99, 99}); template at::Tensor ds_softmax(at::Tensor& attn_scores, at::Tensor& attn_mask, + at::Tensor& alibi, bool triangular, bool recompute, bool local_attention, @@ -32,8 +33,8 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, if (len > 3) heads = attn_scores_c.size(1); launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), - (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), - (T*)attn_mask.data_ptr(), + (T*)(attn_mask.sizes().size() > 1 ? 
attn_mask.data_ptr() : nullptr), + (T*)alibi.data_ptr(), layer_scale, triangular, recompute, @@ -147,8 +148,20 @@ void attention_unfused(at::Tensor& prev_key_cont, #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif - attn_score = ds_softmax( - attn_score, attn_mask, triangular, recompute, local_attention, window_size, false, 1.0); + launch_attn_softmax_v2((T*)attn_score.data_ptr(), + (T*)(attn_mask.sizes().size() > 1 ? attn_mask.data_ptr() : nullptr), + (T*)nullptr, + 1.0, + triangular, + recompute, + local_attention, + window_size, + bsz, + heads, + seq_len, + soft_len, + 1.0, + Context::Instance().GetCurrentStream(false)); alpha = 1.0; cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), k, diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index b210488d56d1..e421b086029a 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -66,7 +66,8 @@ __global__ void attn_softmax_v2(__half* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = ((iter_offset / num_seq) % heads) * (sequence_length); + int alibi_offset = ((iter_offset / num_seq) % heads) * (sequence_length); + int mask_offset = ((iter_offset / (num_seq * heads))) * (sequence_length); int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -98,7 +99,7 @@ __global__ void attn_softmax_v2(__half* vals, (data_id + 3) > window_stride) ? __half2float(vals[data_id + 3]) : minus_infinity; - if (mask && recompute) { + if (mask) { low_data[i].x += __half2float(mask[data_id + mask_offset]); low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); @@ -106,13 +107,13 @@ __global__ void attn_softmax_v2(__half* vals, } if (alibi) { low_data[i].x = low_data[i].x * layer_scale + - __half2float(alibi[data_id + mask_offset]); + __half2float(alibi[data_id + alibi_offset]); low_data[i].y = low_data[i].y * layer_scale + - __half2float(alibi[data_id + mask_offset + 1]); + __half2float(alibi[data_id + alibi_offset + 1]); high_data[i].x = high_data[i].x * layer_scale + - __half2float(alibi[data_id + mask_offset + 2]); + __half2float(alibi[data_id + alibi_offset + 2]); high_data[i].y = high_data[i].y * layer_scale + - __half2float(alibi[data_id + mask_offset + 3]); + __half2float(alibi[data_id + alibi_offset + 3]); } } else { low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) @@ -128,7 +129,7 @@ __global__ void attn_softmax_v2(__half* vals, ? 
__half2float(vals[data_id + 2]) : minus_infinity; high_data[i].y = minus_infinity; - if (mask && recompute) { + if (mask) { low_data[i].x += __half2float(mask[data_id + mask_offset]); if ((data_id + 1) < sequence_length) low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); @@ -137,13 +138,13 @@ __global__ void attn_softmax_v2(__half* vals, } if (alibi) { low_data[i].x = low_data[i].x * layer_scale + - __half2float(alibi[data_id + mask_offset]); + __half2float(alibi[data_id + alibi_offset]); if ((data_id + 1) < sequence_length) low_data[i].y = low_data[i].y * layer_scale + - __half2float(alibi[data_id + mask_offset + 1]); + __half2float(alibi[data_id + alibi_offset + 1]); if ((data_id + 2) < sequence_length) high_data[i].x = high_data[i].x * layer_scale + - __half2float(alibi[data_id + mask_offset + 2]); + __half2float(alibi[data_id + alibi_offset + 2]); } } // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 40b37d53d572..ea29282da744 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -487,6 +487,7 @@ def forward(self, *inputs, **kwargs): *inputs: Variable length input list **kwargs: variable length keyword arguments """ + if self.mp_world_size > 1: if self.mpu is None: for input in inputs: @@ -501,7 +502,6 @@ def forward(self, *inputs, **kwargs): if not kwargs[k].is_contiguous(): kwargs[k] = kwargs[k].contiguous() dist.broadcast(kwargs[k], 0) - outputs = self.model_orig_fwd(*inputs, **kwargs) else: if self.enable_cuda_graph: diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index cf4c8f5e536c..a262b4308682 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -243,8 +243,13 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): # change view to [batch_size, num_heads, q_length, k_length] attention_scores = matmul_result.view(*output_size) + # Convert mask from 1/0 to 0/-inf, when dtype is int64 + if input_mask.dtype == torch.int64: + input_mask = (1 - input_mask).half() * -10000.0 + attention_probs = inference_cuda_module.softmax_fp16( attention_scores, + input_mask, sliced_alibi, (output_size[-2] > 1), False, From 644fea44d32384e73ed9dc985a3ccebc46e86b13 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 15 Jul 2022 04:00:36 +0500 Subject: [PATCH 09/22] fixing some bug in softmax kernel for batch_size>1 --- .../transformer/inference/csrc/pt_binding.cpp | 2 +- csrc/transformer/inference/csrc/softmax.cu | 100 ++++++++++-------- .../inference/transformer_inference.py | 8 +- 3 files changed, 61 insertions(+), 49 deletions(-) diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 0568c1f88035..6e510ced7726 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -33,7 +33,7 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, if (len > 3) heads = attn_scores_c.size(1); launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), - (T*)(attn_mask.sizes().size() > 1 ? attn_mask.data_ptr() : nullptr), + (attn_mask.sizes().size() > 1 ? 
(T*)attn_mask.data_ptr() : nullptr), (T*)alibi.data_ptr(), layer_scale, triangular, diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index e421b086029a..c8509812bb3b 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -49,7 +49,7 @@ __global__ void attn_softmax_v2(__half* vals, float2 low_data[MAX_REG_SIZE]; float2 high_data[MAX_REG_SIZE]; - + const __half zero_h = __float2half(0.f); __half2 h_scale = __float2half2_rn(scale); int wid = threadIdx.x >> 5; @@ -85,66 +85,78 @@ __global__ void attn_softmax_v2(__half* vals, if ((!triangular || ((data_id >> 2) <= seq_id4)) && (data_id >> 2) >= window_stride4 && data_id < sequence_length) { if ((sequence_length - data_id) >= 4) { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; + low_data[i].x = data_id > window_stride + ? __half2float(vals[data_id]) * layer_scale + : minus_infinity; low_data[i].y = ((!triangular || ((data_id + 1) <= seq_id)) && (data_id + 1) > window_stride) - ? __half2float(vals[data_id + 1]) + ? __half2float(vals[data_id + 1]) * layer_scale : minus_infinity; high_data[i].x = ((!triangular || ((data_id + 2) <= seq_id)) && (data_id + 2) > window_stride) - ? __half2float(vals[data_id + 2]) + ? __half2float(vals[data_id + 2]) * layer_scale : minus_infinity; high_data[i].y = ((!triangular || ((data_id + 3) <= seq_id)) && (data_id + 3) > window_stride) - ? __half2float(vals[data_id + 3]) + ? __half2float(vals[data_id + 3]) * layer_scale : minus_infinity; - if (mask) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); - high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); - } if (alibi) { - low_data[i].x = low_data[i].x * layer_scale + - __half2float(alibi[data_id + alibi_offset]); - low_data[i].y = low_data[i].y * layer_scale + - __half2float(alibi[data_id + alibi_offset + 1]); - high_data[i].x = high_data[i].x * layer_scale + - __half2float(alibi[data_id + alibi_offset + 2]); - high_data[i].y = high_data[i].y * layer_scale + - __half2float(alibi[data_id + alibi_offset + 3]); + low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); + low_data[i].y = + low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); + high_data[i].x = + high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); + high_data[i].y = + high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]); + } + if (mask) { + low_data[i].x = __hlt(mask[data_id + mask_offset], zero_h) ? minus_infinity + : low_data[i].x; + low_data[i].y = __hlt(mask[data_id + mask_offset + 1], zero_h) + ? minus_infinity + : low_data[i].y; + high_data[i].x = __hlt(mask[data_id + mask_offset + 2], zero_h) + ? minus_infinity + : high_data[i].x; + high_data[i].y = __hlt(mask[data_id + mask_offset + 3], zero_h) + ? minus_infinity + : high_data[i].y; } } else { - low_data[i].x = data_id > window_stride ? __half2float(vals[data_id]) - : minus_infinity; + low_data[i].x = data_id > window_stride + ? __half2float(vals[data_id]) * layer_scale + : minus_infinity; low_data[i].y = (((!triangular || (data_id + 1) <= seq_id) && (data_id + 1) > window_stride) && (data_id + 1) < sequence_length) - ? __half2float(vals[data_id + 1]) + ? 
__half2float(vals[data_id + 1]) * layer_scale : minus_infinity; high_data[i].x = (((!triangular || (data_id + 2) <= seq_id) && (data_id + 2) > window_stride) && (data_id + 2) < sequence_length) - ? __half2float(vals[data_id + 2]) + ? __half2float(vals[data_id + 2]) * layer_scale : minus_infinity; - high_data[i].y = minus_infinity; - if (mask) { - low_data[i].x += __half2float(mask[data_id + mask_offset]); + if (alibi) { + low_data[i].x = low_data[i].x + __half2float(alibi[data_id + alibi_offset]); if ((data_id + 1) < sequence_length) - low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); + low_data[i].y = + low_data[i].y + __half2float(alibi[data_id + alibi_offset + 1]); if ((data_id + 2) < sequence_length) - high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); + high_data[i].x = + high_data[i].x + __half2float(alibi[data_id + alibi_offset + 2]); } - if (alibi) { - low_data[i].x = low_data[i].x * layer_scale + - __half2float(alibi[data_id + alibi_offset]); + high_data[i].y = minus_infinity; + if (mask) { + low_data[i].x = __hlt(mask[data_id + mask_offset], zero_h) ? minus_infinity + : low_data[i].x; if ((data_id + 1) < sequence_length) - low_data[i].y = low_data[i].y * layer_scale + - __half2float(alibi[data_id + alibi_offset + 1]); + low_data[i].y = __hlt(mask[data_id + mask_offset + 1], zero_h) + ? minus_infinity + : low_data[i].y; if ((data_id + 2) < sequence_length) - high_data[i].x = high_data[i].x * layer_scale + - __half2float(alibi[data_id + alibi_offset + 2]); + high_data[i].x = __hlt(mask[data_id + mask_offset + 2], zero_h) + ? minus_infinity + : high_data[i].x; } } // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); @@ -210,14 +222,16 @@ __global__ void attn_softmax_v2(__half* vals, if (data_id < sequence_length) { if ((sequence_length - data_id) >= 4) { - vals[data_id] = low_data[i].x / sum; - vals[data_id + 1] = low_data[i].y / sum; - vals[data_id + 2] = high_data[i].x / sum; - vals[data_id + 3] = high_data[i].y / sum; + vals[data_id] = __float2half(low_data[i].x / sum); + vals[data_id + 1] = __float2half(low_data[i].y / sum); + vals[data_id + 2] = __float2half(high_data[i].x / sum); + vals[data_id + 3] = __float2half(high_data[i].y / sum); } else { - vals[data_id] = low_data[i].x / sum; - if ((data_id + 1) < sequence_length) vals[data_id + 1] = low_data[i].y / sum; - if ((data_id + 2) < sequence_length) vals[data_id + 2] = high_data[i].x / sum; + vals[data_id] = __float2half(low_data[i].x / sum); + if ((data_id + 1) < sequence_length) + vals[data_id + 1] = __float2half(low_data[i].y / sum); + if ((data_id + 2) < sequence_length) + vals[data_id + 2] = __float2half(high_data[i].x / sum); } } } diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index a262b4308682..40d2d2910851 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -234,6 +234,7 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): sliced_alibi = alibi[offset:output_size[1] + offset, :, :output_size[3]] # Raw attention scores. 
[batch_size * num_heads, q_length, k_length] + matmul_result = torch.matmul(query_layer.transpose(1, 0), key_layer.transpose(1, @@ -242,14 +243,11 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): # change view to [batch_size, num_heads, q_length, k_length] attention_scores = matmul_result.view(*output_size) - # Convert mask from 1/0 to 0/-inf, when dtype is int64 - if input_mask.dtype == torch.int64: - input_mask = (1 - input_mask).half() * -10000.0 - attention_probs = inference_cuda_module.softmax_fp16( attention_scores, - input_mask, + ((1 - input_mask).half() * + -10000.0) if input_mask.dtype == torch.int64 else input_mask, sliced_alibi, (output_size[-2] > 1), False, From adb0b9729b5010e7c05e81d246670491bcce5181 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 15 Jul 2022 13:49:30 +0500 Subject: [PATCH 10/22] align alibi&mask addition with HF new changes --- .../transformer/inference/csrc/pt_binding.cpp | 20 ++++- csrc/transformer/inference/csrc/softmax.cu | 21 ++++- .../inference/includes/custom_cuda_layers.h | 2 + .../inference/transformer_inference.py | 89 ++++++++++++++++++- 4 files changed, 123 insertions(+), 9 deletions(-) diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 6e510ced7726..46edd020bc40 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -17,20 +17,24 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, bool local_attention, int window_size, bool async_op, - float layer_scale) + float layer_scale, + int head_offset) { auto attn_scores_c = attn_scores.contiguous(); int bsz = attn_scores_c.size(0); int seq_len = attn_scores_c.size(1); int len = attn_scores_c.sizes().size(); - if (len > 3) seq_len = attn_scores_c.size(2); + if (len > 2) seq_len = attn_scores_c.size(2); int soft_len = attn_scores_c.size(2); if (len > 3) soft_len = attn_scores_c.size(3); int heads = 1; - if (len > 3) heads = attn_scores_c.size(1); + if (len > 1) heads = attn_scores_c.size(1); + + int mask_stride = 1; + if (attn_mask.sizes().size() > 2 && attn_mask.size(2) > 1) mask_stride = attn_mask.size(2); launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), @@ -44,6 +48,8 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, heads, seq_len, soft_len, + head_offset, + mask_stride, 1.0, Context::Instance().GetCurrentStream(async_op)); @@ -127,6 +133,8 @@ void attention_unfused(at::Tensor& prev_key_cont, float gemm_beta = 0.0; auto attn_score = at::empty({bsz, heads, seq_len, soft_len}, options); int k = prev_value_cont.size(2) / heads; + int mask_stride = heads; + if (attn_mask.sizes().size() > 2 && attn_mask.size(2) == 1) mask_stride *= seq_len; cublasSetStream(Context::Instance().GetCublasHandle(), Context::Instance().GetCurrentStream()); cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), soft_len, @@ -160,6 +168,8 @@ void attention_unfused(at::Tensor& prev_key_cont, heads, seq_len, soft_len, + 0, + mask_stride, 1.0, Context::Instance().GetCurrentStream(false)); alpha = 1.0; @@ -252,6 +262,8 @@ void ds_softmax_internal(T* attn_scores, int soft_len, int heads) { + int mask_stride = heads; + if (attn_mask.sizes().size() > 2 && attn_mask.size(2) == 1) mask_stride *= seq_len; launch_attn_softmax_v2((T*)attn_scores, (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), (alibi.sizes().size() > 1 ? 
(T*)alibi.data_ptr() : nullptr), @@ -264,6 +276,8 @@ void ds_softmax_internal(T* attn_scores, heads, seq_len, soft_len, + 0, + mask_stride, 1.0, at::cuda::getCurrentCUDAStream()); } diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index c8509812bb3b..297e21afa35c 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -38,6 +38,8 @@ __global__ void attn_softmax_v2(__half* vals, int heads, int sequence_length, int num_seq, + int head_offset, + int mask_stride, float scale, int iterations, int reduceWidth) @@ -62,12 +64,15 @@ __global__ void attn_softmax_v2(__half* vals, __shared__ float partialSum[MAX_WARP_NUM]; int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); + int batch_idx = iter_offset / (num_seq * heads); + int alibi_offset = batch_idx * heads + head_offset; + int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int alibi_offset = ((iter_offset / num_seq) % heads) * (sequence_length); - int mask_offset = ((iter_offset / (num_seq * heads))) * (sequence_length); + alibi_offset = (alibi_offset + ((iter_offset / num_seq) % heads)) * sequence_length; + mask_offset = mask_offset * sequence_length; int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -251,6 +256,8 @@ __global__ void attn_softmax_v2(float* vals, int heads, int sequence_length, int num_seq, + int head_offset, + int mask_stride, float scale, int iterations, int reduceWidth) @@ -273,7 +280,7 @@ __global__ void attn_softmax_v2(float* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / (heads * num_seq)) * (sequence_length); + int mask_offset = (iter_offset / mask_stride) * (sequence_length); int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; @@ -420,6 +427,8 @@ void launch_attn_softmax_v2(T* vals, int heads, int num_seq, int sequence_length, + int head_offset, + int mask_stride, float scale, cudaStream_t stream) { @@ -443,6 +452,8 @@ void launch_attn_softmax_v2(T* vals, heads, sequence_length, num_seq, + head_offset, + mask_stride, scale, iterations, reduce_width); @@ -462,6 +473,8 @@ template void launch_attn_softmax_v2(float* vals, int heads, int num_seq, int sequence_length, + int head_offset, + int mask_stride, float scale, cudaStream_t stream); template void launch_attn_softmax_v2(__half* vals, @@ -476,5 +489,7 @@ template void launch_attn_softmax_v2(__half* vals, int heads, int num_seq, int sequence_length, + int head_offset, + int mask_stride, float scale, cudaStream_t stream); diff --git a/csrc/transformer/inference/includes/custom_cuda_layers.h b/csrc/transformer/inference/includes/custom_cuda_layers.h index cfcceee9a3ba..afca0f04c542 100644 --- a/csrc/transformer/inference/includes/custom_cuda_layers.h +++ b/csrc/transformer/inference/includes/custom_cuda_layers.h @@ -38,6 +38,8 @@ void launch_attn_softmax_v2(T* vals, int heads, int num_seq, int sequence_length, + int offset, + int mask_stride, float scale, cudaStream_t stream); diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 40d2d2910851..eae3352eb3f8 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -13,6 +13,7 @@ from deepspeed import comm as dist # Cuda modules will 
be imported if needed inference_cuda_module = None +minus_inf = -10000.0 class TransformerConfig(): @@ -191,6 +192,83 @@ def split_tensor_along_last_dim(tensor, return tensor_list def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): + head_dim = hidden_size_per_partition // num_attention_heads_per_partition + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + num_attention_heads_per_partition, + 3 * head_dim) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + (query_layer, + key_layer, + value_layer) = split_tensor_along_last_dim(mixed_x_layer, + 3) + + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension -> [batch_size, qk_length, num_heads, head_dim] + key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=1) + value_layer = torch.cat((past_value.type_as(value_layer), + value_layer), + dim=1) + + presents = (key_layer, value_layer) + + # # [batch_size*num_heads, head_dim, q_length] x [batch_size*num_heads, head_dim, k_length] -> [batch_size*num_heads, q_length, k_length] + matmul_result = torch.bmm( + query_layer.transpose(1, + 2).reshape(-1, + query_layer.shape[1], + query_layer.shape[3]), + key_layer.permute(0, + 2, + 3, + 1).reshape(-1, + key_layer.shape[3], + key_layer.shape[1]), + ) + + # change view to [batch_size, num_heads, q_length, k_length] + attention_scores = matmul_result.view(-1, + num_attention_heads_per_partition, + matmul_result.size(1), + matmul_result.size(2)) + + offset = dist.get_rank( + ) * num_attention_heads_per_partition if dist.is_initialized() else 0 + attention_probs = inference_cuda_module.softmax_fp16( + attention_scores, + ((1 - input_mask).half() * + minus_inf) if input_mask.dtype == torch.int64 else input_mask, + alibi, + (config.triangular_masking and (attention_scores.shape[-2] > 1)), + False, + False, + 1, + False, + 1 / (norm_factor * norm_factor), + offset) + # change view [batch_size x num_heads, q_length, k_length] + attention_probs_reshaped = attention_probs.view(*matmul_result.shape) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm( + attention_probs_reshaped, + value_layer.transpose(1, + 2).reshape(-1, + value_layer.size(1), + value_layer.size(3))) + + # change view [batch_size, num_heads, q_length, head_dim] + context_layer = context_layer.view( + context_layer.size(0) // num_attention_heads_per_partition, + num_attention_heads_per_partition, + context_layer.size(1), + context_layer.shape[-1]) + + context_layer = _transpose_for_context(context_layer) + + return context_layer, presents + ''' # [batch_size, seq_length, 3 x hidden_size] --> [batch_size, seq_length, num_heads, 3 x head_dim] head_dim = hidden_size_per_partition // num_attention_heads_per_partition new_tensor_shape = mixed_x_layer.size()[:-1] + ( @@ -230,6 +308,7 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): -1) # slice alibi tensor until the query length + #import pdb;pdb.set_trace() offset = dist.get_rank() * output_size[1] if dist.is_initialized() else 0 sliced_alibi = alibi[offset:output_size[1] + offset, :, :output_size[3]] @@ -292,6 +371,7 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): context_layer = context_layer.view(*new_context_layer_shape) return context_layer, presents + ''' ###################### End of HF modeling_bloom addition ######################## @@ -347,7 +427,8 @@ def compute_attention(qkv_out, input_mask): mixed_query, key_layer, torch.empty(1), 
- input_mask, + ((1 - input_mask).half() * + minus_inf) if input_mask.dtype == torch.int64 else input_mask, value_layer, torch.empty(1), num_attention_heads_per_partition, @@ -362,7 +443,8 @@ def compute_attention(qkv_out, input_mask): mixed_query, (key_layer if unfused_mode else past_key.type_as(key_layer)), key_layer, - input_mask, + ((1 - input_mask).half() * + minus_inf) if input_mask.dtype == torch.int64 else input_mask, (value_layer if unfused_mode else past_value.type_as(value_layer)), value_layer, @@ -395,7 +477,8 @@ def compute_attention(qkv_out, input_mask): sliced_alibi = alibi[offset:batch_heads + offset, :, :] attn_key_value = score_context_func( qkv_out, - input_mask, + ((1 - input_mask).half() * + minus_inf) if input_mask.dtype == torch.int64 else input_mask, config.rotary_dim, config.rotate_half, config.rotate_every_two, From bd3c0a02555a4edbc1c4f11abd1c9e41fcf11e25 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 15 Jul 2022 23:28:39 +0500 Subject: [PATCH 11/22] revert back some changes and support for very large batch size --- .../transformer/inference/csrc/pt_binding.cpp | 2 +- .../inference/includes/custom_cuda_layers.h | 2 +- .../inference/transformer_inference.py | 158 ++++-------------- 3 files changed, 33 insertions(+), 129 deletions(-) diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 46edd020bc40..1419767f0d1f 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -548,7 +548,7 @@ at::Tensor qkv_unfused_cublas(at::Tensor& output, { int bsz = input.size(0) * input.size(1); T* workspace = (T*)Context::Instance().GetWorkSpace(); - workspace += (3 * input.size(0) * MAX_OUT_TOKES * input.size(2)); + workspace += (3 * bsz * input.size(2)); ds_layernorm_internal(workspace, input, gamma, beta, epsilon); // cudaEventRecord(Context::Instance().GetCompEvent(1), Context::Instance().GetCurrentStream()); diff --git a/csrc/transformer/inference/includes/custom_cuda_layers.h b/csrc/transformer/inference/includes/custom_cuda_layers.h index afca0f04c542..530d57012ffb 100644 --- a/csrc/transformer/inference/includes/custom_cuda_layers.h +++ b/csrc/transformer/inference/includes/custom_cuda_layers.h @@ -17,7 +17,7 @@ #include #include -#define MAX_OUT_TOKES 128 +#define MAX_OUT_TOKES 1024 #define MAX_WARP_NUM 32 #define WARP_SIZE 32 diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index eae3352eb3f8..c4139aceb61d 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -213,83 +213,6 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): presents = (key_layer, value_layer) - # # [batch_size*num_heads, head_dim, q_length] x [batch_size*num_heads, head_dim, k_length] -> [batch_size*num_heads, q_length, k_length] - matmul_result = torch.bmm( - query_layer.transpose(1, - 2).reshape(-1, - query_layer.shape[1], - query_layer.shape[3]), - key_layer.permute(0, - 2, - 3, - 1).reshape(-1, - key_layer.shape[3], - key_layer.shape[1]), - ) - - # change view to [batch_size, num_heads, q_length, k_length] - attention_scores = matmul_result.view(-1, - num_attention_heads_per_partition, - matmul_result.size(1), - matmul_result.size(2)) - - offset = dist.get_rank( - ) * num_attention_heads_per_partition if dist.is_initialized() else 0 - attention_probs = 
inference_cuda_module.softmax_fp16( - attention_scores, - ((1 - input_mask).half() * - minus_inf) if input_mask.dtype == torch.int64 else input_mask, - alibi, - (config.triangular_masking and (attention_scores.shape[-2] > 1)), - False, - False, - 1, - False, - 1 / (norm_factor * norm_factor), - offset) - # change view [batch_size x num_heads, q_length, k_length] - attention_probs_reshaped = attention_probs.view(*matmul_result.shape) - - # matmul: [batch_size * num_heads, q_length, head_dim] - context_layer = torch.bmm( - attention_probs_reshaped, - value_layer.transpose(1, - 2).reshape(-1, - value_layer.size(1), - value_layer.size(3))) - - # change view [batch_size, num_heads, q_length, head_dim] - context_layer = context_layer.view( - context_layer.size(0) // num_attention_heads_per_partition, - num_attention_heads_per_partition, - context_layer.size(1), - context_layer.shape[-1]) - - context_layer = _transpose_for_context(context_layer) - - return context_layer, presents - ''' - # [batch_size, seq_length, 3 x hidden_size] --> [batch_size, seq_length, num_heads, 3 x head_dim] - head_dim = hidden_size_per_partition // num_attention_heads_per_partition - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - num_attention_heads_per_partition, - 3 * head_dim) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [batch_size, seq_length, num_heads, 3 x head_dim] --> 3 [batch_size, seq_length, num_heads, head_dim] - (query_layer, - key_layer, - value_layer) = split_tensor_along_last_dim(mixed_x_layer, - 3) - - if layer_past is not None: - past_key, past_value = layer_past - key_layer = torch.cat((past_key.type_as(key_layer), key_layer), dim=1) - value_layer = torch.cat((past_value.type_as(value_layer), - value_layer), - dim=1) - - presents = (key_layer, value_layer) # [batch_size, head_dim, q_length, k_length] output_size = (query_layer.size(0), query_layer.size(2), @@ -307,71 +230,50 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): output_size[0] * output_size[1], -1) - # slice alibi tensor until the query length - #import pdb;pdb.set_trace() - offset = dist.get_rank() * output_size[1] if dist.is_initialized() else 0 - sliced_alibi = alibi[offset:output_size[1] + offset, :, :output_size[3]] - # Raw attention scores. 
[batch_size * num_heads, q_length, k_length] - matmul_result = torch.matmul(query_layer.transpose(1, 0), key_layer.transpose(1, 0).transpose(1, 2)) - # change view to [batch_size, num_heads, q_length, k_length] attention_scores = matmul_result.view(*output_size) - # Convert mask from 1/0 to 0/-inf, when dtype is int64 + + offset = dist.get_rank( + ) * num_attention_heads_per_partition if dist.is_initialized() else 0 attention_probs = inference_cuda_module.softmax_fp16( attention_scores, ((1 - input_mask).half() * - -10000.0) if input_mask.dtype == torch.int64 else input_mask, - sliced_alibi, - (output_size[-2] > 1), + minus_inf) if input_mask.dtype == torch.int64 else input_mask, + alibi, + (config.triangular_masking and (attention_scores.shape[-2] > 1)), False, False, 1, False, - 1 / (norm_factor * norm_factor)) - - # context layer shape: [batch_size, num_heads, q_length, head_dim] - output_size = (value_layer.size(0), - value_layer.size(2), - query_layer.size(0), - value_layer.size(3)) - # change view [k_length, batch_size x num_heads, head_dim] - value_layer = value_layer.transpose(1, - 0).reshape( - value_layer.size(1), - output_size[0] * output_size[1], - -1) - + 1 / (norm_factor * norm_factor), + offset) # change view [batch_size x num_heads, q_length, k_length] - attention_probs_reshaped = attention_probs.view( - output_size[0] * output_size[1], - output_size[2], - -1) + attention_probs_reshaped = attention_probs.view(*matmul_result.shape) # matmul: [batch_size * num_heads, q_length, head_dim] - context_layer = torch.bmm(attention_probs_reshaped, - value_layer.transpose(0, - 1)) + context_layer = torch.bmm( + attention_probs_reshaped, + value_layer.transpose(1, + 2).reshape(-1, + value_layer.size(1), + value_layer.size(3))) # change view [batch_size, num_heads, q_length, head_dim] - context_layer = context_layer.view(*output_size) - - # [batchs_size, num_heads, q_length, head_dim] --> [q_length, batch_size, num_heads, head_dim] - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + context_layer = context_layer.view( + context_layer.size(0) // num_attention_heads_per_partition, + num_attention_heads_per_partition, + context_layer.size(1), + context_layer.shape[-1]) - # [q_length, batch_size, num_heads, head_dim] --> [q_length, batch_size, hidden_size] - new_context_layer_shape = context_layer.size()[:-2] + ( - hidden_size_per_partition, - ) - context_layer = context_layer.view(*new_context_layer_shape) + context_layer = _transpose_for_context(context_layer) return context_layer, presents - ''' ###################### End of HF modeling_bloom addition ######################## @@ -508,14 +410,16 @@ def selfAttention_fp(): else: qkv_func = inference_cuda_module.qkv_gemm_fp16 if config.fp16 else \ inference_cuda_module.qkv_gemm_fp32 - qkv_out = qkv_func(input, - attn_qkvw, - (attn_qkvb if attn_qkvb is not None else norm_b), - norm_w, - norm_b, - config.epsilon, - (attn_qkvb is not None), - DeepSpeedTransformerInference.layer_id) + qkv_out = qkv_func( + input, + attn_qkvw, + (attn_qkvb if attn_qkvb is not None else norm_b), + norm_w, + norm_b, + config.epsilon, + (attn_qkvb is not None), + 1 if config.bigscience_bloom else + DeepSpeedTransformerInference.layer_id) context_layer, key_layer, value_layer = compute_attention(qkv_out[0] if isinstance(qkv_out, list) else qkv_out, input_mask) output = vector_matmul_func(context_layer, attn_ow, False) From 1f92e5572b4e0fd61c3d860f5dad7630326491d5 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Fri, 15 Jul 2022 23:34:11 +0500 
Subject: [PATCH 12/22] reduce the max_token_length for now --- csrc/transformer/inference/includes/custom_cuda_layers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/transformer/inference/includes/custom_cuda_layers.h b/csrc/transformer/inference/includes/custom_cuda_layers.h index 530d57012ffb..afca0f04c542 100644 --- a/csrc/transformer/inference/includes/custom_cuda_layers.h +++ b/csrc/transformer/inference/includes/custom_cuda_layers.h @@ -17,7 +17,7 @@ #include #include -#define MAX_OUT_TOKES 1024 +#define MAX_OUT_TOKES 128 #define MAX_WARP_NUM 32 #define WARP_SIZE 32 From 5d1f351b15352129bd0605795fb0a9e66b46a679 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 16 Jul 2022 01:14:43 +0500 Subject: [PATCH 13/22] fix mask-adding --- csrc/transformer/inference/csrc/softmax.cu | 26 ++++++---------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index 297e21afa35c..41a1e13709ca 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -115,17 +115,10 @@ __global__ void attn_softmax_v2(__half* vals, high_data[i].y + __half2float(alibi[data_id + alibi_offset + 3]); } if (mask) { - low_data[i].x = __hlt(mask[data_id + mask_offset], zero_h) ? minus_infinity - : low_data[i].x; - low_data[i].y = __hlt(mask[data_id + mask_offset + 1], zero_h) - ? minus_infinity - : low_data[i].y; - high_data[i].x = __hlt(mask[data_id + mask_offset + 2], zero_h) - ? minus_infinity - : high_data[i].x; - high_data[i].y = __hlt(mask[data_id + mask_offset + 3], zero_h) - ? minus_infinity - : high_data[i].y; + low_data[i].x += __half2float(mask[data_id + mask_offset]); + low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); + high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); + high_data[i].y += __half2float(mask[data_id + mask_offset + 3]); } } else { low_data[i].x = data_id > window_stride @@ -152,16 +145,11 @@ __global__ void attn_softmax_v2(__half* vals, } high_data[i].y = minus_infinity; if (mask) { - low_data[i].x = __hlt(mask[data_id + mask_offset], zero_h) ? minus_infinity - : low_data[i].x; + low_data[i].x += __half2float(mask[data_id + mask_offset]); if ((data_id + 1) < sequence_length) - low_data[i].y = __hlt(mask[data_id + mask_offset + 1], zero_h) - ? minus_infinity - : low_data[i].y; + low_data[i].y += __half2float(mask[data_id + mask_offset + 1]); if ((data_id + 2) < sequence_length) - high_data[i].x = __hlt(mask[data_id + mask_offset + 2], zero_h) - ? 
minus_infinity - : high_data[i].x; + high_data[i].x += __half2float(mask[data_id + mask_offset + 2]); } } // if(lane == 0) printf("%f , %d, %d \n", low_data[i].x, data_id, seq_id); From 3c12b899a80885732122ee2425c66ac1591fe0f7 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Sat, 16 Jul 2022 05:01:05 +0500 Subject: [PATCH 14/22] fix the large-batch inference for MP > 1 --- csrc/transformer/inference/csrc/pt_binding.cpp | 11 ++++++----- csrc/transformer/inference/csrc/softmax.cu | 15 +++++++-------- .../inference/includes/custom_cuda_layers.h | 2 +- .../inference/transformer_inference.py | 3 ++- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 1419767f0d1f..187fa1d23cce 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -18,7 +18,8 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, int window_size, bool async_op, float layer_scale, - int head_offset) + int head_offset, + int mp_size) { auto attn_scores_c = attn_scores.contiguous(); int bsz = attn_scores_c.size(0); @@ -38,7 +39,7 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), - (T*)alibi.data_ptr(), + (alibi.sizes().size() > 1 ? (T*)alibi.data_ptr() : nullptr), layer_scale, triangular, recompute, @@ -50,7 +51,7 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, soft_len, head_offset, mask_stride, - 1.0, + mp_size, Context::Instance().GetCurrentStream(async_op)); return attn_scores_c; @@ -170,7 +171,7 @@ void attention_unfused(at::Tensor& prev_key_cont, soft_len, 0, mask_stride, - 1.0, + 1, Context::Instance().GetCurrentStream(false)); alpha = 1.0; cublas_strided_batched_gemm(Context::Instance().GetCublasHandle(), @@ -278,7 +279,7 @@ void ds_softmax_internal(T* attn_scores, soft_len, 0, mask_stride, - 1.0, + 1, at::cuda::getCurrentCUDAStream()); } diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index 41a1e13709ca..2c6da24fbf1e 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -40,7 +40,7 @@ __global__ void attn_softmax_v2(__half* vals, int num_seq, int head_offset, int mask_stride, - float scale, + int mp_size, int iterations, int reduceWidth) { @@ -52,7 +52,6 @@ __global__ void attn_softmax_v2(__half* vals, float2 low_data[MAX_REG_SIZE]; float2 high_data[MAX_REG_SIZE]; const __half zero_h = __float2half(0.f); - __half2 h_scale = __float2half2_rn(scale); int wid = threadIdx.x >> 5; int lane = threadIdx.x & 0x1f; @@ -65,7 +64,7 @@ __global__ void attn_softmax_v2(__half* vals, int iter_offset = blockIdx.x * (warp_num / reduce_blocks) + (wid / reduce_blocks); int batch_idx = iter_offset / (num_seq * heads); - int alibi_offset = batch_idx * heads + head_offset; + int alibi_offset = batch_idx * heads * mp_size + head_offset; int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); if (iter_offset < total_count) { @@ -246,7 +245,7 @@ __global__ void attn_softmax_v2(float* vals, int num_seq, int head_offset, int mask_stride, - float scale, + int mp_size, int iterations, int reduceWidth) { @@ -417,7 +416,7 @@ void launch_attn_softmax_v2(T* vals, int sequence_length, int head_offset, int mask_stride, - float scale, + int mp_size, cudaStream_t stream) { int total_count = batch_size * heads * num_seq; @@ -442,7 +441,7 @@ void 
launch_attn_softmax_v2(T* vals, num_seq, head_offset, mask_stride, - scale, + mp_size, iterations, reduce_width); else @@ -463,7 +462,7 @@ template void launch_attn_softmax_v2(float* vals, int sequence_length, int head_offset, int mask_stride, - float scale, + int mp_size, cudaStream_t stream); template void launch_attn_softmax_v2(__half* vals, __half* mask, @@ -479,5 +478,5 @@ template void launch_attn_softmax_v2(__half* vals, int sequence_length, int head_offset, int mask_stride, - float scale, + int mp_size, cudaStream_t stream); diff --git a/csrc/transformer/inference/includes/custom_cuda_layers.h b/csrc/transformer/inference/includes/custom_cuda_layers.h index afca0f04c542..c2bb30126cd6 100644 --- a/csrc/transformer/inference/includes/custom_cuda_layers.h +++ b/csrc/transformer/inference/includes/custom_cuda_layers.h @@ -40,7 +40,7 @@ void launch_attn_softmax_v2(T* vals, int sequence_length, int offset, int mask_stride, - float scale, + int mp_size, cudaStream_t stream); // Fused bias add with gelu activation diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index c4139aceb61d..a909f2039595 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -252,7 +252,8 @@ def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): 1, False, 1 / (norm_factor * norm_factor), - offset) + offset, + config.mp_size) # change view [batch_size x num_heads, q_length, k_length] attention_probs_reshaped = attention_probs.view(*matmul_result.shape) From aa5e01ff6c51556517c732c7e9bf8adc53c30169 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 18 Jul 2022 12:02:36 -0700 Subject: [PATCH 15/22] Ds inference/bloom support meta (#2104) --- deepspeed/__init__.py | 2 +- deepspeed/module_inject/load_checkpoint.py | 27 +++++-- deepspeed/module_inject/replace_module.py | 29 ++++--- .../inference/transformer_inference.py | 1 + deepspeed/utils/__init__.py | 1 + deepspeed/utils/init_on_device.py | 81 +++++++++++++++++++ tests/unit/test_init_on_device.py | 19 +++++ 7 files changed, 141 insertions(+), 19 deletions(-) create mode 100644 deepspeed/utils/init_on_device.py create mode 100644 tests/unit/test_init_on_device.py diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 25229bde06d8..7a18f98a49e8 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -24,7 +24,7 @@ from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from .module_inject import replace_transformer_layer, revert_transformer_layer -from .utils import log_dist +from .utils import log_dist, OnDevice from .comm.comm import init_distributed from .runtime import zero diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index 3c6d4e960d3e..f6722deb582b 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -78,17 +78,23 @@ def load_module_recursive(module, prefix='', level=0): checking_key = prefix + name + '.' 
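# A small self-contained sketch (not taken from this patch) of the parameter check the
# surrounding load_module_recursive hunk generalizes: a ZeRO-3 placeholder parameter
# reports numel() == 0 and carries its real shape in ds_shape, while a parameter built
# on the 'meta' device reports is_meta == True and keeps its shape directly. Either way
# the tensor holds no data and must be materialized from the checkpoint. The helper name
# below is illustrative only.
import torch.nn as nn

def needs_materialization(module: nn.Module) -> bool:
    params = list(module.parameters())
    return len(params) > 0 and (params[0].numel() == 0 or params[0].is_meta)

meta_linear = nn.Linear(8, 8, device='meta')  # requires torch >= 1.10
print(needs_materialization(meta_linear))     # True: the weight exists only as a shape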
if not any(checking_key in item for item in sd.keys()): if hasattr(child, 'weight') and \ - hasattr(child.weight, 'ds_id') and \ - child.weight.ds_id in all_ds_ids: + (hasattr(child.weight, 'ds_id') and \ + child.weight.ds_id in all_ds_ids): prefix1 = all_ds_ids[child.weight.ds_id] if child.__class__ is nn.Linear: child = LinearLayer(weight=all_ds_ids[child.weight.ds_id]) setattr(module, name, child) continue - if len(list(child.parameters())) > 0 and list( - child.parameters())[0].numel() == 0: + child_params = list(child.parameters()) + if len(child_params) > 0 and (child_params[0].numel() == 0 + or child_params[0].is_meta): + if child.weight.is_meta: + ds_shape = child.weight.shape + else: + ds_shape = child.weight.ds_shape + if child.__class__ is nn.LayerNorm: - child = Normalize(dim=child.weight.ds_shape[-1], + child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) @@ -96,7 +102,7 @@ def load_module_recursive(module, prefix='', level=0): ds_id = None if hasattr(child.weight, 'ds_id'): ds_id = child.weight.ds_id - child = EmbeddingLayer(weight_shape=child.weight.ds_shape, + child = EmbeddingLayer(weight_shape=ds_shape, dtype=child.weight.dtype) if ds_id is not None: all_ds_ids[ds_id] = child.weight @@ -109,5 +115,14 @@ def load_module_recursive(module, prefix='', level=0): level + 1) load_module_recursive(r_module) + + #XXX: hack to tie embedding w. lm_head for BLOOM, need to revist soon + embedding_weight = None + for n, p in r_module.named_parameters(): + if "word_embeddings." in n: + embedding_weight = p + assert hasattr(r_module, 'lm_head'), "attempting to set lm_head but it doesn't exist" + r_module.lm_head.weight = embedding_weight + del sd sd = None diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index b49a8ff043e3..4cbb32b622cd 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -365,8 +365,9 @@ def transpose(data): mpl_block = new_module.mlp if attn_linear_layer: - if qkvw.numel() == 0: - if qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel(): + if qkvw.numel() == 0 or qkvw.is_meta: + if qkvw.is_meta or qkvw.ds_tensor.numel( + ) < attn_block.attn_qkvw.numel(): pass else: with GatheredParameters([qkvw, @@ -419,8 +420,9 @@ def _transpose(x): # transformer_config.mp_size) if mlp_linear_layer: - if not moe and _4hh_w.numel() == 0: - if _4hh_w.ds_tensor.numel() < mpl_block.inter_w.numel(): + if not moe and (_4hh_w.numel() == 0 or _4hh_w.is_meta): + if _4hh_w.is_meta or _4hh_w.ds_tensor.numel( + ) < mpl_block.inter_w.numel(): pass else: with GatheredParameters([_h4h_w, @@ -443,8 +445,8 @@ def _transpose(x): _res_4hh_w.data = transpose(_res_4hh_w.data) _res_coef.data = transpose(_res_coef.data) - if qkvw.numel() == 0: - if qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel(): + if qkvw.is_meta or qkvw.numel() == 0 or qkvw.is_meta: + if qkvw.is_meta or qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel(): pass else: with GatheredParameters([attn_qkvw, @@ -498,8 +500,9 @@ def _transpose(x): new_module.res_coef.data = _res_coef.to(torch.cuda.current_device()) else: - if _4hh_w.numel() == 0: - if _4hh_w.ds_tensor.numel() < mpl_block.inter_w.numel(): + if _4hh_w.numel() == 0 or _4hh_w.is_meta: + if _4hh_w.is_meta or _4hh_w.ds_tensor.numel( + ) < mpl_block.inter_w.numel(): pass else: with GatheredParameters([_h4h_w, @@ -524,8 +527,9 @@ def _transpose(x): mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b) 
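# The load_checkpoint change above ties BLOOM's lm_head to the word-embedding weights,
# since the checkpoint stores that matrix only once. A stand-alone illustration of the
# same tying, with made-up layer sizes (the two modules stand in for
# transformer.word_embeddings and model.lm_head):
import torch.nn as nn

embeddings = nn.Embedding(32, 8)
lm_head = nn.Linear(8, 32, bias=False)
lm_head.weight = embeddings.weight  # share one storage instead of loading a second copy
assert lm_head.weight.data_ptr() == embeddings.weight.data_ptr()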
mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w) mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b) - if attn_nw.numel() == 0: - if attn_nw.ds_tensor.numel() < new_module.mlp.attn_nw.numel(): + if attn_nw.is_meta or attn_nw.numel() == 0: + if attn_nw.is_meta or attn_nw.ds_tensor.numel( + ) < new_module.mlp.attn_nw.numel(): pass else: with GatheredParameters([attn_nw, attn_nb], modifier_rank=0): @@ -550,8 +554,9 @@ def _transpose(x): else: new_module.mlp.attn_nb.data.copy_( attn_nb.to(torch.cuda.current_device())) - if input_nw.numel() == 0: - if input_nw.ds_tensor.numel() < new_module.norm_w.numel(): + if input_nw.is_meta or input_nw.numel() == 0: + if input_nw.is_meta or input_nw.ds_tensor.numel( + ) < new_module.norm_w.numel(): pass else: with GatheredParameters([input_nw, input_nb], modifier_rank=0): diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index a909f2039595..9c7ef120f6bd 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -192,6 +192,7 @@ def split_tensor_along_last_dim(tensor, return tensor_list def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): + # [batch_size, seq_length, 3 x hidden_size] --> [batch_size, seq_length, num_heads, 3 x head_dim] head_dim = hidden_size_per_partition // num_attention_heads_per_partition new_tensor_shape = mixed_x_layer.size()[:-1] + ( num_attention_heads_per_partition, diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index efe18c264316..fe7461002932 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -1,5 +1,6 @@ from .logging import logger, log_dist #from .distributed import init_distributed +from .init_on_device import OnDevice from .groups import * from .nvtx import instrument_w_nvtx from deepspeed.runtime.dataloader import RepeatingLoader diff --git a/deepspeed/utils/init_on_device.py b/deepspeed/utils/init_on_device.py new file mode 100644 index 000000000000..d0324766749a --- /dev/null +++ b/deepspeed/utils/init_on_device.py @@ -0,0 +1,81 @@ +''' +Copyright 2020 The Microsoft DeepSpeed Team +''' +import torch +from typing import Callable +from torch import Tensor +from packaging import version as pkg_version + + +class OnDevice(object): + """ + Create modules/tensors w. specific devices and dtypes. Examples: + + Create MyModule which consists of many different sub-modules and parameters. In this case we can create + MyModule as a collection of 'meta' tensors by passing `device='meta'` or we can create the module _directly_ + on a CUDA device by passing `device=f'cuda:{local_rank}'` (where `local_rank` is the local GPU id. 
+ + with OnDevice(dtype=torch.float16, device='meta'): + model = MyModel() + + with OnDevice(dtype=torch.float16, device=f'cuda:{local_rank}'): + model = MyModel() + + """ + + _orig_torch_empty = torch.empty + _orig_torch_zeros = torch.zeros + _orig_torch_ones = torch.ones + _orig_torch_full = torch.full + + def __init__(self, dtype, device="meta", enabled=True): + self.dtype = dtype + self.enabled = enabled + self.device = device + + if device == "meta": + if pkg_version.parse('1.10') > pkg_version.parse(torch.__version__): + raise NotImplementedError( + "Meta tensor support is not available, please upgrade to torch 1.10+" + ) + + def fp_tensor_constructor(self, + fn: Callable, + target_fp_dtype: torch.dtype) -> Callable: + def wrapped_fn(*args, **kwargs) -> Tensor: + if kwargs.get("device", None) is None: + kwargs['device'] = self.device + tensor: Tensor = fn(*args, **kwargs) + if tensor.is_floating_point(): + tensor = tensor.to(target_fp_dtype) + return tensor + + return wrapped_fn + + def get_new_tensor_fn_for_dtype(self, dtype: torch.dtype) -> Callable: + def new_tensor(cls, *args) -> Tensor: + tensor = OnGPU._orig_torch_empty(0, device=self.device).new_empty(*args) + if tensor.is_floating_point(): + tensor = tensor.to(dtype) + return tensor + + return new_tensor + + def __enter__(self): + if not self.enabled: + return + torch.Tensor.__old_new__ = torch.Tensor.__new__ + torch.Tensor.__new__ = self.get_new_tensor_fn_for_dtype(self.dtype) + torch.empty = self.fp_tensor_constructor(self._orig_torch_empty, self.dtype) + torch.zeros = self.fp_tensor_constructor(self._orig_torch_zeros, self.dtype) + torch.ones = self.fp_tensor_constructor(self._orig_torch_ones, self.dtype) + torch.full = self.fp_tensor_constructor(self._orig_torch_full, self.dtype) + + def __exit__(self, exc_type, exc_value, traceback): + if not self.enabled: + return + torch.Tensor.__new__ = torch.Tensor.__old_new__ + torch.empty = self._orig_torch_empty + torch.zeros = self._orig_torch_zeros + torch.ones = self._orig_torch_ones + torch.full = self._orig_torch_full diff --git a/tests/unit/test_init_on_device.py b/tests/unit/test_init_on_device.py new file mode 100644 index 000000000000..2306d315f9e4 --- /dev/null +++ b/tests/unit/test_init_on_device.py @@ -0,0 +1,19 @@ +import torch +import pytest +from .simple_model import SimpleModel +from deepspeed import OnDevice +from packaging import version as pkg_version + + +@pytest.mark.parametrize('device', ['meta', 'cuda:0']) +def test_on_device(device): + if device == "meta" and pkg_version.parse( + torch.__version__) < pkg_version.parse("1.10"): + pytest.skip("meta tensors only became stable after torch 1.10") + + with OnDevice(dtype=torch.half, device=device): + model = SimpleModel(4) + + for p in model.parameters(): + assert p.device == torch.device(device) + assert p.dtype == torch.half From b6503ed134f8dcdb52faa5ce044bfba74975095a Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 19 Jul 2022 00:17:18 +0500 Subject: [PATCH 16/22] fix the Bert and GPT-J unit tests --- deepspeed/module_inject/replace_module.py | 30 ++++++++----------- .../inference/transformer_inference.py | 4 +-- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 4cbb32b622cd..5a9cb040626a 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -527,33 +527,27 @@ def _transpose(x): mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b) 
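# A usage sketch of the OnDevice context manager introduced above (requires torch >= 1.10
# for device='meta'; the nn.Linear stands in for a real model class such as the
# SimpleModel used in the unit test):
import torch
import torch.nn as nn
from deepspeed import OnDevice

with OnDevice(dtype=torch.half, device='meta'):
    model = nn.Linear(1024, 1024)

# Every floating-point parameter is created as an fp16 tensor on the meta device, so the
# model is constructed without allocating GPU or host memory for its weights.
assert all(p.is_meta and p.dtype == torch.half for p in model.parameters())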
mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w) mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b) - if attn_nw.is_meta or attn_nw.numel() == 0: - if attn_nw.is_meta or attn_nw.ds_tensor.numel( - ) < new_module.mlp.attn_nw.numel(): - pass - else: - with GatheredParameters([attn_nw, attn_nb], modifier_rank=0): - if attn_nw is None: - new_module.mlp.attn_nw = attn_nw - else: + + if attn_nw is None: + new_module.mlp.attn_nw = attn_nw + new_module.mlp.attn_nb = attn_nb + else: + if attn_nw.is_meta or attn_nw.numel() == 0: + if attn_nw.is_meta or attn_nw.ds_tensor.numel( + ) < new_module.mlp.attn_nw.numel(): + pass + else: + with GatheredParameters([attn_nw, attn_nb], modifier_rank=0): new_module.mlp.attn_nw.data.copy_( attn_nw.to(torch.cuda.current_device())) - if attn_nb is None: - new_module.mlp.attn_nb = attn_nb - else: new_module.mlp.attn_nb.data.copy_( attn_nb.to(torch.cuda.current_device())) - else: - if attn_nw is None: - new_module.mlp.attn_nw = attn_nw else: new_module.mlp.attn_nw.data.copy_( attn_nw.to(torch.cuda.current_device())) - if attn_nb is None: - new_module.mlp.attn_nb = attn_nb - else: new_module.mlp.attn_nb.data.copy_( attn_nb.to(torch.cuda.current_device())) + if input_nw.is_meta or input_nw.numel() == 0: if input_nw.is_meta or input_nw.ds_tensor.numel( ) < new_module.norm_w.numel(): diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 9c7ef120f6bd..806fa968f362 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -192,7 +192,6 @@ def split_tensor_along_last_dim(tensor, return tensor_list def backup_attention(mixed_x_layer, layer_past, alibi, input_mask, norm_factor): - # [batch_size, seq_length, 3 x hidden_size] --> [batch_size, seq_length, num_heads, 3 x head_dim] head_dim = hidden_size_per_partition // num_attention_heads_per_partition new_tensor_shape = mixed_x_layer.size()[:-1] + ( num_attention_heads_per_partition, @@ -379,6 +378,7 @@ def compute_attention(qkv_out, input_mask): offset = dist.get_rank() * batch_heads if dist.is_initialized( ) else 0 sliced_alibi = alibi[offset:batch_heads + offset, :, :] + attn_key_value = score_context_func( qkv_out, ((1 - input_mask).half() * @@ -392,7 +392,7 @@ def compute_attention(qkv_out, input_mask): config.local_attention, config.window_size, no_masking, - config.layer_id, + config.layer_id if config.bigscience_bloom else 1, DeepSpeedTransformerInference.layer_id, sliced_alibi if alibi is not None else torch.empty(1)) context_layer, key_layer, value_layer = attn_key_value From a9459d6d7ecbfd880ff651ab968bfb174e4571b8 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 18 Jul 2022 12:34:05 -0700 Subject: [PATCH 17/22] fix for OneDevice --- deepspeed/utils/init_on_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/utils/init_on_device.py b/deepspeed/utils/init_on_device.py index d0324766749a..1b51efe60384 100644 --- a/deepspeed/utils/init_on_device.py +++ b/deepspeed/utils/init_on_device.py @@ -54,7 +54,7 @@ def wrapped_fn(*args, **kwargs) -> Tensor: def get_new_tensor_fn_for_dtype(self, dtype: torch.dtype) -> Callable: def new_tensor(cls, *args) -> Tensor: - tensor = OnGPU._orig_torch_empty(0, device=self.device).new_empty(*args) + tensor = OnDevice._orig_torch_empty(0, device=self.device).new_empty(*args) if tensor.is_floating_point(): tensor = tensor.to(dtype) return tensor From 
72aba56c5aca1babd816ac6ea5c980926837604c Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 19 Jul 2022 02:18:27 +0500 Subject: [PATCH 18/22] added bloom inference tests --- tests/unit/test_inference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_inference.py b/tests/unit/test_inference.py index c3dbc49055b2..006fe6cc884f 100644 --- a/tests/unit/test_inference.py +++ b/tests/unit/test_inference.py @@ -52,6 +52,7 @@ def lm_eval_imports(): "distilgpt2", "Norod78/hebrew-bad_wiki-gpt_neo-tiny", "EleutherAI/gpt-j-6B", + "bigscience/bloom-350m", ] _all_models = HfApi().list_models() From 332f69dbf3450d6a778a003b95d374d1a38fab31 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 19 Jul 2022 02:21:36 +0500 Subject: [PATCH 19/22] fixing the masking stride for the GPT models --- csrc/transformer/inference/csrc/pt_binding.cpp | 8 ++++---- .../ops/transformer/inference/transformer_inference.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index 187fa1d23cce..360beaa10284 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -35,7 +35,7 @@ at::Tensor ds_softmax(at::Tensor& attn_scores, if (len > 1) heads = attn_scores_c.size(1); int mask_stride = 1; - if (attn_mask.sizes().size() > 2 && attn_mask.size(2) > 1) mask_stride = attn_mask.size(2); + if (attn_mask.sizes().size() > 2) mask_stride = attn_mask.size(2); launch_attn_softmax_v2((T*)attn_scores_c.data_ptr(), (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), @@ -263,8 +263,8 @@ void ds_softmax_internal(T* attn_scores, int soft_len, int heads) { - int mask_stride = heads; - if (attn_mask.sizes().size() > 2 && attn_mask.size(2) == 1) mask_stride *= seq_len; + int mask_stride = 1; + if (attn_mask.sizes().size() > 2) mask_stride = attn_mask.size(2); launch_attn_softmax_v2((T*)attn_scores, (attn_mask.sizes().size() > 1 ? (T*)attn_mask.data_ptr() : nullptr), (alibi.sizes().size() > 1 ? (T*)alibi.data_ptr() : nullptr), @@ -302,7 +302,7 @@ void attention_unfused(T* prev_key_cont, at::Tensor& alibi, int layer_id) { - float layer_scale = std::max(1, layer_id); + float layer_scale = alibi.sizes().size() > 1 ? 
std::max(1, layer_id) : 1.0; float alpha = norm_factor * norm_factor / layer_scale; float gemm_beta = 0.0; T* workspace = (T*)output + bsz * seq_len * heads * k; diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 806fa968f362..20d3215cd53e 100755 --- a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -381,7 +381,7 @@ def compute_attention(qkv_out, input_mask): attn_key_value = score_context_func( qkv_out, - ((1 - input_mask).half() * + ((1 - input_mask).to(qkv_out.dtype) * minus_inf) if input_mask.dtype == torch.int64 else input_mask, config.rotary_dim, config.rotate_half, @@ -392,7 +392,7 @@ def compute_attention(qkv_out, input_mask): config.local_attention, config.window_size, no_masking, - config.layer_id if config.bigscience_bloom else 1, + config.layer_id, DeepSpeedTransformerInference.layer_id, sliced_alibi if alibi is not None else torch.empty(1)) context_layer, key_layer, value_layer = attn_key_value From d38464e0633c90a4352111f105f699306b64d797 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 19 Jul 2022 02:34:53 +0500 Subject: [PATCH 20/22] revert back some changes on replace_module --- deepspeed/module_inject/replace_module.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index 5a9cb040626a..306feac322ea 100755 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -357,8 +357,10 @@ def replace_with_policy(child, # transpose it here to reduce inference cost! def transpose(data): # temp move to cpu to avoid requiring extra GPU memory during the reshape + data = data.to('cpu') data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1)) data = data.reshape(data.shape[-1], data.shape[-2]) + data = data.to(torch.cuda.current_device()) return data attn_block = new_module.attention From ac2d092e8d1a37daf633ba7dae5e5ab87d23db08 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 19 Jul 2022 03:23:48 +0500 Subject: [PATCH 21/22] fix fp32 softmax --- csrc/transformer/inference/csrc/softmax.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/csrc/transformer/inference/csrc/softmax.cu b/csrc/transformer/inference/csrc/softmax.cu index 2c6da24fbf1e..dcc90c3b5cbb 100644 --- a/csrc/transformer/inference/csrc/softmax.cu +++ b/csrc/transformer/inference/csrc/softmax.cu @@ -267,7 +267,10 @@ __global__ void attn_softmax_v2(float* vals, if (iter_offset < total_count) { vals += (iter_offset * sequence_length); - int mask_offset = (iter_offset / mask_stride) * (sequence_length); + int batch_idx = iter_offset / (num_seq * heads); + int alibi_offset = batch_idx * heads * mp_size + head_offset; + int mask_offset = batch_idx * mask_stride + (iter_offset % mask_stride); + int seq_id = iter_offset % num_seq; int seq_id4 = seq_id >> 2; From 1e3ea74c19d7e777a4e67035b1b11ea05dc60192 Mon Sep 17 00:00:00 2001 From: Reza Yazdani Date: Tue, 19 Jul 2022 03:43:27 +0500 Subject: [PATCH 22/22] allocate tensors initially on cpu at inference-api --- .../inference/transformer_inference.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/deepspeed/ops/transformer/inference/transformer_inference.py b/deepspeed/ops/transformer/inference/transformer_inference.py index 20d3215cd53e..342f4e1b595f 100755 ---
a/deepspeed/ops/transformer/inference/transformer_inference.py +++ b/deepspeed/ops/transformer/inference/transformer_inference.py @@ -486,26 +486,27 @@ def __init__(self, data_type = torch.half if config.fp16 else torch.float self.config.layer_id = DeepSpeedSelfAttention.num_layers DeepSpeedSelfAttention.num_layers = DeepSpeedSelfAttention.num_layers + 1 + device = torch.cuda.current_device() if config.bigscience_bloom else 'cpu' self.attn_qkvw = nn.Parameter( torch.empty(self.config.hidden_size, (self.config.hidden_size // self.config.mp_size) * 3, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.attn_qkvb = nn.Parameter( torch.empty((self.config.hidden_size // self.config.mp_size) * 3, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.attn_ow = nn.Parameter( torch.empty(self.config.hidden_size // self.config.mp_size, self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.attn_ob = nn.Parameter( torch.empty(self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.num_attention_heads_per_partition = self.config.heads // self.config.mp_size self.hidden_size_per_partition = self.config.hidden_size // self.config.mp_size @@ -663,32 +664,33 @@ def __init__(self, self.config = config data_type = torch.half if config.fp16 else torch.float + device = torch.cuda.current_device() if config.bigscience_bloom else 'cpu' self.attn_nw = nn.Parameter( torch.empty(self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.attn_nb = nn.Parameter( torch.empty(self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.inter_w = nn.Parameter( torch.empty(self.config.hidden_size, self.config.intermediate_size // self.config.mp_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.inter_b = nn.Parameter( torch.empty(self.config.intermediate_size // self.config.mp_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.output_w = nn.Parameter( torch.empty((self.config.intermediate_size // self.config.mp_size), self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.output_b = nn.Parameter( torch.empty(self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) # used for quantization self.q_scales = q_scales @@ -781,14 +783,15 @@ def __init__(self, merge_count, mlp_extra_grouping) + device = torch.cuda.current_device() if config.bigscience_bloom else 'cpu' self.norm_w = nn.Parameter( torch.empty(self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.norm_b = nn.Parameter( torch.empty(self.config.hidden_size, dtype=data_type, - device=torch.cuda.current_device())) + device=device)) self.layer_past = None def forward(self,