From 6082e352fa4f8d70deba8be48bf082daa43a69b7 Mon Sep 17 00:00:00 2001
From: Reza Yazdani <reyazda@microsoft.com>
Date: Sat, 27 Aug 2022 12:44:55 +0500
Subject: [PATCH 1/2] fix the bias-add issue when using tensor-parallelism

---
 csrc/transformer/inference/csrc/gelu.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index ab44f223ac0f..8391058b913a 100644
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -389,16 +389,16 @@ __global__ void gptj_residual_add(__half* input,
             __half2* attnbias_half = reinterpret_cast<__half2*>(&attn_bias_vec);
             float2 attn_low_bias = __half22float2(attnbias_half[0]);
             float2 attn_high_bias = __half22float2(attnbias_half[1]);
-            low_data.x += attn_low_bias.x;
-            low_data.y += attn_low_bias.y;
-            high_data.x += attn_high_bias.x;
-            high_data.y += attn_high_bias.y;
+            low_data.x += attn_low_bias.x * mp_scale;
+            low_data.y += attn_low_bias.y * mp_scale;
+            high_data.x += attn_high_bias.x * mp_scale;
+            high_data.y += attn_high_bias.y * mp_scale;
         }
 
-        low_data.x = low_data.x * mp_scale + (low_out.x + low_res.x + (low_bias.x));
-        low_data.y = low_data.y * mp_scale + (low_out.y + low_res.y + (low_bias.y));
-        high_data.x = high_data.x * mp_scale + (high_out.x + high_res.x + (high_bias.x));
-        high_data.y = high_data.y * mp_scale + (high_out.y + high_res.y + (high_bias.y));
+        low_data.x = low_res.x + low_out.x + (low_data.x + low_bias.x) * mp_scale;
+        low_data.y = low_res.y + low_out.y + (low_data.y + low_bias.y) * mp_scale;
+        high_data.x = high_res.x + high_out.x + (high_data.x + high_bias.x) * mp_scale;
+        high_data.y = high_res.y + high_out.y + (high_data.y + high_bias.y) * mp_scale;
 
         vals_half[0] = __float22half2_rn(low_data);
         vals_half[1] = __float22half2_rn(high_data);

From 9eea4ee4abf27e0242a562381f92babbb3718841 Mon Sep 17 00:00:00 2001
From: Reza Yazdani <reyazda@microsoft.com>
Date: Sat, 27 Aug 2022 12:48:56 +0500
Subject: [PATCH 2/2] fixing float32 kernel

---
 csrc/transformer/inference/csrc/gelu.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/csrc/transformer/inference/csrc/gelu.cu b/csrc/transformer/inference/csrc/gelu.cu
index 8391058b913a..ccd5469826d0 100644
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@@ -327,15 +327,15 @@ __global__ void gptj_residual_add(float* input,
 
         if (attnbias) {
             float4 attn_bias = attnbias_cast[offset % intermediate_size];
-            data.x += attn_bias.x;
-            data.y += attn_bias.y;
-            data.z += attn_bias.z;
-            data.w += attn_bias.w;
+            data.x += attn_bias.x * mp_scale;
+            data.y += attn_bias.y * mp_scale;
+            data.z += attn_bias.z * mp_scale;
+            data.w += attn_bias.w * mp_scale;
         }
-        data.x = data.x * mp_scale + (out.x + res_vec.x + bias_data.x);
-        data.y = data.y * mp_scale + (out.y + res_vec.y + bias_data.y);
-        data.z = data.z * mp_scale + (out.z + res_vec.z + bias_data.z);
-        data.w = data.w * mp_scale + (out.w + res_vec.w + bias_data.w);
+        data.x = out.x + res_vec.x + (data.x + bias_data.x) * mp_scale;
+        data.y = out.y + res_vec.y + (data.y + bias_data.y) * mp_scale;
+        data.z = out.z + res_vec.z + (data.z + bias_data.z) * mp_scale;
+        data.w = out.w + res_vec.w + (data.w + bias_data.w) * mp_scale;
 
         output_cast[offset] = data;
     }