Use vector load for HIP FP16 in Vec4T

Summary: Before this diff, the HIP path of TBE's Vec4T performed four sequential scalar loads to read its half-precision (FP16) input. This diff replaces them with a single vector load that fetches all four halves at once.

Reviewed By: jspark1105

Differential Revision: D39267283

fbshipit-source-id: 089451de9b79a0219ae5aef9b41bbfcb292f8ce2
pytorch · Sep 8, 2022 · 3294fa0 · 3294fa0
1 parent f2c7c11
commit 3294fa0
Showing 1 changed file with 15 additions and 4 deletions.
diff --git a/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh b/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
@@ -98,10 +98,21 @@ struct Vec4T<float> {
 
   DEVICE_INLINE Vec4T(const at::Half* p) {
 #ifdef __HIP_PLATFORM_HCC__
-    acc.x = __half2float(p[0]);
-    acc.y = __half2float(p[1]);
-    acc.z = __half2float(p[2]);
-    acc.w = __half2float(p[3]);
+    union U {
+      half2 h[2];
+      uint2 ui;
+    } tmp_out;
+
+    // uint2 = 2 uints = 8 bytes
+    tmp_out.ui = *reinterpret_cast<uint2 const*>(p);
+
+    float2 a = __half22float2(tmp_out.h[0]);
+    float2 b = __half22float2(tmp_out.h[1]);
+
+    acc.x = a.x;
+    acc.y = a.y;
+    acc.z = b.x;
+    acc.w = b.y;
 #else
     Half4 out;
 #if CUDA_VERSION >= 9000