Skip to content

Commit

Permalink
Use vector load for HIP FP16 in Vec4T
Browse files Browse the repository at this point in the history
Summary:
Before this diff, the HIP path performed 4 sequential scalar loads for the half
input in TBE's Vec4T.  This diff replaces them with a single 8-byte vector load that reads all 4 halves at once.

Reviewed By: jspark1105

Differential Revision: D39267283

fbshipit-source-id: 089451de9b79a0219ae5aef9b41bbfcb292f8ce2
  • Loading branch information
sryap committed Sep 8, 2022
1 parent f2c7c11 commit 3294fa0
Showing 1 changed file with 15 additions and 4 deletions.
19 changes: 15 additions & 4 deletions fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,21 @@ struct Vec4T<float> {

DEVICE_INLINE Vec4T(const at::Half* p) {
#ifdef __HIP_PLATFORM_HCC__
acc.x = __half2float(p[0]);
acc.y = __half2float(p[1]);
acc.z = __half2float(p[2]);
acc.w = __half2float(p[3]);
union U {
half2 h[2];
uint2 ui;
} tmp_out;

// uint2 = 2 uints = 8 bytes
tmp_out.ui = *reinterpret_cast<uint2 const*>(p);

float2 a = __half22float2(tmp_out.h[0]);
float2 b = __half22float2(tmp_out.h[1]);

acc.x = a.x;
acc.y = a.y;
acc.z = b.x;
acc.w = b.y;
#else
Half4 out;
#if CUDA_VERSION >= 9000
Expand Down

0 comments on commit 3294fa0

Please sign in to comment.