diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_device_functions.h b/third_party/amd/backend/include/hip/amd_detail/amd_device_functions.h index 2a5cf48a0397..c4837ad64c4d 100644 --- a/third_party/amd/backend/include/hip/amd_detail/amd_device_functions.h +++ b/third_party/amd/backend/include/hip/amd_detail/amd_device_functions.h @@ -266,14 +266,14 @@ __device__ static inline int __mul24(int x, int y) { } __device__ static inline long long __mul64hi(long long int x, long long int y) { - ulong x0 = (ulong)x & 0xffffffffUL; - long x1 = x >> 32; - ulong y0 = (ulong)y & 0xffffffffUL; - long y1 = y >> 32; - ulong z0 = x0*y0; - long t = x1*y0 + (z0 >> 32); - long z1 = t & 0xffffffffL; - long z2 = t >> 32; + unsigned long long x0 = (unsigned long long)x & 0xffffffffUL; + long long x1 = x >> 32; + unsigned long long y0 = (unsigned long long)y & 0xffffffffUL; + long long y1 = y >> 32; + unsigned long long z0 = x0*y0; + long long t = x1*y0 + (z0 >> 32); + long long z1 = t & 0xffffffffL; + long long z2 = t >> 32; z1 = x0*y1 + z1; return x1*y1 + z2 + (z1 >> 32); } @@ -300,14 +300,14 @@ __device__ static inline int __umul24(unsigned int x, unsigned int y) { __device__ static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) { - ulong x0 = x & 0xffffffffUL; - ulong x1 = x >> 32; - ulong y0 = y & 0xffffffffUL; - ulong y1 = y >> 32; - ulong z0 = x0*y0; - ulong t = x1*y0 + (z0 >> 32); - ulong z1 = t & 0xffffffffUL; - ulong z2 = t >> 32; + unsigned long long x0 = x & 0xffffffffUL; + unsigned long long x1 = x >> 32; + unsigned long long y0 = y & 0xffffffffUL; + unsigned long long y1 = y >> 32; + unsigned long long z0 = x0*y0; + unsigned long long t = x1*y0 + (z0 >> 32); + unsigned long long z1 = t & 0xffffffffUL; + unsigned long long z2 = t >> 32; z1 = x0*y1 + z1; return x1*y1 + z2 + (z1 >> 32); } @@ -322,11 +322,6 @@ __device__ static inline unsigned int __usad(unsigned int x, unsigned int y, uns return __ockl_sadd_u32(x, y, z); 
} -__device__ static inline unsigned int __lane_id() { - return __builtin_amdgcn_mbcnt_hi( - -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); -} - __device__ static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);}; @@ -339,6 +334,7 @@ HIP specific device functions #if !defined(__HIPCC_RTC__) #include "amd_warp_functions.h" +#include "amd_warp_sync_functions.h" #endif #define MASK1 0x00ff00ff @@ -687,34 +683,6 @@ void __named_sync() { __builtin_amdgcn_s_barrier(); } #endif // __HIP_DEVICE_COMPILE__ -// warp vote function __all __any __ballot -__device__ -inline -int __all(int predicate) { - return __ockl_wfall_i32(predicate); -} - -__device__ -inline -int __any(int predicate) { - return __ockl_wfany_i32(predicate); -} - -// XXX from llvm/include/llvm/IR/InstrTypes.h -#define ICMP_NE 33 - -__device__ -inline -unsigned long long int __ballot(int predicate) { - return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); -} - -__device__ -inline -unsigned long long int __ballot64(int predicate) { - return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); -} - // hip.amdgcn.bc - lanemask __device__ inline @@ -877,6 +845,10 @@ int __syncthreads_or(int predicate) #if (defined(__GFX10__) || defined(__GFX11__)) #define HW_ID_WGP_ID_SIZE 4 #define HW_ID_WGP_ID_OFFSET 10 + #if (defined(__AMDGCN_CUMODE__)) + #define HW_ID_CU_ID_SIZE 1 + #define HW_ID_CU_ID_OFFSET 8 + #endif #else #define HW_ID_CU_ID_SIZE 4 #define HW_ID_CU_ID_OFFSET 8 @@ -933,6 +905,10 @@ unsigned __smid(void) GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID)); unsigned sa_id = __builtin_amdgcn_s_getreg( GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID)); + #if (defined(__AMDGCN_CUMODE__)) + unsigned cu_id = __builtin_amdgcn_s_getreg( + GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID)); + #endif #else #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) unsigned xcc_id = __builtin_amdgcn_s_getreg( @@ -945,6 +921,9 @@ 
unsigned __smid(void) unsigned temp = se_id; temp = (temp << HW_ID_SA_ID_SIZE) | sa_id; temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id; + #if (defined(__AMDGCN_CUMODE__)) + temp = (temp << HW_ID_CU_ID_SIZE) | cu_id; + #endif return temp; //TODO : CU Mode impl #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_hip_atomic.h b/third_party/amd/backend/include/hip/amd_detail/amd_hip_atomic.h index ef719f3713c6..d6e4d8186909 100644 --- a/third_party/amd/backend/include/hip/amd_detail/amd_hip_atomic.h +++ b/third_party/amd/backend/include/hip/amd_detail/amd_hip_atomic.h @@ -612,11 +612,17 @@ float atomicMin(float* addr, float val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMin(addr, val); #else + typedef union u_hold { + float a; + unsigned int b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x80000000U == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value > val) { + while (!done && (value > val || (neg_zero && value == 0.0f))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -625,7 +631,7 @@ float atomicMin(float* addr, float val) { unsigned int *uaddr = (unsigned int *)addr; unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __uint_as_float(value) > val) { + while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) { done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -658,11 +664,17 @@ double atomicMin(double* addr, double val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMin(addr, val); #else + typedef union u_hold { + double a; + 
unsigned long long b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x8000000000000000ULL == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value > val) { + while (!done && (value > val || (neg_zero && value == 0.0))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -671,7 +683,8 @@ double atomicMin(double* addr, double val) { unsigned long long *uaddr = (unsigned long long *)addr; unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __longlong_as_double(value) > val) { + while (!done && + (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) { done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -856,11 +869,17 @@ float atomicMax(float* addr, float val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + float a; + unsigned int b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x80000000U == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0f))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -869,7 +888,7 @@ float atomicMax(float* addr, float val) { unsigned int *uaddr = (unsigned int *)addr; unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __uint_as_float(value) < val) { + while (!done && (__uint_as_float(value) 
< val || (neg_zero && __uint_as_float(value) == 0.0f))) { done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -902,11 +921,17 @@ double atomicMax(double* addr, double val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + double a; + unsigned long long b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x8000000000000000ULL == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -915,7 +940,8 @@ double atomicMax(double* addr, double val) { unsigned long long *uaddr = (unsigned long long *)addr; unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __longlong_as_double(value) < val) { + while (!done && + (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) { done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -977,7 +1003,7 @@ unsigned int atomicDec(unsigned int* address, unsigned int val) #else return __builtin_amdgcn_atomic_dec32(address, val, __ATOMIC_RELAXED, "agent"); #endif // __gfx941__ - + } __device__ diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_hip_bf16.h b/third_party/amd/backend/include/hip/amd_detail/amd_hip_bf16.h index 204269a849c6..cfaa5412a3aa 100644 --- a/third_party/amd/backend/include/hip/amd_detail/amd_hip_bf16.h +++ b/third_party/amd/backend/include/hip/amd_detail/amd_hip_bf16.h @@ -1,7 +1,7 @@ /** * MIT License * - * Copyright (c) 2019 - 2023 Advanced Micro 
Devices, Inc. All rights reserved. + * Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -81,6 +81,17 @@ * To use these functions, include the header file \p hip_bf16.h in your program. */ +/** + * \defgroup HIP_INTRINSIC_BFLOAT16_RAW Bfloat16 Raw Struct + * \ingroup HIP_INTRINSIC_BFLOAT16 + * To use these functions, include the header file \p hip_bf16.h in your program. + */ + +/** + * \defgroup HIP_INTRINSIC_BFLOAT162_RAW Bfloat162 Raw Struct + * \ingroup HIP_INTRINSIC_BFLOAT16 + * To use these functions, include the header file \p hip_bf16.h in your program. + */ #ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_ #define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_BF16_H_ @@ -93,13 +104,30 @@ #include "device_library_decls.h" // ocml conversion functions #include "math_fwd.h" // ocml device functions +#define __BF16_DEVICE__ __device__ #if defined(__HIPCC_RTC__) -#define __HOST_DEVICE__ __device__ static +#define __BF16_HOST_DEVICE__ __BF16_DEVICE__ #else #include #include #include -#define __HOST_DEVICE__ __host__ __device__ static inline +#define __BF16_HOST_DEVICE__ __host__ __BF16_DEVICE__ +#endif +#define __BF16_DEVICE_STATIC__ __BF16_DEVICE__ static inline +#define __BF16_HOST_DEVICE_STATIC__ __BF16_HOST_DEVICE__ static inline + +#if defined(__AVX512VL__) and defined(__AVX512BF16__) and not defined(__HIP_DEVICE_COMPILE__) +// Enable with -mavx512vl -mavx512bf16 +#if defined(__MINGW64__) +#include +#else +#include +#endif +#define HIP_BF16_AVX512_OP 1 +static_assert(sizeof(__bf16) == sizeof(unsigned short), + "sizeof __bf16 should match sizeof unsigned short"); +#else +#define HIP_BF16_AVX512_OP 0 #endif #define HIPRT_ONE_BF16 __float2bfloat16(1.0f) @@ -118,72 +146,361 @@ static_assert(CHAR_BIT == 8, "byte size should be of 8 bits"); #endif static_assert(sizeof(unsigned short) == 2, 
"size of unsigned short should be 2 bytes"); -/*! \brief Struct to represent a 16 bit brain floating point number. */ -struct __hip_bfloat16 { - unsigned short data; +/** + * \ingroup HIP_INTRINSIC_BFLOAT16_RAW + * \brief represents raw bfloat16 type + */ +typedef struct __attribute__((aligned(2))) { + unsigned short x; +} __hip_bfloat16_raw; + +/** + * \ingroup HIP_INTRINSIC_BFLOAT162_RAW + * \brief represents raw bfloat16x2 vector type + */ +typedef struct __attribute__((aligned(4))) { + unsigned short x; + unsigned short y; +} __hip_bfloat162_raw; + +/** + * \defgroup HIP_INTRINSIC_BFLOAT16_STRUCT + * \ingroup HIP_INTRINSIC_BFLOAT16 + * \brief Struct to represent a 16 bit brain floating point number. + * @{ + */ +struct __attribute__((aligned(2))) __hip_bfloat16 { + private: + __BF16_HOST_DEVICE_STATIC__ float bfloatraw_2_float(unsigned short val) { +#if HIP_BF16_AVX512_OP + union { + unsigned short us; + __bf16 bf16; + } u = {val}; + return _mm_cvtsbh_ss(u.bf16); +#else + unsigned int uval = val << 16; + union { + unsigned int u32; + float fp32; + } u = {uval}; + return u.fp32; +#endif + } + __BF16_HOST_DEVICE_STATIC__ unsigned short float_2_bfloatraw(float f) { +#if HIP_BF16_AVX512_OP + union { + __bf16 bf16; + unsigned short us; + } u = {_mm_cvtness_sbh(f)}; + return u.us; +#else + union { + float fp32; + unsigned int u32; + } u = {f}; + if (~u.u32 & 0x7f800000) { + // When the exponent bits are not all 1s, then the value is zero, normal, + // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus + // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). + // This causes the bfloat16's mantissa to be incremented by 1 if the 16 + // least significant bits of the float mantissa are greater than 0x8000, + // or if they are equal to 0x8000 and the least significant bit of the + // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when + // the lower 16 bits are exactly 0x8000. 
If the bfloat16 mantissa already + // has the value 0x7f, then incrementing it causes it to become 0x00 and + // the exponent is incremented by one, which is the next higher FP value + // to the unrounded bfloat16 value. When the bfloat16 value is subnormal + // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up + // to a normal value with an exponent of 0x01 and a mantissa of 0x00. + // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, + // incrementing it causes it to become an exponent of 0xFF and a mantissa + // of 0x00, which is Inf, the next higher value to the unrounded value. + u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // Round to nearest, round to even + } else if (u.u32 & 0xffff) { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bfloat16's mantissa bits are all 0. + u.u32 |= 0x10000; // Preserve signaling NaN + } + return static_cast<unsigned short>(u.u32 >> 16); +#endif + } + + __BF16_HOST_DEVICE_STATIC__ unsigned short double_2_bfloatraw(double d_in) { + union { + float fp32; + unsigned int u32; + } u = {static_cast<float>(d_in)}; + double d = u.fp32; + + // Round to odd + if ((d_in > 0.0 && d > d_in) || (d_in < 0.0 && d < d_in)) { + u.u32--; + u.u32 |= 1; + } + + return float_2_bfloatraw(u.fp32); + } + + protected: + /*! 
\brief raw representation of bfloat16 */ + unsigned short __x; + + public: + // TODO: SWDEV-452411 + // Need to add constructor of __hip_bfloat16 from + // unsigned long long + // long long + // long + // unsigned long + // Casting directly to double might lead to double rounding. + + /*! \brief create __hip_bfloat16 from an unsigned int */ + __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned int val) + : __x(double_2_bfloatraw(static_cast<double>(val))) {} + + /*! \brief create __hip_bfloat16 from a int */ + __BF16_HOST_DEVICE__ __hip_bfloat16(int val) + : __x(double_2_bfloatraw(static_cast<double>(val))) {} + + /*! \brief create __hip_bfloat16 from an unsigned short */ + __BF16_HOST_DEVICE__ __hip_bfloat16(unsigned short val) + : __x(float_2_bfloatraw(static_cast<float>(val))) {} + + /*! \brief create __hip_bfloat16 from a short */ + __BF16_HOST_DEVICE__ __hip_bfloat16(short val) + : __x(float_2_bfloatraw(static_cast<float>(val))) {} + + /*! \brief create __hip_bfloat16 from a double */ + __BF16_HOST_DEVICE__ __hip_bfloat16(const double val) : __x(double_2_bfloatraw(val)) {} + + /*! \brief create __hip_bfloat16 from a float */ + __BF16_HOST_DEVICE__ __hip_bfloat16(const float val) : __x(float_2_bfloatraw(val)) {} + + /*! \brief create __hip_bfloat16 from a __hip_bfloat16_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat16(const __hip_bfloat16_raw& val) : __x(val.x) {} + + /*! \brief default constructor */ + __BF16_HOST_DEVICE__ __hip_bfloat16() = default; + + /*! \brief return a __hip_bfloat16_raw */ + __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const { return __hip_bfloat16_raw{__x}; } + + /*! \brief return a __hip_bfloat16_raw cv qualifier */ + __BF16_HOST_DEVICE__ operator __hip_bfloat16_raw() const volatile { + return __hip_bfloat16_raw{__x}; + } + + /*! \brief return false if bfloat value is +0.0 or -0.0, returns true otherwise */ + __BF16_HOST_DEVICE__ operator bool() const { + auto val = bfloatraw_2_float(__x); + return val != 0.0f && val != -0.0f; + } + + /*! 
\brief return a casted char from underlying float val */ + __BF16_HOST_DEVICE__ operator char() const { return static_cast<char>(bfloatraw_2_float(__x)); } + + /*! \brief return a float */ + __BF16_HOST_DEVICE__ operator float() const { return bfloatraw_2_float(__x); } + + /*! \brief return a casted int casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator int() const { return static_cast<int>(bfloatraw_2_float(__x)); } + + /*! \brief return a casted long casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator long() const { return static_cast<long>(bfloatraw_2_float(__x)); } + + /*! \brief return a casted long long casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator long long() const { + return static_cast<long long>(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted short casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator short() const { return static_cast<short>(bfloatraw_2_float(__x)); } + + /*! \brief return a casted signed char from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator signed char() const { + return static_cast<signed char>(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned char casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned char() const { + return static_cast<unsigned char>(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned int casted from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned int() const { + return static_cast<unsigned int>(bfloatraw_2_float(__x)); + } + + /*! \brief return a casted unsigned from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned long() const { + return static_cast<unsigned long>(bfloatraw_2_float(__x)); + } + + /*! 
\brief return a casted unsigned long long from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned long long() const { + return static_cast<unsigned long long>(bfloatraw_2_float(__x)); + } + + /*! 
\brief return a casted unsigned short from float of underlying bfloat16 value */ + __BF16_HOST_DEVICE__ operator unsigned short() const { + return static_cast<unsigned short>(bfloatraw_2_float(__x)); + } + + // TODO: SWDEV-452411 add operator which converts unsigned long long and long long to bfloat + + /*! \brief assign value from an unsigned int */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned int val) { + __x = float_2_bfloatraw(static_cast<float>(val)); + return *this; + } + + /*! \brief assign value from a int */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(int val) { + __x = float_2_bfloatraw(static_cast<float>(val)); + return *this; + } + + /*! \brief assign value from an unsigned short */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(unsigned short val) { + __x = float_2_bfloatraw(static_cast<float>(val)); + return *this; + } + + /*! \brief assign value from a short int */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(short val) { + __x = float_2_bfloatraw(static_cast<float>(val)); + return *this; + } + + /*! \brief assign value from a double */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const double f) { + __x = float_2_bfloatraw(static_cast<float>(f)); + return *this; + } + + /*! \brief assign value from a float */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const float f) { + __x = float_2_bfloatraw(f); + return *this; + } + + /*! \brief assign value from a __hip_bfloat16_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) { + __x = hr.x; + return *this; + } + + /*! \brief assign value from a __hip_bfloat16_raw volatile */ + __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=(const __hip_bfloat16_raw& hr) volatile { + __x = hr.x; + return *this; + } + + /*! 
\brief assign value from a __hip_bfloat16_raw cv qualifier */ + __BF16_HOST_DEVICE__ volatile __hip_bfloat16& operator=( + const volatile __hip_bfloat16_raw& hr) volatile { + __x = hr.x; + return *this; + } }; +/**@}*/ + +/** + * \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT + * \ingroup HIP_INTRINSIC_BFLOAT16 + * \brief Struct to represent a two 16 bit brain floating point number. + * @{ + */ +struct __attribute__((aligned(4))) __hip_bfloat162 { + public: + __hip_bfloat16 x; /*! \brief raw representation of bfloat16 */ + __hip_bfloat16 y; /*! \brief raw representation of bfloat16 */ + + + public: + /*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162_raw& h2r) + : x(__hip_bfloat16(__hip_bfloat16_raw{h2r.x})), + y(__hip_bfloat16(__hip_bfloat16_raw{h2r.y})) {} + + /*! \brief copy constructor of __hip_bfloat162 */ + __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat162& val) { + __hip_bfloat162_raw hr = val; + x = __hip_bfloat16_raw{hr.x}; + y = __hip_bfloat16_raw{hr.y}; + } + + /*! \brief create __hip_bfloat162 from two __hip_bfloat16 */ + __BF16_HOST_DEVICE__ __hip_bfloat162(const __hip_bfloat16& a, const __hip_bfloat16& b) + : x(a), y(b) {} + + /*! \brief default constructor of __hip_bfloat162 */ + __BF16_HOST_DEVICE__ __hip_bfloat162() = default; + + /*! \brief return a __hip_bfloat162_raw */ + __BF16_HOST_DEVICE__ operator __hip_bfloat162_raw() const { + __hip_bfloat16_raw l = x; + __hip_bfloat16_raw r = y; + return __hip_bfloat162_raw{l.x, r.x}; + } -/*! \brief Struct to represent two 16 bit brain floating point numbers. */ -struct __hip_bfloat162 { - __hip_bfloat16 x; - __hip_bfloat16 y; + /*! 
\brief return a float2 */ + __BF16_HOST_DEVICE__ operator float2() const { +#if HIP_BF16_AVX512_OP + union { + __hip_bfloat162_raw raw2; + __bf16 bf162[2]; + static_assert(sizeof(__bf16[2]) == sizeof(__hip_bfloat162_raw)); + } u; + u.raw2 = *this; + __m128bh pbf16{u.bf162[0], u.bf162[1], 0, 0}; + __m128 pf32 = _mm_cvtpbh_ps(pbf16); + float2 ret(pf32[0], pf32[1]); +#else + float2 ret(x, y); +#endif + return ret; + } + + /*! \brief assign value from __hip_bfloat162_raw */ + __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162_raw& h2r) { + x = __hip_bfloat16(__hip_bfloat16_raw{h2r.x}); + y = __hip_bfloat16(__hip_bfloat16_raw{h2r.y}); + return *this; + } + + /*! \brief assign value from __hip_bfloat162 */ + __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __hip_bfloat162& src) { + __hip_bfloat162_raw hr = src; + x = __hip_bfloat16(__hip_bfloat16_raw{hr.x}); + y = __hip_bfloat16(__hip_bfloat16_raw{hr.y}); + return *this; + } }; +/**@}*/ /** * \ingroup HIP_INTRINSIC_BFLOAT16_CONV * \brief Converts bfloat16 to float */ -__HOST_DEVICE__ inline float __bfloat162float(__hip_bfloat16 a) { - unsigned int uval = 0; - uval = a.data << 16; - union { - unsigned int u32; - float fp32; - } u = {uval}; - return u.fp32; +__BF16_HOST_DEVICE_STATIC__ float __bfloat162float(__hip_bfloat16 a) { + float ret = a; + return ret; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_CONV * \brief Converts float to bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __float2bfloat16(float f) { - __hip_bfloat16 ret; - union { - float fp32; - unsigned int u32; - } u = {f}; - if (~u.u32 & 0x7f800000) { - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). 
- // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - u.u32 += 0x7fff + ((u.u32 >> 16) & 1); // Round to nearest, round to even - } else if (u.u32 & 0xffff) { - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bloat16's mantissa bits are all 0. 
- u.u32 |= 0x10000; // Preserve signaling NaN - } - - ret.data = (u.u32 >> 16); +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __float2bfloat16(float f) { + __hip_bfloat16 ret{f}; return ret; } @@ -191,43 +508,51 @@ __HOST_DEVICE__ __hip_bfloat16 __float2bfloat16(float f) { * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Converts and moves bfloat162 to float2 */ -__HOST_DEVICE__ float2 __bfloat1622float2(const __hip_bfloat162 a) { - return float2{__bfloat162float(a.x), __bfloat162float(a.y)}; +__BF16_HOST_DEVICE_STATIC__ float2 __bfloat1622float2(const __hip_bfloat162 a) { + float2 ret = a; + return ret; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Moves bfloat16 value to bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a) { - return __hip_bfloat162{a, a}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __bfloat162bfloat162(const __hip_bfloat16 a) { + return __hip_bfloat162(a, a); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets bits in a __hip_bfloat16 as a signed short integer */ -__HOST_DEVICE__ short int __bfloat16_as_short(const __hip_bfloat16 h) { return (short)h.data; } +__BF16_HOST_DEVICE_STATIC__ short int __bfloat16_as_short(const __hip_bfloat16 h) { + short ret = h; + return ret; +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets bits in a __hip_bfloat16 as an unsigned signed short integer */ -__HOST_DEVICE__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h) { return h.data; } +__BF16_HOST_DEVICE_STATIC__ unsigned short int __bfloat16_as_ushort(const __hip_bfloat16 h) { + unsigned short ret = h; + return ret; +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Convert double to __hip_bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __double2bfloat16(const double a) { - return __float2bfloat16((float)a); +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __double2bfloat16(const double a) { + __hip_bfloat16 ret{a}; + return ret; } /** * \ingroup 
HIP_INTRINSIC_BFLOAT162_CONV * \brief Convert float2 to __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) { return __hip_bfloat162{__float2bfloat16(a.x), __float2bfloat16(a.y)}; } @@ -235,97 +560,117 @@ __HOST_DEVICE__ __hip_bfloat162 __float22bfloat162_rn(const float2 a) { * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Combine two __hip_bfloat16 to __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, const __hip_bfloat16 b) { - return __hip_bfloat162{a, b}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __halves2bfloat162(const __hip_bfloat16 a, + const __hip_bfloat16 b) { + return __hip_bfloat162(a, b); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns high 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a) { return a.y; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __high2bfloat16(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat16(__hip_bfloat16_raw{hr.y}); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns high 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a) { - return __hip_bfloat162{a.y, a.y}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __high2bfloat162(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat162(__hip_bfloat16_raw{hr.y}, __hip_bfloat16_raw{hr.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Converts high 16 bits of __hip_bfloat162 to float and returns the result */ -__HOST_DEVICE__ float __high2float(const __hip_bfloat162 a) { return __bfloat162float(a.y); } +__BF16_HOST_DEVICE_STATIC__ float __high2float(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __bfloat162float(__hip_bfloat16(__hip_bfloat16_raw{hr.y})); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief 
Extracts high 16 bits from each and combines them */ -__HOST_DEVICE__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, - const __hip_bfloat162 b) { - return __hip_bfloat162{a.y, b.y}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __highs2bfloat162(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hip_bfloat162_raw{hr_a.y, hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns low 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a) { return a.x; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __low2bfloat16(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat16(hr.x); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Returns low 16 bits of __hip_bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a) { - return __hip_bfloat162{a.x, a.x}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __low2bfloat162(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat162(hr.x, hr.x); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Converts low 16 bits of __hip_bfloat162 to float and returns the result */ -__HOST_DEVICE__ float __low2float(const __hip_bfloat162 a) { return __bfloat162float(a.x); } +__BF16_HOST_DEVICE_STATIC__ float __low2float(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __bfloat162float(__hip_bfloat16(__hip_bfloat16_raw{hr.x})); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Swaps both halves */ -__HOST_DEVICE__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a) { - return __hip_bfloat162{a.y, a.x}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lowhigh2highlow(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr = a; + return __hip_bfloat162(__hip_bfloat162_raw{hr.y, hr.x}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Extracts low 16 bits from each and 
combines them */ -__HOST_DEVICE__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{a.x, b.x}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __lows2bfloat162(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hip_bfloat162_raw{hr_a.x, hr_b.x}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets short int into a bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __short_as_bfloat16(const short int a) { - return __hip_bfloat16{(unsigned short)a}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __short_as_bfloat16(const short int a) { + return __hip_bfloat16(a); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_CONV * \brief Reinterprets unsigned short int into a bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a) { - return __hip_bfloat16{a}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __ushort_as_bfloat16(const unsigned short int a) { + return __hip_bfloat16(a); } - /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Adds two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b)); } @@ -333,7 +678,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hadd(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Subtracts two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) - __bfloat162float(b)); } @@ -341,7 +686,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hsub(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * 
\brief Divides two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) / __bfloat162float(b)); } @@ -349,8 +694,8 @@ __HOST_DEVICE__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Performs FMA of given bfloat16 values */ -__device__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, - const __hip_bfloat16 c) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, + const __hip_bfloat16 c) { return __float2bfloat16( __ocml_fma_f32(__bfloat162float(a), __bfloat162float(b), __bfloat162float(c))); } @@ -359,7 +704,7 @@ __device__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip_bfloat16 b, * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Multiplies two bfloat16 values */ -__HOST_DEVICE__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b)); } @@ -367,85 +712,110 @@ __HOST_DEVICE__ __hip_bfloat16 __hmul(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Negate a bfloat16 value */ -__HOST_DEVICE__ __hip_bfloat16 __hneg(const __hip_bfloat16 a) { - auto ret = a; - ret.data ^= 0x8000; - return ret; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hneg(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + hr.x ^= 0x8000; + return __hip_bfloat16(hr); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Returns absolute of a bfloat16 */ -__HOST_DEVICE__ __hip_bfloat16 __habs(const __hip_bfloat16 a) { - auto ret = a; - ret.data &= 0x7FFF; - return ret; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __habs(const __hip_bfloat16 
a) { + __hip_bfloat16_raw hr = a; + hr.x &= 0x7FFF; + return __hip_bfloat16(hr); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Divides bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__float2bfloat16(__bfloat162float(a.x) / __bfloat162float(b.x)), - __float2bfloat16(__bfloat162float(a.y) / __bfloat162float(b.y))}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __h2div(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__float2bfloat16(__bfloat162float(__hip_bfloat16_raw{hr_a.x}) / + __bfloat162float(__hip_bfloat16_raw{hr_b.x})), + __float2bfloat16(__bfloat162float(__hip_bfloat16_raw{hr_a.y}) / + __bfloat162float(__hip_bfloat16_raw{hr_b.y}))); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Returns absolute of a bfloat162 */ -__HOST_DEVICE__ __hip_bfloat162 __habs2(const __hip_bfloat162 a) { - return __hip_bfloat162{__habs(a.x), __habs(a.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __habs2(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr_a = a; + return __hip_bfloat162(__habs(__hip_bfloat16_raw{hr_a.x}), __habs(__hip_bfloat16_raw{hr_a.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Adds two bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hadd(a.x, b.x), __hadd(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hadd(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hadd(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Performs FMA of given bfloat162 values */ -__device__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const 
__hip_bfloat162 b, - const __hip_bfloat162 c) { - return __hip_bfloat162{__hfma(a.x, b.x, c.x), __hfma(a.y, b.y, c.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __hip_bfloat162 b, + const __hip_bfloat162 c) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + __hip_bfloat162_raw hr_c = c; + return __hip_bfloat162( + __hfma(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}, __hip_bfloat16_raw{hr_c.x}), + __hfma(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}, __hip_bfloat16_raw{hr_c.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Multiplies two bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hmul(a.x, b.x), __hmul(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmul2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hmul(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hmul(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Converts a bfloat162 into negative */ -__HOST_DEVICE__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a) { - return __hip_bfloat162{__hneg(a.x), __hneg(a.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hneg2(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr_a = a; + return __hip_bfloat162(__hneg(__hip_bfloat16_raw{hr_a.x}), __hneg(__hip_bfloat16_raw{hr_a.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Subtracts two bfloat162 values */ -__HOST_DEVICE__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hsub(a.x, b.x), __hsub(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hsub2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return 
__hip_bfloat162(__hsub(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hsub(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to multiply two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator*(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator*(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hmul(l, r); } @@ -453,7 +823,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator*(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to multiply-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hmul(l, r); return l; } @@ -462,13 +832,14 @@ __HOST_DEVICE__ __hip_bfloat16& operator*=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to unary+ on a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator+(const __hip_bfloat16& l) { return l; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l) { return l; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to add two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator+(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator+(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hadd(l, r); } @@ -476,13 +847,14 @@ __HOST_DEVICE__ __hip_bfloat16 operator+(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to negate a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator-(const __hip_bfloat16& l) { return __hneg(l); } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l) { return __hneg(l); } /** 
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to subtract two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator-(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator-(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hsub(l, r); } @@ -490,7 +862,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator-(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to post increment a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) { auto ret = l; l = __hadd(l, HIPRT_ONE_BF16); return ret; @@ -500,7 +872,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator++(__hip_bfloat16& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to pre increment a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16& operator++(__hip_bfloat16& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator++(__hip_bfloat16& l) { l = __hadd(l, HIPRT_ONE_BF16); return l; } @@ -509,7 +881,7 @@ __HOST_DEVICE__ __hip_bfloat16& operator++(__hip_bfloat16& l) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to post decrement a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) { auto ret = l; l = __hsub(l, HIPRT_ONE_BF16); return ret; @@ -519,7 +891,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator--(__hip_bfloat16& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to pre decrement a __hip_bfloat16 number */ -__HOST_DEVICE__ __hip_bfloat16& operator--(__hip_bfloat16& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator--(__hip_bfloat16& l) { l = __hsub(l, HIPRT_ONE_BF16); return l; } @@ -528,7 +900,7 @@ __HOST_DEVICE__ __hip_bfloat16& operator--(__hip_bfloat16& l) { 
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to add-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hadd(l, r); return l; } @@ -537,7 +909,7 @@ __HOST_DEVICE__ __hip_bfloat16& operator+=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to subtract-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hsub(l, r); return l; } @@ -546,7 +918,8 @@ __HOST_DEVICE__ __hip_bfloat16& operator-=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to divide two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16 operator/(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 operator/(const __hip_bfloat16& l, + const __hip_bfloat16& r) { return __hdiv(l, r); } @@ -554,7 +927,7 @@ __HOST_DEVICE__ __hip_bfloat16 operator/(const __hip_bfloat16& l, const __hip_bf * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Operator to divide-assign two __hip_bfloat16 numbers */ -__HOST_DEVICE__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat16& r) { l = __hdiv(l, r); return l; } @@ -563,7 +936,8 @@ __HOST_DEVICE__ __hip_bfloat16& operator/=(__hip_bfloat16& l, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to multiply two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator*(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator*(const __hip_bfloat162& l, + const 
__hip_bfloat162& r) { return __hmul2(l, r); } @@ -571,7 +945,8 @@ __HOST_DEVICE__ __hip_bfloat162 operator*(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to multiply-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator*=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator*=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __hmul2(l, r); return l; } @@ -580,13 +955,14 @@ __HOST_DEVICE__ __hip_bfloat162& operator*=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to unary+ on a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator+(const __hip_bfloat162& l) { return l; } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l) { return l; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to add two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator+(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator+(const __hip_bfloat162& l, + const __hip_bfloat162& r) { return __hadd2(l, r); } @@ -594,13 +970,16 @@ __HOST_DEVICE__ __hip_bfloat162 operator+(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to negate a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator-(const __hip_bfloat162& l) { return __hneg2(l); } +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l) { + return __hneg2(l); +} /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to subtract two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator-(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator-(const __hip_bfloat162& l, + const __hip_bfloat162& r) { return __hsub2(l, r); } @@ -608,7 +987,7 @@ __HOST_DEVICE__ __hip_bfloat162 
operator-(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to post increment a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) { auto ret = l; l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return ret; @@ -618,7 +997,7 @@ __HOST_DEVICE__ __hip_bfloat162 operator++(__hip_bfloat162& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to pre increment a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162& operator++(__hip_bfloat162& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator++(__hip_bfloat162& l) { l = __hadd2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return l; } @@ -627,7 +1006,7 @@ __HOST_DEVICE__ __hip_bfloat162& operator++(__hip_bfloat162& l) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to post decrement a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) { auto ret = l; l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return ret; @@ -637,7 +1016,7 @@ __HOST_DEVICE__ __hip_bfloat162 operator--(__hip_bfloat162& l, const int) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to pre decrement a __hip_bfloat162 number */ -__HOST_DEVICE__ __hip_bfloat162& operator--(__hip_bfloat162& l) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator--(__hip_bfloat162& l) { l = __hsub2(l, {HIPRT_ONE_BF16, HIPRT_ONE_BF16}); return l; } @@ -646,7 +1025,8 @@ __HOST_DEVICE__ __hip_bfloat162& operator--(__hip_bfloat162& l) { * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to add-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator+=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& 
operator+=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __hadd2(l, r); return l; } @@ -655,7 +1035,8 @@ __HOST_DEVICE__ __hip_bfloat162& operator+=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to subtract-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator-=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator-=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __hsub2(l, r); return l; } @@ -664,7 +1045,8 @@ __HOST_DEVICE__ __hip_bfloat162& operator-=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to divide two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162 operator/(const __hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 operator/(const __hip_bfloat162& l, + const __hip_bfloat162& r) { return __h2div(l, r); } @@ -672,7 +1054,8 @@ __HOST_DEVICE__ __hip_bfloat162 operator/(const __hip_bfloat162& l, const __hip_ * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Operator to divide-assign two __hip_bfloat162 numbers */ -__HOST_DEVICE__ __hip_bfloat162& operator/=(__hip_bfloat162& l, const __hip_bfloat162& r) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162& operator/=(__hip_bfloat162& l, + const __hip_bfloat162& r) { l = __h2div(l, r); return l; } @@ -681,7 +1064,7 @@ __HOST_DEVICE__ __hip_bfloat162& operator/=(__hip_bfloat162& l, const __hip_bflo * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values */ -__HOST_DEVICE__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) == __bfloat162float(b); } @@ -689,7 +1072,7 @@ __HOST_DEVICE__ bool __heq(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered 
equal */ -__HOST_DEVICE__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) < __bfloat162float(b)) && !(__bfloat162float(a) > __bfloat162float(b)); } @@ -698,7 +1081,7 @@ __HOST_DEVICE__ bool __hequ(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - greater than */ -__HOST_DEVICE__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) > __bfloat162float(b); } @@ -706,7 +1089,7 @@ __HOST_DEVICE__ bool __hgt(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered greater than */ -__HOST_DEVICE__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) <= __bfloat162float(b)); } @@ -714,7 +1097,7 @@ __HOST_DEVICE__ bool __hgtu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - greater than equal */ -__HOST_DEVICE__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) >= __bfloat162float(b); } @@ -722,7 +1105,7 @@ __HOST_DEVICE__ bool __hge(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered greater than equal */ -__HOST_DEVICE__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) < __bfloat162float(b)); } @@ -730,7 +1113,7 
@@ __HOST_DEVICE__ bool __hgeu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - not equal */ -__HOST_DEVICE__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) != __bfloat162float(b); } @@ -738,7 +1121,7 @@ __HOST_DEVICE__ bool __hne(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered not equal */ -__HOST_DEVICE__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) == __bfloat162float(b)); } @@ -746,7 +1129,7 @@ __HOST_DEVICE__ bool __hneu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - return max */ -__HOST_DEVICE__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat16 b) { #if __HIP_DEVICE_COMPILE__ return __float2bfloat16(__ocml_fmax_f32(__bfloat162float(a), __bfloat162float(b))); #else @@ -758,7 +1141,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hmax(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - return min */ -__HOST_DEVICE__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat16 b) { #if __HIP_DEVICE_COMPILE__ return __float2bfloat16(__ocml_fmin_f32(__bfloat162float(a), __bfloat162float(b))); #else @@ -770,7 +1153,7 @@ __HOST_DEVICE__ __hip_bfloat16 __hmin(const __hip_bfloat16 a, const __hip_bfloat * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values 
- less than operator */ -__HOST_DEVICE__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) < __bfloat162float(b); } @@ -778,7 +1161,7 @@ __HOST_DEVICE__ bool __hlt(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered less than */ -__HOST_DEVICE__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) >= __bfloat162float(b)); } @@ -786,7 +1169,7 @@ __HOST_DEVICE__ bool __hltu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - less than equal */ -__HOST_DEVICE__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) { return __bfloat162float(a) <= __bfloat162float(b); } @@ -794,7 +1177,7 @@ __HOST_DEVICE__ bool __hle(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Compare two bfloat162 values - unordered less than equal */ -__HOST_DEVICE__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) { +__BF16_HOST_DEVICE_STATIC__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) { return !(__bfloat162float(a) > __bfloat162float(b)); } @@ -802,208 +1185,282 @@ __HOST_DEVICE__ bool __hleu(const __hip_bfloat16 a, const __hip_bfloat16 b) { * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Checks if number is inf */ -__HOST_DEVICE__ int __hisinf(const __hip_bfloat16 a) { - unsigned short sign = a.data & 0x8000U; -#if __HIP_DEVICE_COMPILE__ - int res = __ocml_isinf_f32(__bfloat162float(a)); -#else - int res = std::isinf(__bfloat162float(a)) ? 1 : 0; -#endif - return (res == 0) ? res : ((sign != 0U) ? 
-res : res); +__BF16_HOST_DEVICE_STATIC__ int __hisinf(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + return !(~hr.x & 0x7f80) && !(hr.x & 0x7f); } /** * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Checks if number is nan */ -__HOST_DEVICE__ bool __hisnan(const __hip_bfloat16 a) { -#if __HIP_DEVICE_COMPILE__ - return __ocml_isnan_f32(__bfloat162float(a)); -#else - return std::isnan(__bfloat162float(a)); -#endif +__BF16_HOST_DEVICE_STATIC__ bool __hisnan(const __hip_bfloat16 a) { + __hip_bfloat16_raw hr = a; + return !(~hr.x & 0x7f80) && +(hr.x & 0x7f); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Checks if two numbers are equal */ -__HOST_DEVICE__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __heq(a.x, b.x) && __heq(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbeq2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __heq(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __heq(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Checks if two numbers are equal - unordered */ -__HOST_DEVICE__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hequ(a.x, b.x) && __hequ(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbequ2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hequ(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hequ(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a >= b */ -__HOST_DEVICE__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hge(a.x, b.x) && __hge(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbge2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return 
__hge(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hge(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a >= b - unordered */ -__HOST_DEVICE__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hgeu(a.x, b.x) && __hgeu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbgeu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hgeu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hgeu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a > b */ -__HOST_DEVICE__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hgt(a.x, b.x) && __hgt(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hgt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hgt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a > b - unordered */ -__HOST_DEVICE__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hgtu(a.x, b.x) && __hgtu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbgtu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hgtu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hgtu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a <= b */ -__HOST_DEVICE__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hle(a.x, b.x) && __hle(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hble2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + 
__hip_bfloat162_raw hr_b = b; + return __hle(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hle(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a <= b - unordered */ -__HOST_DEVICE__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hleu(a.x, b.x) && __hleu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbleu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hleu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hleu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a < b */ -__HOST_DEVICE__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hlt(a.x, b.x) && __hlt(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hblt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hlt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hlt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a < b - unordered */ -__HOST_DEVICE__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hltu(a.x, b.x) && __hltu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbltu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hltu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) && + __hltu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a != b */ -__HOST_DEVICE__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hne(a.x, b.x) && __hne(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbne2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + 
__hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hne(__hip_bfloat16(__hip_bfloat16_raw{hr_a.x}), + __hip_bfloat16(__hip_bfloat16_raw{hr_b.x})) && + __hne(__hip_bfloat16(__hip_bfloat16_raw{hr_a.y}), __hip_bfloat16(__hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a != b */ -__HOST_DEVICE__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hneu(a.x, b.x) && __hneu(a.y, b.y); +__BF16_HOST_DEVICE_STATIC__ bool __hbneu2(const __hip_bfloat162 a, const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hneu(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) || + __hneu(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a != b, returns 1.0 if equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__heq(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__heq(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __heq2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__heq(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__heq(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a >= b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hge(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hge(a.y, b.y) ? 
HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hge2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hge(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hge(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a > b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hgt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hgt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hgt2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hgt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hgt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a is NaN, returns 1.0 if NaN, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a) { - return __hip_bfloat162{{__hisnan(a.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hisnan(a.y) ? HIPRT_ONE_BF16 : HIPRT_ONE_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hisnan2(const __hip_bfloat162 a) { + __hip_bfloat162_raw hr_a = a; + return __hip_bfloat162{{__hisnan(__hip_bfloat16_raw{hr_a.x}) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, + {__hisnan(__hip_bfloat16_raw{hr_a.y}) ?
HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a <= b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hle(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hle(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hle2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hle(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hle(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Check for a < b, returns 1.0 if greater than equal, otherwise 0.0 */ -__HOST_DEVICE__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hlt(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hlt(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hlt2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hlt(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hlt(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ?
HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Returns max of two elements */ -__HOST_DEVICE__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hmax(a.x, b.x), __hmax(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmax2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hmax(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hmax(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Returns min of two elements */ -__HOST_DEVICE__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{__hmin(a.x, b.x), __hmin(a.y, b.y)}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hmin2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162(__hmin(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}), + __hmin(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Checks for not equal to */ -__HOST_DEVICE__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, const __hip_bfloat162 b) { - return __hip_bfloat162{{__hne(a.x, b.x) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}, - {__hne(a.y, b.y) ? HIPRT_ONE_BF16 : HIPRT_ZERO_BF16}}; +__BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hne2(const __hip_bfloat162 a, + const __hip_bfloat162 b) { + __hip_bfloat162_raw hr_a = a; + __hip_bfloat162_raw hr_b = b; + return __hip_bfloat162{ + {__hne(__hip_bfloat16_raw{hr_a.x}, __hip_bfloat16_raw{hr_b.x}) ? HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}, + {__hne(__hip_bfloat16_raw{hr_a.y}, __hip_bfloat16_raw{hr_b.y}) ? 
HIPRT_ONE_BF16 + : HIPRT_ZERO_BF16}}; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform an equal compare on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __heq(l, r); } @@ -1011,7 +1468,7 @@ __HOST_DEVICE__ bool operator==(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a not equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hne(l, r); } @@ -1019,7 +1476,7 @@ __HOST_DEVICE__ bool operator!=(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a less than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hlt(l, r); } @@ -1027,7 +1484,7 @@ __HOST_DEVICE__ bool operator<(const __hip_bfloat16& l, const __hip_bfloat16& r) * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a less than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hle(l, r); } @@ -1035,7 +1492,7 @@ __HOST_DEVICE__ bool operator<=(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a greater than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat16& 
l, const __hip_bfloat16& r) { return __hgt(l, r); } @@ -1043,7 +1500,7 @@ __HOST_DEVICE__ bool operator>(const __hip_bfloat16& l, const __hip_bfloat16& r) * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a greater than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r) { +__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r) { return __hge(l, r); } @@ -1051,55 +1508,60 @@ __HOST_DEVICE__ bool operator>=(const __hip_bfloat16& l, const __hip_bfloat16& r * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform an equal compare on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator==(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __heq(l.x, r.x) && __heq(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator==(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 ret = __heq2(l, r); + return ret.x != 0.0f && ret.y != 0.0f; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a not equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator!=(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hne(l.x, r.x) || __hne(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator!=(const __hip_bfloat162& l, const __hip_bfloat162& r) { + return !(l == r); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a less than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hlt(l.x, r.x) && __hlt(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator<(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x < fr.x && fl.y < fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a less than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator<=(const __hip_bfloat162& l, const __hip_bfloat162& r)
{ - return __hle(l.x, r.x) && __hle(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator<=(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x <= fr.x && fl.y <= fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT162_COMP * \brief Operator to perform a greater than on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hgt(l.x, r.x) && __hgt(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator>(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x > fr.x && fl.y > fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_COMP * \brief Operator to perform a greater than equal on two __hip_bfloat16 numbers */ -__HOST_DEVICE__ bool operator>=(const __hip_bfloat162& l, const __hip_bfloat162& r) { - return __hge(l.x, r.x) && __hge(l.y, r.y); +__BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hip_bfloat162& r) { + float2 fl = l, fr = r; + return fl.x >= fr.x && fl.y >= fr.y; } /** * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate ceil of bfloat16 */ -__device__ __hip_bfloat16 hceil(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hceil(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_ceil_f32(__bfloat162float(h))); } @@ -1107,7 +1569,7 @@ __device__ __hip_bfloat16 hceil(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate cosine of bfloat16 */ -__device__ __hip_bfloat16 hcos(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hcos(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_cos_f32(__bfloat162float(h))); } @@ -1115,7 +1577,7 @@ __device__ __hip_bfloat16 hcos(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate exponential of bfloat16 */ -__device__ __hip_bfloat16 hexp(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp(const __hip_bfloat16 h) { return
__float2bfloat16(__ocml_exp_f32(__bfloat162float(h))); } @@ -1123,7 +1585,7 @@ __device__ __hip_bfloat16 hexp(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate exponential 10 of bfloat16 */ -__device__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_exp10_f32(__bfloat162float(h))); } @@ -1131,7 +1593,7 @@ __device__ __hip_bfloat16 hexp10(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate exponential 2 of bfloat16 */ -__device__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_exp2_f32(__bfloat162float(h))); } @@ -1139,7 +1601,7 @@ __device__ __hip_bfloat16 hexp2(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate floor of bfloat16 */ -__device__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_floor_f32(__bfloat162float(h))); } @@ -1147,7 +1609,7 @@ __device__ __hip_bfloat16 hfloor(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate natural log of bfloat16 */ -__device__ __hip_bfloat16 hlog(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_log_f32(__bfloat162float(h))); } @@ -1155,7 +1617,7 @@ __device__ __hip_bfloat16 hlog(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate log 10 of bfloat16 */ -__device__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_log10_f32(__bfloat162float(h))); } @@ -1163,7 +1625,7 @@ __device__ __hip_bfloat16 hlog10(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate log 
2 of bfloat16 */ -__device__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_log2_f32(__bfloat162float(h))); } @@ -1171,7 +1633,7 @@ __device__ __hip_bfloat16 hlog2(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate reciprocal */ -__device__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) { return __float2bfloat16(1.0f / (__bfloat162float(h))); } @@ -1179,7 +1641,7 @@ __device__ __hip_bfloat16 hrcp(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Round to nearest int */ -__device__ __hip_bfloat16 hrint(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hrint(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_rint_f32(__bfloat162float(h))); } @@ -1187,7 +1649,7 @@ __device__ __hip_bfloat16 hrint(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Reciprocal square root */ -__device__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_rsqrt_f32(__bfloat162float(h))); } @@ -1195,7 +1657,7 @@ __device__ __hip_bfloat16 hrsqrt(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate sin of bfloat16 */ -__device__ __hip_bfloat16 hsin(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hsin(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_sin_f32(__bfloat162float(h))); } @@ -1203,7 +1665,7 @@ __device__ __hip_bfloat16 hsin(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate sqrt of bfloat16 */ -__device__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 hsqrt(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_sqrt_f32(__bfloat162float(h))); } @@ -1211,7 +1673,7 @@ __device__ __hip_bfloat16 
hsqrt(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate truncate of bfloat16 */ -__device__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) { +__BF16_DEVICE_STATIC__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) { return __float2bfloat16(__ocml_trunc_f32(__bfloat162float(h))); } @@ -1219,119 +1681,134 @@ __device__ __hip_bfloat16 htrunc(const __hip_bfloat16 h) { * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate ceil of bfloat162 */ -__device__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h) { - return __hip_bfloat162{hceil(h.x), hceil(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2ceil(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hceil(__hip_bfloat16_raw{hr.x}), hceil(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate cosine of bfloat162 */ -__device__ __hip_bfloat162 h2cos(const __hip_bfloat162 h) { - return __hip_bfloat162{hcos(h.x), hcos(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2cos(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hcos(__hip_bfloat16_raw{hr.x}), hcos(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate exponential of bfloat162 */ -__device__ __hip_bfloat162 h2exp(const __hip_bfloat162 h) { - return __hip_bfloat162{hexp(h.x), hexp(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hexp(__hip_bfloat16_raw{hr.x}), hexp(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate exponential 10 of bfloat162 */ -__device__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h) { - return __hip_bfloat162{hexp10(h.x), hexp10(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp10(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hexp10(__hip_bfloat16_raw{hr.x}), hexp10(__hip_bfloat16_raw{hr.y})); } /** * \ingroup 
HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate exponential 2 of bfloat162 */ -__device__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h) { - return __hip_bfloat162{hexp2(h.x), hexp2(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2exp2(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hexp2(__hip_bfloat16_raw{hr.x}), hexp2(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate floor of bfloat162 */ -__device__ __hip_bfloat162 h2floor(const __hip_bfloat162 h) { - return __hip_bfloat162{hfloor(h.x), hfloor(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2floor(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hfloor(__hip_bfloat16_raw{hr.x}), hfloor(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate natural log of bfloat162 */ -__device__ __hip_bfloat162 h2log(const __hip_bfloat162 h) { - return __hip_bfloat162{hlog(h.x), hlog(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hlog(__hip_bfloat16_raw{hr.x}), hlog(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate log 10 of bfloat162 */ -__device__ __hip_bfloat162 h2log10(const __hip_bfloat162 h) { - return __hip_bfloat162{hlog10(h.x), hlog10(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log10(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hlog10(__hip_bfloat16_raw{hr.x}), hlog10(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate log 2 of bfloat162 */ -__device__ __hip_bfloat162 h2log2(const __hip_bfloat162 h) { - return __hip_bfloat162{hlog2(h.x), hlog2(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2log2(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hlog2(__hip_bfloat16_raw{hr.x}), hlog2(__hip_bfloat16_raw{hr.y})); } /** * \ingroup 
HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate vector reciprocal */ -__device__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h) { - return __hip_bfloat162{hrcp(h.x), hrcp(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rcp(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hrcp(__hip_bfloat16_raw{hr.x}), hrcp(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate vector round to nearest int */ -__device__ __hip_bfloat162 h2rint(const __hip_bfloat162 h) { - return __hip_bfloat162{hrint(h.x), hrint(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rint(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hrint(__hip_bfloat16_raw{hr.x}), hrint(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate vector reciprocal square root */ -__device__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h) { - return __hip_bfloat162{hrsqrt(h.x), hrsqrt(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2rsqrt(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hrsqrt(__hip_bfloat16_raw{hr.x}), hrsqrt(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate sin of bfloat162 */ -__device__ __hip_bfloat162 h2sin(const __hip_bfloat162 h) { - return __hip_bfloat162{hsin(h.x), hsin(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sin(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hsin(__hip_bfloat16_raw{hr.x}), hsin(__hip_bfloat16_raw{hr.y})); } /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate sqrt of bfloat162 */ -__device__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h) { - return __hip_bfloat162{hsqrt(h.x), hsqrt(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2sqrt(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(hsqrt(__hip_bfloat16_raw{hr.x}), hsqrt(__hip_bfloat16_raw{hr.y})); } /** * \ingroup 
HIP_INTRINSIC_BFLOAT162_MATH * \brief Calculate truncate of bfloat162 */ -__device__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) { - return __hip_bfloat162{htrunc(h.x), htrunc(h.y)}; +__BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) { + __hip_bfloat162_raw hr = h; + return __hip_bfloat162(htrunc(__hip_bfloat16_raw{hr.x}), htrunc(__hip_bfloat16_raw{hr.y})); } #endif diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_hip_cooperative_groups.h b/third_party/amd/backend/include/hip/amd_detail/amd_hip_cooperative_groups.h index 8b1a0c067db1..c01039a7e1cc 100644 --- a/third_party/amd/backend/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/third_party/amd/backend/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -216,12 +216,18 @@ class thread_block : public thread_group { if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) { __hip_assert(false && "invalid tile size"); } + + auto block_size = size(); + auto rank = thread_rank(); + auto partitions = (block_size + tile_size - 1) / tile_size; + auto tail = (partitions * tile_size) - block_size; + auto partition_size = tile_size - tail * (rank >= (partitions - 1) * tile_size); + thread_group tiledGroup = thread_group(internal::cg_tiled_group, partition_size); - thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size); tiledGroup.coalesced_info.tiled_info.size = tile_size; tiledGroup.coalesced_info.tiled_info.is_tiled = true; - tiledGroup.coalesced_info.tiled_info.meta_group_rank = thread_rank() / tile_size; - tiledGroup.coalesced_info.tiled_info.meta_group_size = (size() + tile_size - 1) / tile_size; + tiledGroup.coalesced_info.tiled_info.meta_group_rank = rank / tile_size; + tiledGroup.coalesced_info.tiled_info.meta_group_size = partitions; return tiledGroup; } diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_hip_fp8.h b/third_party/amd/backend/include/hip/amd_detail/amd_hip_fp8.h new file mode 100644 index 
000000000000..e54c70241701 --- /dev/null +++ b/third_party/amd/backend/include/hip/amd_detail/amd_hip_fp8.h @@ -0,0 +1,1391 @@ +/** + * MIT License + * + * Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +/** + * \file + * \brief amd_hip_fp8.h header, for AMD fp8 data types + */ + +#ifndef _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ +#define _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ + +#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && __HIP_DEVICE_COMPILE__ +#define HIP_FP8_CVT_FAST_PATH 1 +#else +#define HIP_FP8_CVT_FAST_PATH 0 +#endif + +#if !defined(__HIPCC_RTC__) +#include +#include + +#include "host_defines.h" // __hip_internal:: +#include "amd_hip_vector_types.h" // float2 etc +#include "amd_hip_fp16.h" // __half_raw +#include "amd_hip_bf16.h" // bf16 +#include "math_fwd.h" // ocml device functions +#endif // !defined(__HIPCC_RTC__) + +#if defined(__HIPCC_RTC__) +#define __FP8_HOST_DEVICE__ __device__ +#define __FP8_HOST_DEVICE_STATIC__ __FP8_HOST_DEVICE__ static +#else +#define __FP8_HOST_DEVICE__ __host__ __device__ +#define __FP8_HOST_DEVICE_STATIC__ __FP8_HOST_DEVICE__ static inline +#endif // __HIPCC_RTC__ + +#if !defined(__HIPCC_RTC__) +static_assert(CHAR_BIT == 8, "byte size should be of 8 bits"); +#endif +static_assert(sizeof(unsigned char) == 1); +static_assert(sizeof(unsigned short int) == 2); +static_assert(sizeof(unsigned int) == 4); + +/** + * \brief Describes FP8 interpretation + */ +enum __hip_fp8_interpretation_t { + __HIP_E4M3_FNUZ = 0, /**< Standard FP8 */ + __HIP_E5M2_FNUZ = 1, /**< BF8 */ +}; + +/** + * \brief Describes saturation behavior + */ +enum __hip_saturation_t { + __HIP_NOSAT = 0, /**< No saturation */ + __HIP_SATFINITE = 1, /**< Saturate to finite */ +}; + +/** \typedef __hip_fp8_storage_t + * + * \brief type to store single fp8 number + */ +typedef unsigned char __hip_fp8_storage_t; + + +/** \typedef __hip_fp8x2_storage_t + * + * \brief type to store two fp8 numbers + */ +typedef unsigned short int __hip_fp8x2_storage_t; + + +/** \typedef __hip_fp8x4_storage_t + * + * \brief type to store four fp8 numbers + */ +typedef unsigned int __hip_fp8x4_storage_t; + +namespace internal { +// The conversion function 
is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L39 +// This has been modified to add double types conversion as well +template <typename T, bool negative_zero_nan> +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t cast_to_f8(T _x, int wm, int we, bool clip = false, + bool stoch = false, + unsigned int rng = 0) { + constexpr bool is_half = __hip_internal::is_same<T, _Float16>::value; + constexpr bool is_float = __hip_internal::is_same<T, float>::value; + constexpr bool is_double = __hip_internal::is_same<T, double>::value; + static_assert(is_half || is_float || is_double, "Only half, float and double can be cast to f8"); + + const int mfmt = (sizeof(T) == 8) ? 52 : ((sizeof(T) == 4) ? 23 : 10); + unsigned long long x; + + if (sizeof(T) == 8) + x = reinterpret_cast<unsigned long long&>(_x); + else if (sizeof(T) == 4) + x = reinterpret_cast<unsigned int&>(_x); + else + x = reinterpret_cast<unsigned short int&>(_x); + + + unsigned long long head, mantissa; + int exponent, bias; + unsigned int sign; + + if (sizeof(T) == 8) { + head = x & 0xFFF0000000000000ull; + mantissa = x & 0xFFFFFFFFFFFFFull; + exponent = (head >> 52) & 0x7FF; + sign = head >> 63; + bias = 1023; + } else if (sizeof(T) == 4) { + head = x & 0xFF800000; + mantissa = x & 0x7FFFFF; + exponent = (head >> 23) & 0xFF; + sign = head >> 31; + bias = 127; + } else { + head = x & 0xFC00; + mantissa = x & 0x3FF; + exponent = (head >> 10) & 0x1F; + sign = head >> 15; + bias = 15; + } + + unsigned int signed_inf = (sign << 7) + (((1 << we) - 1) << wm); + + // Deal with inf and NaNs + if (negative_zero_nan) { + if (sizeof(T) == 8) { + if ((x & 0x7FF0000000000000ull) == 0x7FF0000000000000ull) return 0x80; + } else if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) return 0x80; + } else { + if ((x & 0x7C00) == 0x7C00) return 0x80; + } + } else { + if (sizeof(T) == 8) { + if ((x & 0x7FF0000000000000ull) == 0x7FF0000000000000ull) + return signed_inf + (mantissa != 0 ?
1 : 0); + } else if (sizeof(T) == 4) { + if ((x & 0x7F800000) == 0x7F800000) return signed_inf + (mantissa != 0 ? 1 : 0); + } else { + if ((x & 0x7C00) == 0x7C00) return signed_inf + (mantissa != 0 ? 1 : 0); + } + } + + if (x == 0) { + return 0; + } + + // First need to check if it is normal or denorm as there is a difference of implict 1 + // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift + // The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for + // RNE, no need to add rng. Then probably need to check whether there is carry and adjust + // exponent and mantissa again + + // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits + const int f8_bias = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0); + const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal + // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias) + // f8_exponent is the converted f8 exponent with bias encoding + // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent, + // the difference needs to be adjusted and mantissa shifted + int act_exponent, f8_exponent, exponent_diff; + + if (exponent == 0) { // fp32/fp16 is in denormal. + /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16 +here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal has +exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that there are some numbers in +fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers +where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal. 
In +this case, the fp16 mantissa should be shift left by 1 */ + act_exponent = exponent - bias + 1; + exponent_diff = f8_denormal_act_exponent - + act_exponent; // actual exponent is exponent-bias+1 as it is denormal + } else { // fp32/fp16 is normal with implicit 1 + act_exponent = exponent - bias; + if (act_exponent <= f8_denormal_act_exponent) { + /* This is the case where fp32/fp16 is normal but it is in f8 denormal range. +For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16 +actual exponent is -7, it is actually larger due to the implict 1, +Therefore it needs to be adjust to -6 and mantissa shift right by 1. +So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */ + exponent_diff = f8_denormal_act_exponent - act_exponent; + } else { // both fp32/fp16 and f8 are in normal range + exponent_diff = 0; // exponent_diff=0 does not mean there is no difference for this case, + // act_exponent could be larger. Just that it does not need shift mantissa + } + mantissa += (1ull << mfmt); // Add the implicit 1 into mantissa + } + + bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) == + (1ull << (mfmt - wm + exponent_diff - 1)); + /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we shift +right as shift right could rip off some residual part and make something not midpoint look like +midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than midpoint, but +after shift right by 4 bits, it would look like midpoint. +*/ + + if (exponent_diff > 0) + mantissa >>= exponent_diff; + else if (exponent_diff == -1) + mantissa <<= -exponent_diff; + bool implicit_one = mantissa & (1ull << mfmt); + // if there is no implict 1, it means the f8 is denormal and need to adjust to denorm exponent + f8_exponent = + (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 
0 : 1); + + // Now we have the exponent and mantissa adjusted + unsigned long long drop_mask = (1ull << (mfmt - wm)) - 1; + bool odd = + mantissa & (1ull << (mfmt - wm)); // if the least significant bit that is not truncated is 1 + mantissa += + (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1ull) : mantissa)) & drop_mask; + + // Now we deal with overflow + if (f8_exponent == 0) { + if ((1ull << mfmt) & mantissa) { + f8_exponent = 1; // denormal overflow to become normal, promote exponent + } + } else { + if ((1ull << (mfmt + 1)) & mantissa) { + mantissa >>= 1; + f8_exponent++; + } + } + + mantissa >>= (mfmt - wm); + + // above range: quantize to maximum possible float of the same sign + const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); + if (f8_exponent > max_exp) { + if (clip) { + mantissa = (1 << wm) - 1; + f8_exponent = max_exp; + } else { + return signed_inf; + } + } + + if (f8_exponent == 0 && mantissa == 0) return negative_zero_nan ? 0 : (sign << 7); + mantissa &= (1 << wm) - 1; + return (sign << 7) | (f8_exponent << wm) | mantissa; +} + +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220 +// This has been modified to handle double types as well +template +__FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we) { + constexpr bool is_half = __hip_internal::is_same::value; + constexpr bool is_float = __hip_internal::is_same::value; + constexpr bool is_double = __hip_internal::is_same::value; + static_assert(is_half || is_float || is_double, "only half, float and double are supported"); + + constexpr int weo = is_half ? 5 : (is_float ? 8 : 11); + constexpr int wmo = is_half ? 10 : (is_float ? 
23 : 52); + + T fInf, fNegInf, fNaN, fNeg0; + if (is_half) { + const unsigned short int ihInf = 0x7C00; + const unsigned short int ihNegInf = 0xFC00; + const unsigned short int ihNaN = 0x7C01; + const unsigned short int ihNeg0 = 0x8000; + fInf = reinterpret_cast(ihInf); + fNegInf = reinterpret_cast(ihNegInf); + fNaN = reinterpret_cast(ihNaN); + fNeg0 = reinterpret_cast(ihNeg0); + } else if (is_float) { + const unsigned int ifInf = 0x7F800000; + const unsigned int ifNegInf = 0xFF800000; + const unsigned int ifNaN = 0x7F800001; + const unsigned int ifNeg0 = 0x80000000; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } else if (is_double) { + const unsigned long long ifInf = 0x7FF0000000000000ull; + const unsigned long long ifNegInf = 0xFFF0000000000000ull; + const unsigned long long ifNaN = 0x7FF0000000000001ull; + const unsigned long long ifNeg0 = 0x8000000000000000ull; + fInf = reinterpret_cast(ifInf); + fNegInf = reinterpret_cast(ifNegInf); + fNaN = reinterpret_cast(ifNaN); + fNeg0 = reinterpret_cast(ifNeg0); + } + + if (x == 0) { + return 0; + } + + unsigned long long sign = x >> 7; + unsigned long long mantissa = x & ((1 << wm) - 1); + int exponent = (x & 0x7F) >> wm; + if (negative_zero_nan) { + if (x == 0x80) return fNaN; + } else { + if (x == 0x80) return fNeg0; + if (exponent == ((1 << we) - 1)) return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; + } + + typename __hip_internal::conditional< + sizeof(T) == 2, unsigned short int, + typename __hip_internal::conditional::type>::type retval; + + if (we == 5 && is_half && !negative_zero_nan) { + retval = x << 8; + return reinterpret_cast(retval); + } + + const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 
1 : 0); + + // subnormal input + if (exponent == 0) { +#if __HIP_DEVICE_COMPILE__ + // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above + int sh = 1 + __clz(mantissa) - (32 - wm); +#else + int sh = 1 + __builtin_clz(mantissa) - (32 - wm); +#endif + mantissa <<= sh; + exponent += 1 - sh; + mantissa &= ((1ull << wm) - 1); + } + exponent += exp_low_cutoff - 1; + mantissa <<= wmo - wm; + + // subnormal output (occurs when T=half, we=5, negative_zero_nan=true) + if (exponent <= 0) { + mantissa |= 1 << wmo; + mantissa >>= 1 - exponent; + exponent = 0; + } + + if (sizeof(T) == 2) + retval = (sign << 15) | (exponent << 10) | mantissa; + else if (sizeof(T) == 4) + retval = (sign << 31) | (exponent << 23) | mantissa; + else + retval = (sign << 63) | (static_cast(exponent) << 52) | mantissa; + return reinterpret_cast(retval); +} + +#if HIP_FP8_CVT_FAST_PATH +// The conversion function is from rocblas +// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_float8.h#L79 +template +static __device__ __hip_fp8_storage_t cast_to_f8_from_f32(float v, bool saturate, + __hip_fp8_interpretation_t interpret, + unsigned int rng = 0) { + __hip_fp8_storage_t i8data; + union { + float fval; + unsigned int i32val; + unsigned char i8val[4]; // NOTE: not endian independent + } val; + + unsigned int ival = 0; + val.fval = v; + + if (saturate) { + if (interpret == __HIP_E4M3_FNUZ) { + if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0); + } + } else { + if ((val.i32val & 0x7F800000) != 0x7F800000) { /// propagate NAN/INF, no clipping + val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0); + } + } + } + + if (stochastic_rounding) { + ival = interpret == __HIP_E4M3_FNUZ + ? 
__builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
+               : __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0);  // 0 pos
+    val.i32val = ival;
+    i8data = val.i8val[0];  // little endian
+  } else {  // RNE CVT
+    ival = interpret == __HIP_E4M3_FNUZ
+               ? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
+               : __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false);  // false -> WORD0
+    val.i32val = ival;
+    i8data = val.i8val[0];
+  }
+  return i8data;
+}
+
+/**
+ * \brief convert a float2 to two packed fp8 values with the pack-convert builtins
+ *
+ * When \p saturate is set, each finite lane is clamped before conversion to the same
+ * bounds cast_to_f8_from_f32 uses: +/-240 for __HIP_E4M3_FNUZ and +/-57344 otherwise.
+ * Lanes whose float exponent bits are all set (NaN/Inf) are passed through unclamped.
+ *
+ * \param v two float lanes to convert
+ * \param saturate clamp finite lanes to the representable range of the target format
+ * \param interpret selects the fp8 builtin (__HIP_E4M3_FNUZ) or the bf8 builtin (otherwise)
+ * \return first 16-bit word (i16val[0]) of the builtin's 32-bit result
+ */
+static __device__ __hip_fp8x2_storage_t
+cast_to_f8x2_from_f32x2(float2 v, bool saturate, __hip_fp8_interpretation_t interpret) {
+  union {
+    static_assert(sizeof(float2) == sizeof(unsigned int[2]));
+    static_assert(sizeof(float2) == sizeof(unsigned short[4]));
+    float2 fval;
+    unsigned int i32val[2];
+    unsigned short i16val[4];
+  } f2val;
+
+  f2val.fval = v;
+
+  if (saturate) {  /// propagate NAN/INF, no clipping
+    // The bound depends on the target format; the bf8 path must not reuse the E4M3 limit.
+    const float max_mag = interpret == __HIP_E4M3_FNUZ ? 240.0f : 57344.0f;
+    if ((f2val.i32val[0] & 0x7F800000) != 0x7F800000) {
+      f2val.fval.x = __builtin_amdgcn_fmed3f(f2val.fval.x, max_mag, -max_mag);
+    }
+    if ((f2val.i32val[1] & 0x7F800000) != 0x7F800000) {
+      // Clamp the y lane from y itself (it was previously overwritten with clamped x).
+      f2val.fval.y = __builtin_amdgcn_fmed3f(f2val.fval.y, max_mag, -max_mag);
+    }
+  }
+
+  // Convert the (possibly clamped) lanes, not the untouched input, so that saturation
+  // actually takes effect. Copy them out first because the result overwrites the union.
+  const float lane_x = f2val.fval.x;
+  const float lane_y = f2val.fval.y;
+  f2val.i32val[0] = interpret == __HIP_E4M3_FNUZ
+                        ? __builtin_amdgcn_cvt_pk_fp8_f32(lane_x, lane_y, 0, false)
+                        : __builtin_amdgcn_cvt_pk_bf8_f32(lane_x, lane_y, 0, false);
+
+  return static_cast<__hip_fp8x2_storage_t>(f2val.i16val[0]);
+}
+
+static __device__ float cast_to_f32_from_f8(__hip_fp8_storage_t v,
+                                            __hip_fp8_interpretation_t interpret) {
+  union {
+    unsigned int i32val;
+    unsigned char i8val[4];
+  } val;
+  val.i8val[0] = v;
+
+  float fval = interpret == __HIP_E4M3_FNUZ ? 
__builtin_amdgcn_cvt_f32_fp8(val.i32val, 0) + : __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0); + return fval; +} + +static __device__ float2 cast_to_f32x2_from_f8x2(__hip_fp8x2_storage_t v, + __hip_fp8_interpretation_t interpret) { + union { + unsigned int i32val; + unsigned short i16val[2]; + } val; + val.i16val[0] = v; + + auto f2 = interpret == __HIP_E4M3_FNUZ ? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false) + : __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false); + return float2{f2[0], f2[1]}; +} +#endif // HIP_FP8_CVT_FAST_PATH + +/* For fp8 fnuz types, finite and NaN values are supported. Zero is unsigned. +Inf are not supported. This gives us one additional number to represent. +NaN are represented by 1-0000-000 or 1-00000-00 */ +__FP8_HOST_DEVICE_STATIC__ bool hip_fp8_fnuz_is_nan(__hip_fp8_storage_t a) { + return static_cast(a) == 0x80; +} +} // namespace internal + +/** + * \brief convert float to @p __hip_fp8_storage_t + * + * \param f float number + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t __hip_cvt_float_to_fp8( + const float f, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f8_from_f32(f, sat == __HIP_SATFINITE, type); +#else // HIP_FP8_CVT_FAST_PATH + int we = type == __HIP_E4M3_FNUZ ? 4 : 5; + int wm = type == __HIP_E4M3_FNUZ ? 
3 : 2; + return internal::cast_to_f8(f, wm, we, sat == __HIP_SATFINITE); +#endif // HIP_FP8_CVT_FAST_PATH +} + +/** + * \brief convert float2 to @p __hip_fp8x2_storage_t + * + * \param f2 float2 number + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8x2_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t __hip_cvt_float2_to_fp8x2( + const float2 f2, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f8x2_from_f32x2(f2, sat == __HIP_SATFINITE, type); +#else + return static_cast<__hip_fp8x2_storage_t>( + static_cast(__hip_cvt_float_to_fp8(f2.y, sat, type)) << 8 | + static_cast(__hip_cvt_float_to_fp8(f2.x, sat, type))); +#endif +} + +/** + * \brief convert double to @p __hip_fp8_storage_t + * + * \param d double val + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t __hip_cvt_double_to_fp8( + const double d, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + int we = type == __HIP_E4M3_FNUZ ? 4 : 5; + int wm = type == __HIP_E4M3_FNUZ ? 
3 : 2);
+  return internal::cast_to_f8(d, wm, we, sat == __HIP_SATFINITE);
+}
+
+/**
+ * \brief convert double2 to @p __hip_fp8x2_storage_t
+ *
+ * Each lane is converted independently with __hip_cvt_double_to_fp8; the result of
+ * d2.x is placed in bits 0..7 and the result of d2.y is shifted into bits 8..15.
+ *
+ * \param d2 double2 val
+ * \param sat saturation of fp8
+ * \param type interpretation of fp8
+ * \return __hip_fp8x2_storage_t
+ */
+__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t __hip_cvt_double2_to_fp8x2(
+    const double2 d2, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) {
+  return static_cast<__hip_fp8x2_storage_t>(
+      static_cast(__hip_cvt_double_to_fp8(d2.y, sat, type)) << 8 |
+      static_cast(__hip_cvt_double_to_fp8(d2.x, sat, type)));
+}
+
+/**
+ * \brief convert __hip_bfloat16_raw to @p __hip_fp8_storage_t
+ *
+ * The raw value is turned into a float through a __hip_bfloat16 temporary and then
+ * converted with __hip_cvt_float_to_fp8.
+ *
+ * \param hr __hip_bfloat16_raw val
+ * \param sat saturation of fp8
+ * \param type interpretation of fp8
+ * \return __hip_fp8_storage_t
+ */
+__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t
+__hip_cvt_bfloat16raw_to_fp8(const __hip_bfloat16_raw hr, const __hip_saturation_t sat,
+                             const __hip_fp8_interpretation_t type) {
+  float fval = __hip_bfloat16(hr);
+  return __hip_cvt_float_to_fp8(fval, sat, type);
+}
+
+/**
+ * \brief convert __hip_bfloat162_raw to @p __hip_fp8x2_storage_t
+ *
+ * The raw pair is turned into a float2 through a __hip_bfloat162 temporary and then
+ * converted with __hip_cvt_float2_to_fp8x2.
+ *
+ * \param hr __hip_bfloat162_raw value
+ * \param sat saturation of fp8
+ * \param type interpretation of fp8
+ * \return __hip_fp8x2_storage_t
+ */
+__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t
+__hip_cvt_bfloat16raw2_to_fp8x2(const __hip_bfloat162_raw hr, const __hip_saturation_t sat,
+                                const __hip_fp8_interpretation_t type) {
+  float2 f2 = __hip_bfloat162(hr);
+  return __hip_cvt_float2_to_fp8x2(f2, sat, type);
+}
+
+/**
+ * \brief convert @p __hip_fp8_storage_t to __half_raw
+ *
+ * \param x __hip_fp8_storage_t val
+ * \param type interpretation of fp8
+ * \return __half_raw
+ */
+__FP8_HOST_DEVICE_STATIC__ __half_raw
+__hip_cvt_fp8_to_halfraw(const __hip_fp8_storage_t x, const __hip_fp8_interpretation_t type) {
+  unsigned int we = type == __HIP_E4M3_FNUZ ? 4 : 5;
+  unsigned int wm = type == __HIP_E4M3_FNUZ ? 
3 : 2; + return __half_raw{internal::cast_from_f8<_Float16, true>(x, wm, we)}; +} + +/** + * \brief convert @p __hip_fp8x2_storage_t to __half2_raw + * + * \param x __hip_fp8x2_storage_t val + * \param type interpretation of fp8 + * \return __half2_raw + */ +__FP8_HOST_DEVICE_STATIC__ __half2_raw +__hip_cvt_fp8x2_to_halfraw2(const __hip_fp8x2_storage_t x, const __hip_fp8_interpretation_t type) { + __half2 ret(static_cast<__half>( + __hip_cvt_fp8_to_halfraw(static_cast<__hip_fp8_storage_t>(x & 0xFF), type)), + static_cast<__half>( + __hip_cvt_fp8_to_halfraw(static_cast<__hip_fp8_storage_t>(x >> 8), type))); + return static_cast<__half2_raw>(ret); +} + +/** + * \brief convert __half_raw to @p __hip_fp8_storage_t + * + * \param x __half_raw value + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8_storage_t __hip_cvt_halfraw_to_fp8( + const __half_raw x, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + return __hip_cvt_float_to_fp8(__half2float(__half(x)), sat, type); +} + +/** + * \brief convert __half2_raw to @p __hip_fp8x2_storage_t + * + * \param x __half2_raw value + * \param sat saturation of fp8 + * \param type interpretation of fp8 + * \return __hip_fp8x2_storage_t + */ +__FP8_HOST_DEVICE_STATIC__ __hip_fp8x2_storage_t __hip_cvt_halfraw2_to_fp8x2( + const __half2_raw x, const __hip_saturation_t sat, const __hip_fp8_interpretation_t type) { + return __hip_cvt_float2_to_fp8x2(__half22float2(__half2(x)), sat, type); +} + +/** + * \brief struct representing single fp8 number with e4m3 interpretation + * + */ +struct __hip_fp8_e4m3_fnuz { + __hip_fp8_storage_t __x; //! 
raw storage of fp8 number + constexpr static __hip_saturation_t __default_saturation = __HIP_SATFINITE; + constexpr static __hip_fp8_interpretation_t __default_interpret = __HIP_E4M3_FNUZ; + constexpr static unsigned int __we = 4; + constexpr static unsigned int __wm = 3; + + // TODO: SWDEV-452411 + // Add cast from unsigned long long, long long to fp8 + + /*! create fp8 e4m3 from long */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from int */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from short int */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from unsigned long */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const unsigned long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from unsigned int */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const unsigned int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from unsigned short */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const unsigned short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from double */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const double f) + : __x(__hip_cvt_double_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! create fp8 e4m3 from float */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const float f) + : __x(__hip_cvt_float_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! 
create fp8 e4m3 from __hip_bfloat16 */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const __hip_bfloat16 f) + : __x(__hip_cvt_float_to_fp8(static_cast(f), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e4m3 from __half */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz(const __half f) + : __x(__hip_cvt_halfraw_to_fp8(static_cast<__half_raw>(f), __default_saturation, + __default_interpret)) {} + + /*! default construct fp8 e4m3 */ + __FP8_HOST_DEVICE__ __hip_fp8_e4m3_fnuz() = default; + + /*! convert fp8 e4m3 to __half */ + __FP8_HOST_DEVICE__ operator __half() const { + return __half(__hip_cvt_fp8_to_halfraw(__x, __default_interpret)); + } + + /*! convert fp8 e4m3 to __hip_bfloat16 */ + __FP8_HOST_DEVICE__ operator __hip_bfloat16() const { + float f = *this; + return __hip_bfloat16(f); + } + + /*! convert fp8 e4m3 to bool, return false if value is 0, true otherwise */ + __FP8_HOST_DEVICE__ operator bool() const { + // it can be 0x00 (+0.0) since 0x80 will be nan + return !(static_cast(__x) == 0); + } + + /*! convert fp8 e4m3 to char, clamp number to CHAR_MIN/CHAR_MAX if its out of range */ + __FP8_HOST_DEVICE__ operator char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + auto fval = internal::cast_from_f8(__x, __wm, __we); + auto llval = static_cast(fval); + if (llval <= CHAR_MIN) { + return CHAR_MIN; + } else if (llval >= CHAR_MAX) { + return CHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to double */ + __FP8_HOST_DEVICE__ operator double() const { + return internal::cast_from_f8(__x, __wm, __we); + } + + /*! convert fp8 e4m3 to float */ + __FP8_HOST_DEVICE__ operator float() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32_from_f8(__x, __default_interpret); +#else + return internal::cast_from_f8(__x, __wm, __we); +#endif + } + + /*! 
convert fp8 e4m3 to int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e4m3 to long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e4m3 to long long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e4m3 to short int, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SHRT_MIN) { + return SHRT_MIN; + } else if (llval >= SHRT_MAX) { + return SHRT_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to signed char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator signed char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SCHAR_MIN) { + return SCHAR_MIN; + } else if (llval >= SCHAR_MAX) { + return SCHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to unsigned char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } else if (llval >= UCHAR_MAX) { + return UCHAR_MAX; + } + return static_cast(fval); + } + + /*! 
convert fp8 e4m3 to unsigned int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to unsigned long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to long long int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e4m3 to unsigned short, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } +}; + +/** + * \brief struct representing two fp8 numbers with e4m3 interpretation + * + */ +struct __hip_fp8x2_e4m3_fnuz { + __hip_fp8x2_storage_t __x; //! raw storage of two fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E4M3_FNUZ; + static constexpr unsigned int __we = 4; + static constexpr unsigned int __wm = 3; + + /*! create fp8x2 e4m3 type from double2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const double2 val) + : __x(__hip_cvt_double2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! 
create fp8x2 e4m3 type from float2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const float2 val) + : __x(__hip_cvt_float2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e4m3 type from __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const __hip_bfloat162 val) + : __x(__hip_cvt_bfloat16raw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e4m3 type from __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz(const __half2 val) + : __x(__hip_cvt_halfraw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! Default construct of fp8x2 e4m3 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e4m3_fnuz() = default; + + /*! convert fp8x2 e4m3 to __half2 */ + __FP8_HOST_DEVICE__ operator __half2() const { + return __half2(__hip_cvt_fp8x2_to_halfraw2(__x, __default_interpret)); + } + + /*! convert fp8x2 e4m3 to float2 */ + __FP8_HOST_DEVICE__ operator float2() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret); +#else + return float2(internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x & 0xFF), + __wm, __we), + internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x >> 8), + __wm, __we)); +#endif + } +}; + +/** + * \brief struct representing four fp8 numbers with e4m3 interpretation + * + */ +struct __hip_fp8x4_e4m3_fnuz { + __hip_fp8x4_storage_t __x; //! raw storage of four fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E4M3_FNUZ; + static constexpr unsigned int __we = 4; + static constexpr unsigned int __wm = 3; + + /*! 
create fp8x4 e4m3 type from double4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const double4 val) + : __x{reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_double_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))} {} + + /*! create fp8x4 e4m3 type from float4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const float4 val) + : __x{reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_float_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))} {} + + /*! create fp8x4 e4m3 type from two __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const __hip_bfloat162 low, const __hip_bfloat162 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>(static_cast( + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(high, __default_saturation, __default_interpret)) | + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(low, __default_saturation, __default_interpret)) + << 16))) {} + + /*! 
create fp8x4 e4m3 type from two __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz(const __half2 low, const __half2 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + high, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + low, __default_saturation, __default_interpret)) + << 16))) {} + + /*! Default construct fp8x4 e4m3 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e4m3_fnuz() = default; + + /*! convert fp8x4 e4m3 to float4 */ + __FP8_HOST_DEVICE__ operator float4() const { + auto x = __x; // bypass const + auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E + auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1); +#if HIP_FP8_CVT_FAST_PATH + float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret); + float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret); +#else + float2 high = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_high >> 8), __wm, __we)); + float2 low = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_low << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we)); +#endif + return float4(low.x, low.y, high.x, high.y); + } +}; + +/** + * \brief struct representing one fp8 number with e5m2 interpretation + * + */ +struct __hip_fp8_e5m2_fnuz { + __hip_fp8_storage_t __x; //! raw storage of one fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E5M2_FNUZ; + static constexpr unsigned int __we = 5; + static constexpr unsigned int __wm = 2; + + + // TODO: SWDEV-452411 + // Add cast from unsigned long long, long long to fp8 + + /*! 
create fp8 e5m2 type from long */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from int */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from short int */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from unsigned long */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const unsigned long int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from unsigned int */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const unsigned int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from unsigned short */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const unsigned short int val) + : __x(__hip_cvt_float_to_fp8(static_cast(val), __default_saturation, + __default_interpret)) {} + + /*! create fp8 e5m2 type from double */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const double f) + : __x(__hip_cvt_double_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! create fp8 e5m2 type from float */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const float f) + : __x(__hip_cvt_float_to_fp8(f, __default_saturation, __default_interpret)) {} + + /*! create fp8 e5m2 type from __hip_bfloat16 */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const __hip_bfloat16 f) + : __x(__hip_cvt_float_to_fp8(static_cast(f), __default_saturation, + __default_interpret)) {} + + /*! 
create fp8 e5m2 type from __hip_bfloat16 */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz(const __half f) + : __x(__hip_cvt_halfraw_to_fp8(static_cast<__half_raw>(f), __default_saturation, + __default_interpret)) {} + + /*! default construct fp8 e5m2 */ + __FP8_HOST_DEVICE__ __hip_fp8_e5m2_fnuz() = default; + + /*! convert fp8 e5m2 to float */ + __FP8_HOST_DEVICE__ operator float() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32_from_f8(__x, __default_interpret); +#else + return internal::cast_from_f8(__x, __wm, __we); +#endif + } + + /*! convert fp8 e5m2 to __half */ + __FP8_HOST_DEVICE__ operator __half() const { + return __half(__hip_cvt_fp8_to_halfraw(__x, __default_interpret)); + } + + /*! convert fp8 e5m2 to __hip_bfloat16 */ + __FP8_HOST_DEVICE__ operator __hip_bfloat16() const { + float f = *this; + return __hip_bfloat16(f); + } + + /*! convert fp8 e4m3 to bool, return false if value is 0, true otherwise */ + __FP8_HOST_DEVICE__ operator bool() const { + // it can be 0x00 (+0.0) since 0x80 will be nan + return !(static_cast(__x) == 0); + } + + /*! convert fp8 e5m2 to char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= CHAR_MIN) { + return CHAR_MIN; + } else if (llval >= CHAR_MAX) { + return CHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to double */ + __FP8_HOST_DEVICE__ operator double() const { + return internal::cast_from_f8(__x, __wm, __we); + } + + /*! convert fp8 e5m2 to int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! 
convert fp8 e5m2 to long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e5m2 to long long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + return static_cast(fval); + } + + /*! convert fp8 e5m2 to short, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SHRT_MIN) { + return SHRT_MIN; + } else if (llval >= SHRT_MAX) { + return SHRT_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to signed char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator signed char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= SCHAR_MIN) { + return SCHAR_MIN; + } else if (llval >= SCHAR_MAX) { + return SCHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned char, clamp out of bound values, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned char() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } else if (llval >= UCHAR_MAX) { + return UCHAR_MAX; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned int, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! 
convert fp8 e5m2 to unsigned long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned long long, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned long long int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } + + /*! convert fp8 e5m2 to unsigned short, return 0 if value is NaN */ + __FP8_HOST_DEVICE__ operator unsigned short int() const { + if (internal::hip_fp8_fnuz_is_nan(__x)) { + return 0; + } + + float fval = *this; + auto llval = static_cast(fval); + if (llval <= 0) { + return 0; + } + return static_cast(fval); + } +}; + +/** + * \brief struct representing two fp8 numbers with e5m2 interpretation + * + */ +struct __hip_fp8x2_e5m2_fnuz { + __hip_fp8x2_storage_t __x; //! raw storage of two fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E5M2_FNUZ; + static constexpr unsigned int __we = 5; + static constexpr unsigned int __wm = 2; + + /*! create fp8x2 e5m2 type from double2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const double2 val) + : __x(__hip_cvt_double2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e5m2 type from float2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const float2 val) + : __x(__hip_cvt_float2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! 
create fp8x2 e5m2 type from __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const __hip_bfloat162 val) + : __x(__hip_cvt_bfloat16raw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! create fp8x2 e5m2 type from __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz(const __half2 val) + : __x(__hip_cvt_halfraw2_to_fp8x2(val, __default_saturation, __default_interpret)) {} + + /*! default construct fp8x2 e5m2 */ + __FP8_HOST_DEVICE__ __hip_fp8x2_e5m2_fnuz() = default; + + /*! convert fp8x2 e5m2 to __half2 */ + __FP8_HOST_DEVICE__ operator __half2() const { + return __half2(__hip_cvt_fp8x2_to_halfraw2(__x, __default_interpret)); + } + + /*! convert fp8x2 e5m2 to float2 */ + __FP8_HOST_DEVICE__ operator float2() const { +#if HIP_FP8_CVT_FAST_PATH + return internal::cast_to_f32x2_from_f8x2(__x, __default_interpret); +#else + return float2(internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x & 0xFF), + __wm, __we), + internal::cast_from_f8(static_cast<__hip_fp8_storage_t>(__x >> 8), + __wm, __we)); +#endif + } +}; + +/** + * \brief struct representing four fp8 numbers with e5m2 interpretation + * + */ +struct __hip_fp8x4_e5m2_fnuz { + __hip_fp8x4_storage_t __x; //! raw storage of four fp8 numbers + static constexpr __hip_saturation_t __default_saturation = __HIP_SATFINITE; + static constexpr __hip_fp8_interpretation_t __default_interpret = __HIP_E5M2_FNUZ; + static constexpr unsigned int __we = 5; + static constexpr unsigned int __wm = 2; + + /*! 
create fp8x4 e5m2 type from double4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const double4 val) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_double_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_double_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))) {} + + /*! create fp8x4 e5m2 type from float4 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const float4 val) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_float_to_fp8( + val.x, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.y, __default_saturation, __default_interpret)) + << 8 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.z, __default_saturation, __default_interpret)) + << 16 | + reinterpret_cast(__hip_cvt_float_to_fp8( + val.w, __default_saturation, __default_interpret)) + << 24))) {} + + /*! create fp8x4 e5m2 type from two __hip_bfloat162 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const __hip_bfloat162 low, const __hip_bfloat162 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>(static_cast( + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(high, __default_saturation, __default_interpret)) | + reinterpret_cast( + __hip_cvt_bfloat16raw2_to_fp8x2(low, __default_saturation, __default_interpret)) + << 16))) {} + + /*! 
create fp8x4 e5m2 type from two __half2 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz(const __half2 low, const __half2 high) + : __x(reinterpret_cast<__hip_fp8x4_storage_t>( + static_cast(reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + high, __default_saturation, __default_interpret)) | + reinterpret_cast(__hip_cvt_halfraw2_to_fp8x2( + low, __default_saturation, __default_interpret)) + << 16))) {} + + /* default construct fp8x4 e5m2 */ + __FP8_HOST_DEVICE__ __hip_fp8x4_e5m2_fnuz() = default; + + /*! convert fp8x4 e5m2 to float4 */ + __FP8_HOST_DEVICE__ operator float4() const { + auto x = __x; // bypass const + auto fp8x2_low = *reinterpret_cast<__hip_fp8x2_storage_t*>(&x); // Little E + auto fp8x2_high = *(reinterpret_cast<__hip_fp8x2_storage_t*>(&x) + 1); +#if HIP_FP8_CVT_FAST_PATH + float2 high = internal::cast_to_f32x2_from_f8x2(fp8x2_high, __default_interpret); + float2 low = internal::cast_to_f32x2_from_f8x2(fp8x2_low, __default_interpret); +#else + float2 high = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_high << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_high >> 8), __wm, __we)); + float2 low = float2(internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>((fp8x2_low << 8) >> 8), __wm, __we), + internal::cast_from_f8( + static_cast<__hip_fp8_storage_t>(fp8x2_low >> 8), __wm, __we)); +#endif + return float4(low.x, low.y, high.x, high.y); + } +}; + +#endif // _HIP_INCLUDE_HIP_AMD_DETAIL_HIP_FP8_H_ diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_hip_gl_interop.h b/third_party/amd/backend/include/hip/amd_detail/amd_hip_gl_interop.h index e5b6dc3a359c..740e37a6db47 100644 --- a/third_party/amd/backend/include/hip/amd_detail/amd_hip_gl_interop.h +++ b/third_party/amd/backend/include/hip/amd_detail/amd_hip_gl_interop.h @@ -50,7 +50,7 @@ typedef enum hipGLDeviceList { typedef unsigned int GLuint; /** GLenum as uint.*/ typedef unsigned int GLenum; -/* +/** * @} */ @@ 
-99,10 +99,10 @@ hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint bu */ hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target, unsigned int flags); -/* +/** * @} */ #if defined(__cplusplus) } #endif /* __cplusplus */ -#endif /* HIP_INCLUDE_AMD_HIP_GL_INTEROP_H */ +#endif /* HIP_INCLUDE_AMD_HIP_GL_INTEROP_H */ \ No newline at end of file diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_warp_functions.h b/third_party/amd/backend/include/hip/amd_detail/amd_warp_functions.h index 559ab20b3399..98f8896cd91d 100644 --- a/third_party/amd/backend/include/hip/amd_detail/amd_warp_functions.h +++ b/third_party/amd/backend/include/hip/amd_detail/amd_warp_functions.h @@ -75,6 +75,50 @@ __device__ static inline int __hip_move_dpp_N(int src) { static constexpr int warpSize = __AMDGCN_WAVEFRONT_SIZE; +// warp vote function __all __any __ballot +__device__ +inline +int __all(int predicate) { + return __ockl_wfall_i32(predicate); +} + +__device__ +inline +int __any(int predicate) { + return __ockl_wfany_i32(predicate); +} + +// XXX from llvm/include/llvm/IR/InstrTypes.h +#define ICMP_NE 33 + +__device__ +inline +unsigned long long int __ballot(int predicate) { + return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); +} + +__device__ +inline +unsigned long long int __ballot64(int predicate) { + return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); +} + +// See amd_warp_sync_functions.h for an explanation of this preprocessor flag. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS +// Since threads in a wave do not make independent progress, __activemask() +// always returns the exact active mask, i.e, all active threads in the wave. 
+__device__ +inline +unsigned long long __activemask() { + return __ballot(true); +} +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS + +__device__ static inline unsigned int __lane_id() { + return __builtin_amdgcn_mbcnt_hi( + -1, __builtin_amdgcn_mbcnt_lo(-1, 0)); +} + __device__ inline int __shfl(int var, int src_lane, int width = warpSize) { diff --git a/third_party/amd/backend/include/hip/amd_detail/amd_warp_sync_functions.h b/third_party/amd/backend/include/hip/amd_detail/amd_warp_sync_functions.h new file mode 100644 index 000000000000..8ef0b2e1d73e --- /dev/null +++ b/third_party/amd/backend/include/hip/amd_detail/amd_warp_sync_functions.h @@ -0,0 +1,288 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a +// preview to allow end-users to adapt to the new interface involving 64-bit +// masks. 
These are disabled by default, and can be enabled by setting the macro +// below. The builtins will be enabled unconditionally in ROCm 6.3. +// +// This arrangement also applies to the __activemask() builtin defined in +// amd_warp_functions.h. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS + +#if !defined(__HIPCC_RTC__) +#include "amd_warp_functions.h" +#include "hip_assert.h" +#endif + +template +__device__ inline +T __hip_readfirstlane(T val) { + // In theory, behaviour is undefined when reading from a union member other + // than the member that was last assigned to, but it works in practice because + // we rely on the compiler to do the reasonable thing. + union { + unsigned long long l; + T d; + } u; + u.d = val; + // NOTE: The builtin returns int, so we first cast it to unsigned int and only + // then extend it to 64 bits. + unsigned long long lower = (unsigned)__builtin_amdgcn_readfirstlane(u.l); + unsigned long long upper = + (unsigned)__builtin_amdgcn_readfirstlane(u.l >> 32); + u.l = (upper << 32) | lower; + return u.d; +} + +// When compiling for wave32 mode, ignore the upper half of the 64-bit mask. +#define __hip_adjust_mask_for_wave32(MASK) \ + do { \ + if (warpSize == 32) MASK &= 0xFFFFFFFF; \ + } while (0) + +// We use a macro to expand each builtin into a waterfall that implements the +// mask semantics: +// +// 1. The mask argument may be divergent. +// 2. Each active thread must have its own bit set in its own mask value. +// 3. For a given mask value, all threads that are mentioned in the mask must +// execute the same static instance of the builtin with the same mask. +// 4. The union of all mask values supplied at a static instance must be equal +// to the activemask at the program point. +// +// Thus, the mask argument partitions the set of currently active threads in the +// wave into disjoint subsets that cover all active threads. 
+// +// Implementation notes: +// --------------------- +// +// We implement this as a waterfall loop that executes the builtin for each +// subset separately. The return value is a divergent value across the active +// threads. The value for inactive threads is defined by each builtin +// separately. +// +// As long as every mask value is non-zero, we don't need to check if a lane +// specifies itself in the mask; that is done by the later assertion where all +// chosen lanes must be in the chosen mask. + +#define __hip_check_mask(MASK) \ + do { \ + __hip_assert(MASK && "mask must be non-zero"); \ + bool done = false; \ + while (__any(!done)) { \ + if (!done) { \ + auto chosen_mask = __hip_readfirstlane(MASK); \ + if (MASK == chosen_mask) { \ + __hip_assert(MASK == __ballot(true) && \ + "all threads specified in the mask" \ + " must execute the same operation with the same mask"); \ + done = true; \ + } \ + } \ + } \ + } while(0) + +#define __hip_do_sync(RETVAL, FUNC, MASK, ...) \ + do { \ + __hip_assert(MASK && "mask must be non-zero"); \ + bool done = false; \ + while (__any(!done)) { \ + if (!done) { \ + auto chosen_mask = __hip_readfirstlane(MASK); \ + if (MASK == chosen_mask) { \ + __hip_assert(MASK == __ballot(true) && \ + "all threads specified in the mask" \ + " must execute the same operation with the same mask"); \ + RETVAL = FUNC(__VA_ARGS__); \ + done = true; \ + } \ + } \ + } \ + } while(0) + +// __all_sync, __any_sync, __ballot_sync + +template +__device__ inline +unsigned long long __ballot_sync(MaskT mask, int predicate) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. 
" + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + __hip_check_mask(mask); + return __ballot(predicate) & mask; +} + +template +__device__ inline +int __all_sync(MaskT mask, int predicate) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. " + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + return __ballot_sync(mask, predicate) == mask; +} + +template +__device__ inline +int __any_sync(MaskT mask, int predicate) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. " + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + return __ballot_sync(mask, predicate) != 0; +} + +// __match_any, __match_all and sync variants + +template +__device__ inline +unsigned long long __match_any(T value) { + static_assert( + (__hip_internal::is_integral::value || __hip_internal::is_floating_point::value) && + (sizeof(T) == 4 || sizeof(T) == 8), + "T can be int, unsigned int, long, unsigned long, long long, unsigned " + "long long, float or double."); + bool done = false; + unsigned long long retval = 0; + + while (__any(!done)) { + if (!done) { + T chosen = __hip_readfirstlane(value); + if (chosen == value) { + retval = __activemask(); + done = true; + } + } + } + + return retval; +} + +template +__device__ inline +unsigned long long __match_any_sync(MaskT mask, T value) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. 
" + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + __hip_check_mask(mask); + return __match_any(value) & mask; +} + +template +__device__ inline +unsigned long long __match_all(T value, int* pred) { + static_assert( + (__hip_internal::is_integral::value || __hip_internal::is_floating_point::value) && + (sizeof(T) == 4 || sizeof(T) == 8), + "T can be int, unsigned int, long, unsigned long, long long, unsigned " + "long long, float or double."); + T first = __hip_readfirstlane(value); + if (__all(first == value)) { + *pred = true; + return __activemask(); + } else { + *pred = false; + return 0; + } +} + +template +__device__ inline +unsigned long long __match_all_sync(MaskT mask, T value, int* pred) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. " + "Implicitly promoting a smaller integer is almost always an error."); + MaskT retval = 0; + __hip_adjust_mask_for_wave32(mask); + __hip_do_sync(retval, __match_all, mask, value, pred); + return retval; +} + +// various variants of shfl + +template +__device__ inline +T __shfl_sync(MaskT mask, T var, int srcLane, + int width = __AMDGCN_WAVEFRONT_SIZE) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. " + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + __hip_check_mask(mask); + return __shfl(var, srcLane, width); +} + +template +__device__ inline +T __shfl_up_sync(MaskT mask, T var, unsigned int delta, + int width = __AMDGCN_WAVEFRONT_SIZE) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. 
" + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + __hip_check_mask(mask); + return __shfl_up(var, delta, width); +} + +template +__device__ inline +T __shfl_down_sync(MaskT mask, T var, unsigned int delta, + int width = __AMDGCN_WAVEFRONT_SIZE) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. " + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + __hip_check_mask(mask); + return __shfl_down(var, delta, width); +} + +template +__device__ inline +T __shfl_xor_sync(MaskT mask, T var, int laneMask, + int width = __AMDGCN_WAVEFRONT_SIZE) { + static_assert( + __hip_internal::is_integral::value && sizeof(MaskT) == 8, + "The mask must be a 64-bit integer. " + "Implicitly promoting a smaller integer is almost always an error."); + __hip_adjust_mask_for_wave32(mask); + __hip_check_mask(mask); + return __shfl_xor(var, laneMask, width); +} + +#undef __hip_do_sync +#undef __hip_check_mask +#undef __hip_adjust_mask_for_wave32 + +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS diff --git a/third_party/amd/backend/include/hip/amd_detail/hip_api_trace.hpp b/third_party/amd/backend/include/hip/amd_detail/hip_api_trace.hpp index 2152d519ebe1..768c62e09857 100644 --- a/third_party/amd/backend/include/hip/amd_detail/hip_api_trace.hpp +++ b/third_party/amd/backend/include/hip/amd_detail/hip_api_trace.hpp @@ -1,5 +1,5 @@ /* - Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -23,6 +23,46 @@ #include +// Define some version macros for the API table. Use similar naming conventions to HSA-runtime +// (MAJOR and STEP versions). 
Three groups at this time: +// +// (A) HIP_API_TABLE_* defines for versioning the API table structure +// (B) HIP_RUNTIME_API_TABLE_* defines for versioning the HipDispatchTable struct +// (C) HIP_COMPILER_API_TABLE_* defines for versioning the HipCompilerDispatchTable struct +// +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! IMPORTANT !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// +// 1. When new functions are added to the API table, always add the new function pointer to the +// end of the table and increment the dispatch table's step version number. NEVER re-arrange +// the order of the member variables in a dispatch table. This will break the ABI. +// 2. In dire circumstances, if the type of an existing member variable in a dispatch +// table has to be changed because a data type has been changed/removed, increment the dispatch +// table's major version number. If the function pointer type can no longer be declared, DO +// NOT REMOVE IT! Make the function pointer type void* and have it always be set to a nullptr. +// +// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// +// The major version number should (ideally) never need to be incremented. +// - Increment the HIP_API_TABLE_MAJOR_VERSION for fundamental changes to the API table structs. +// - Increment the HIP_RUNTIME_API_TABLE_MAJOR_VERSION for fundamental changes to the +// HipDispatchTable struct, such as a *change* to the type/name of an existing member variable. DO NOT +// REMOVE IT. +// - Increment the HIP_COMPILER_API_TABLE_MAJOR_VERSION for fundamental changes to the +// HipCompilerDispatchTable struct, such as a *change* to the type/name of an existing member variable. +// DO NOT REMOVE IT. +#define HIP_API_TABLE_MAJOR_VERSION 0 +#define HIP_COMPILER_API_TABLE_MAJOR_VERSION 0 +#define HIP_RUNTIME_API_TABLE_MAJOR_VERSION 0 + +// The step version number should be changed whenever the size of the API table struct(s) changes.
+// - Increment the HIP_API_TABLE_STEP_VERSION when/if new API table structs are added +// - Increment the HIP_RUNTIME_API_TABLE_STEP_VERSION when new runtime API functions are added +// - Increment the HIP_COMPILER_API_TABLE_STEP_VERSION when new compiler API functions are added +// - Reset any of the *_STEP_VERSION defines to zero if the corresponding *_MAJOR_VERSION increases +#define HIP_API_TABLE_STEP_VERSION 0 +#define HIP_COMPILER_API_TABLE_STEP_VERSION 0 +#define HIP_RUNTIME_API_TABLE_STEP_VERSION 3 + // HIP API interface typedef hipError_t (*t___hipPopCallConfiguration)(dim3* gridDim, dim3* blockDim, size_t* sharedMem, hipStream_t* stream); @@ -255,6 +295,7 @@ typedef hipError_t (*t_hipGraphAddMemsetNode)(hipGraphNode_t* pGraphNode, hipGra const hipGraphNode_t* pDependencies, size_t numDependencies, const hipMemsetParams* pMemsetParams); + typedef hipError_t (*t_hipGraphChildGraphNodeGetGraph)(hipGraphNode_t node, hipGraph_t* pGraph); typedef hipError_t (*t_hipGraphClone)(hipGraph_t* pGraphClone, hipGraph_t originalGraph); typedef hipError_t (*t_hipGraphCreate)(hipGraph_t* pGraph, unsigned int flags); @@ -866,28 +907,68 @@ typedef hipError_t (*t_hipHccModuleLaunchKernel)(hipFunction_t f, uint32_t globa void** extra, hipEvent_t startEvent, hipEvent_t stopEvent); typedef int (*t_hipGetStreamDeviceId)(hipStream_t stream); - typedef hipError_t (*t_hipDrvGraphAddMemsetNode)(hipGraphNode_t* phGraphNode, hipGraph_t hGraph, const hipGraphNode_t* dependencies, size_t numDependencies, const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx); -typedef hipError_t (*t_hipGraphAddExternalSemaphoresWaitNode)(hipGraphNode_t* pGraphNode, hipGraph_t graph, - const hipGraphNode_t* pDependencies, size_t numDependencies, +typedef hipError_t (*t_hipGraphAddExternalSemaphoresWaitNode)(hipGraphNode_t* pGraphNode, + hipGraph_t graph, const hipGraphNode_t* pDependencies, + size_t numDependencies, const hipExternalSemaphoreWaitNodeParams* nodeParams); -typedef hipError_t 
(*t_hipGraphAddExternalSemaphoresSignalNode)(hipGraphNode_t* pGraphNode, hipGraph_t graph, - const hipGraphNode_t* pDependencies, size_t numDependencies, +typedef hipError_t (*t_hipGraphAddExternalSemaphoresSignalNode)(hipGraphNode_t* pGraphNode, + hipGraph_t graph, const hipGraphNode_t* pDependencies, + size_t numDependencies, const hipExternalSemaphoreSignalNodeParams* nodeParams); typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeSetParams)(hipGraphNode_t hNode, - const hipExternalSemaphoreSignalNodeParams* nodeParams); + const hipExternalSemaphoreSignalNodeParams* nodeParams); typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeSetParams)(hipGraphNode_t hNode, - const hipExternalSemaphoreWaitNodeParams* nodeParams); + const hipExternalSemaphoreWaitNodeParams* nodeParams); typedef hipError_t (*t_hipGraphExternalSemaphoresSignalNodeGetParams)(hipGraphNode_t hNode, - hipExternalSemaphoreSignalNodeParams* params_out); + hipExternalSemaphoreSignalNodeParams* params_out); typedef hipError_t (*t_hipGraphExternalSemaphoresWaitNodeGetParams)(hipGraphNode_t hNode, - hipExternalSemaphoreWaitNodeParams* params_out); -typedef hipError_t (*t_hipGraphExecExternalSemaphoresSignalNodeSetParams)(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, - const hipExternalSemaphoreSignalNodeParams* nodeParams); -typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraphExec_t hGraphExec, hipGraphNode_t hNode, - const hipExternalSemaphoreWaitNodeParams* nodeParams); + hipExternalSemaphoreWaitNodeParams* params_out); +typedef hipError_t (*t_hipGraphExecExternalSemaphoresSignalNodeSetParams)(hipGraphExec_t hGraphExec, + hipGraphNode_t hNode, + const hipExternalSemaphoreSignalNodeParams* nodeParams); +typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraphExec_t hGraphExec, + hipGraphNode_t hNode, + const hipExternalSemaphoreWaitNodeParams* nodeParams); +typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t *pGraphNode, hipGraph_t 
graph, + const hipGraphNode_t *pDependencies, size_t numDependencies, + hipGraphNodeParams *nodeParams); +typedef hipError_t (*t_hipGraphInstantiateWithParams)(hipGraphExec_t* pGraphExec, hipGraph_t graph, + hipGraphInstantiateParams* instantiateParams); +typedef hipError_t (*t_hipExtGetLastError)(); +typedef hipError_t (*t_hipTexRefGetBorderColor)(float* pBorderColor, + const textureReference* texRef); +typedef hipError_t (*t_hipTexRefGetArray)(hipArray_t* pArray, const textureReference* texRef); + +typedef hipError_t (*t_hipTexRefGetBorderColor)(float* pBorderColor, + const textureReference* texRef); +typedef hipError_t (*t_hipTexRefGetArray)(hipArray_t* pArray, const textureReference* texRef); +typedef hipError_t (*t_hipGetProcAddress)(const char* symbol, void** pfn, int hipVersion, uint64_t flags, + hipDriverProcAddressQueryResult* symbolStatus); +typedef hipError_t (*t_hipStreamBeginCaptureToGraph)(hipStream_t stream, hipGraph_t graph, + const hipGraphNode_t* dependencies, + const hipGraphEdgeData* dependencyData, + size_t numDependencies, + hipStreamCaptureMode mode); +typedef hipError_t (*t_hipGetFuncBySymbol)(hipFunction_t* functionPtr, const void* symbolPtr); +typedef hipError_t (*t_hipSetValidDevices)(int* device_arr, int len); +typedef hipError_t (*t_hipMemcpyAtoD)(hipDeviceptr_t dstDevice, hipArray_t srcArray, + size_t srcOffset, size_t ByteCount); +typedef hipError_t (*t_hipMemcpyDtoA)(hipArray_t dstArray, size_t dstOffset, + hipDeviceptr_t srcDevice, size_t ByteCount); +typedef hipError_t (*t_hipMemcpyAtoA)(hipArray_t dstArray, size_t dstOffset, hipArray_t srcArray, + size_t srcOffset, size_t ByteCount); +typedef hipError_t (*t_hipMemcpyAtoHAsync)(void* dstHost, hipArray_t srcArray, size_t srcOffset, + size_t ByteCount, hipStream_t stream); +typedef hipError_t (*t_hipMemcpyHtoAAsync)(hipArray_t dstArray, size_t dstOffset, + const void* srcHost, size_t ByteCount, + hipStream_t stream); +typedef hipError_t (*t_hipMemcpy2DArrayToArray)(hipArray_t dst, 
size_t wOffsetDst, + size_t hOffsetDst, hipArray_const_t src, + size_t wOffsetSrc, size_t hOffsetSrc, size_t width, + size_t height, hipMemcpyKind kind); // HIP Compiler dispatch table struct HipCompilerDispatchTable { @@ -1347,4 +1428,19 @@ struct HipDispatchTable { t_hipGraphExternalSemaphoresWaitNodeGetParams hipGraphExternalSemaphoresWaitNodeGetParams_fn; t_hipGraphExecExternalSemaphoresSignalNodeSetParams hipGraphExecExternalSemaphoresSignalNodeSetParams_fn; t_hipGraphExecExternalSemaphoresWaitNodeSetParams hipGraphExecExternalSemaphoresWaitNodeSetParams_fn; + t_hipGraphAddNode hipGraphAddNode_fn; + t_hipGraphInstantiateWithParams hipGraphInstantiateWithParams_fn; + t_hipExtGetLastError hipExtGetLastError_fn; + t_hipTexRefGetBorderColor hipTexRefGetBorderColor_fn; + t_hipTexRefGetArray hipTexRefGetArray_fn; + t_hipGetProcAddress hipGetProcAddress_fn; + t_hipStreamBeginCaptureToGraph hipStreamBeginCaptureToGraph_fn; + t_hipGetFuncBySymbol hipGetFuncBySymbol_fn; + t_hipSetValidDevices hipSetValidDevices_fn; + t_hipMemcpyAtoD hipMemcpyAtoD_fn; + t_hipMemcpyDtoA hipMemcpyDtoA_fn; + t_hipMemcpyAtoA hipMemcpyAtoA_fn; + t_hipMemcpyAtoHAsync hipMemcpyAtoHAsync_fn; + t_hipMemcpyHtoAAsync hipMemcpyHtoAAsync_fn; + t_hipMemcpy2DArrayToArray hipMemcpy2DArrayToArray_fn; }; diff --git a/third_party/amd/backend/include/hip/amd_detail/hip_prof_str.h b/third_party/amd/backend/include/hip/amd_detail/hip_prof_str.h index 3c9c09f2cee8..992c198d0894 100644 --- a/third_party/amd/backend/include/hip/amd_detail/hip_prof_str.h +++ b/third_party/amd/backend/include/hip/amd_detail/hip_prof_str.h @@ -385,8 +385,8 @@ enum hip_api_id_t { HIP_API_ID_hipChooseDeviceR0600 = 365, HIP_API_ID_hipDrvGraphAddMemcpyNode = 366, HIP_API_ID_hipDrvGraphAddMemsetNode = 367, - HIP_API_ID_hipDrvGraphMemcpyNodeGetParams = 368, - HIP_API_ID_hipDrvGraphMemcpyNodeSetParams = 369, + HIP_API_ID_RESERVED_368 = 368, + HIP_API_ID_RESERVED_369 = 369, HIP_API_ID_hipGetDevicePropertiesR0600 = 370, 
HIP_API_ID_hipGraphAddExternalSemaphoresSignalNode = 371, HIP_API_ID_hipGraphAddExternalSemaphoresWaitNode = 372, @@ -397,7 +397,27 @@ enum hip_api_id_t { HIP_API_ID_hipGraphExternalSemaphoresWaitNodeGetParams = 377, HIP_API_ID_hipGraphExternalSemaphoresWaitNodeSetParams = 378, HIP_API_ID_hipExtGetLastError = 379, - HIP_API_ID_LAST = 379, + HIP_API_ID_hipGraphAddNode = 380, + HIP_API_ID_hipGetProcAddress = 381, + HIP_API_ID_RESERVED_382 = 382, + HIP_API_ID_RESERVED_383 = 383, + HIP_API_ID_hipGraphInstantiateWithParams = 384, + HIP_API_ID_RESERVED_385 = 385, + HIP_API_ID_RESERVED_386 = 386, + HIP_API_ID_RESERVED_387 = 387, + HIP_API_ID_RESERVED_388 = 388, + HIP_API_ID_hipTexRefGetArray = 389, + HIP_API_ID_hipTexRefGetBorderColor = 390, + HIP_API_ID_hipStreamBeginCaptureToGraph = 391, + HIP_API_ID_hipGetFuncBySymbol = 392, + HIP_API_ID_hipMemcpy2DArrayToArray = 393, + HIP_API_ID_hipMemcpyAtoA = 394, + HIP_API_ID_hipMemcpyAtoD = 395, + HIP_API_ID_hipMemcpyAtoHAsync = 396, + HIP_API_ID_hipMemcpyDtoA = 397, + HIP_API_ID_hipMemcpyHtoAAsync = 398, + HIP_API_ID_hipSetValidDevices = 399, + HIP_API_ID_LAST = 399, HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice), HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties), @@ -414,24 +434,14 @@ enum hip_api_id_t { HIP_API_ID_hipGetTextureObjectResourceViewDesc = HIP_API_ID_NONE, HIP_API_ID_hipGetTextureObjectTextureDesc = HIP_API_ID_NONE, HIP_API_ID_hipGetTextureReference = HIP_API_ID_NONE, - HIP_API_ID_hipMemcpy2DArrayToArray = HIP_API_ID_NONE, - HIP_API_ID_hipMemcpyAtoA = HIP_API_ID_NONE, - HIP_API_ID_hipMemcpyAtoD = HIP_API_ID_NONE, - HIP_API_ID_hipMemcpyAtoHAsync = HIP_API_ID_NONE, - HIP_API_ID_hipMemcpyDtoA = HIP_API_ID_NONE, - HIP_API_ID_hipMemcpyHtoAAsync = HIP_API_ID_NONE, - HIP_API_ID_hipSetValidDevices = HIP_API_ID_NONE, HIP_API_ID_hipTexObjectCreate = HIP_API_ID_NONE, HIP_API_ID_hipTexObjectDestroy = HIP_API_ID_NONE, HIP_API_ID_hipTexObjectGetResourceDesc = 
HIP_API_ID_NONE, HIP_API_ID_hipTexObjectGetResourceViewDesc = HIP_API_ID_NONE, HIP_API_ID_hipTexObjectGetTextureDesc = HIP_API_ID_NONE, HIP_API_ID_hipTexRefGetAddressMode = HIP_API_ID_NONE, - HIP_API_ID_hipTexRefGetArray = HIP_API_ID_NONE, - HIP_API_ID_hipTexRefGetBorderColor = HIP_API_ID_NONE, HIP_API_ID_hipTexRefGetFilterMode = HIP_API_ID_NONE, HIP_API_ID_hipTexRefGetMipmapFilterMode = HIP_API_ID_NONE, - HIP_API_ID_hipTexRefGetMipmappedArray = HIP_API_ID_NONE, HIP_API_ID_hipTexRefSetAddressMode = HIP_API_ID_NONE, HIP_API_ID_hipTexRefSetFilterMode = HIP_API_ID_NONE, HIP_API_ID_hipTexRefSetMipmapFilterMode = HIP_API_ID_NONE, @@ -510,8 +520,6 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipDriverGetVersion: return "hipDriverGetVersion"; case HIP_API_ID_hipDrvGraphAddMemcpyNode: return "hipDrvGraphAddMemcpyNode"; case HIP_API_ID_hipDrvGraphAddMemsetNode: return "hipDrvGraphAddMemsetNode"; - case HIP_API_ID_hipDrvGraphMemcpyNodeGetParams: return "hipDrvGraphMemcpyNodeGetParams"; - case HIP_API_ID_hipDrvGraphMemcpyNodeSetParams: return "hipDrvGraphMemcpyNodeSetParams"; case HIP_API_ID_hipDrvMemcpy2DUnaligned: return "hipDrvMemcpy2DUnaligned"; case HIP_API_ID_hipDrvMemcpy3D: return "hipDrvMemcpy3D"; case HIP_API_ID_hipDrvMemcpy3DAsync: return "hipDrvMemcpy3DAsync"; @@ -523,6 +531,7 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipEventQuery: return "hipEventQuery"; case HIP_API_ID_hipEventRecord: return "hipEventRecord"; case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize"; + case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError"; case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount"; case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel"; case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice"; @@ -550,8 +559,10 @@ static inline const char* hip_api_name(const uint32_t id) { case 
HIP_API_ID_hipGetDevicePropertiesR0000: return "hipGetDevicePropertiesR0000"; case HIP_API_ID_hipGetDevicePropertiesR0600: return "hipGetDevicePropertiesR0600"; case HIP_API_ID_hipGetErrorString: return "hipGetErrorString"; + case HIP_API_ID_hipGetFuncBySymbol: return "hipGetFuncBySymbol"; case HIP_API_ID_hipGetLastError: return "hipGetLastError"; case HIP_API_ID_hipGetMipmappedArrayLevel: return "hipGetMipmappedArrayLevel"; + case HIP_API_ID_hipGetProcAddress: return "hipGetProcAddress"; case HIP_API_ID_hipGetSymbolAddress: return "hipGetSymbolAddress"; case HIP_API_ID_hipGetSymbolSize: return "hipGetSymbolSize"; case HIP_API_ID_hipGraphAddChildGraphNode: return "hipGraphAddChildGraphNode"; @@ -570,6 +581,7 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol: return "hipGraphAddMemcpyNodeFromSymbol"; case HIP_API_ID_hipGraphAddMemcpyNodeToSymbol: return "hipGraphAddMemcpyNodeToSymbol"; case HIP_API_ID_hipGraphAddMemsetNode: return "hipGraphAddMemsetNode"; + case HIP_API_ID_hipGraphAddNode: return "hipGraphAddNode"; case HIP_API_ID_hipGraphChildGraphNodeGetGraph: return "hipGraphChildGraphNodeGetGraph"; case HIP_API_ID_hipGraphClone: return "hipGraphClone"; case HIP_API_ID_hipGraphCreate: return "hipGraphCreate"; @@ -605,6 +617,7 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipGraphHostNodeSetParams: return "hipGraphHostNodeSetParams"; case HIP_API_ID_hipGraphInstantiate: return "hipGraphInstantiate"; case HIP_API_ID_hipGraphInstantiateWithFlags: return "hipGraphInstantiateWithFlags"; + case HIP_API_ID_hipGraphInstantiateWithParams: return "hipGraphInstantiateWithParams"; case HIP_API_ID_hipGraphKernelNodeCopyAttributes: return "hipGraphKernelNodeCopyAttributes"; case HIP_API_ID_hipGraphKernelNodeGetAttribute: return "hipGraphKernelNodeGetAttribute"; case HIP_API_ID_hipGraphKernelNodeGetParams: return "hipGraphKernelNodeGetParams"; @@ -704,6 +717,7 @@ static inline const 
char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipMemUnmap: return "hipMemUnmap"; case HIP_API_ID_hipMemcpy: return "hipMemcpy"; case HIP_API_ID_hipMemcpy2D: return "hipMemcpy2D"; + case HIP_API_ID_hipMemcpy2DArrayToArray: return "hipMemcpy2DArrayToArray"; case HIP_API_ID_hipMemcpy2DAsync: return "hipMemcpy2DAsync"; case HIP_API_ID_hipMemcpy2DFromArray: return "hipMemcpy2DFromArray"; case HIP_API_ID_hipMemcpy2DFromArrayAsync: return "hipMemcpy2DFromArrayAsync"; @@ -712,7 +726,11 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipMemcpy3D: return "hipMemcpy3D"; case HIP_API_ID_hipMemcpy3DAsync: return "hipMemcpy3DAsync"; case HIP_API_ID_hipMemcpyAsync: return "hipMemcpyAsync"; + case HIP_API_ID_hipMemcpyAtoA: return "hipMemcpyAtoA"; + case HIP_API_ID_hipMemcpyAtoD: return "hipMemcpyAtoD"; case HIP_API_ID_hipMemcpyAtoH: return "hipMemcpyAtoH"; + case HIP_API_ID_hipMemcpyAtoHAsync: return "hipMemcpyAtoHAsync"; + case HIP_API_ID_hipMemcpyDtoA: return "hipMemcpyDtoA"; case HIP_API_ID_hipMemcpyDtoD: return "hipMemcpyDtoD"; case HIP_API_ID_hipMemcpyDtoDAsync: return "hipMemcpyDtoDAsync"; case HIP_API_ID_hipMemcpyDtoH: return "hipMemcpyDtoH"; @@ -721,6 +739,7 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipMemcpyFromSymbol: return "hipMemcpyFromSymbol"; case HIP_API_ID_hipMemcpyFromSymbolAsync: return "hipMemcpyFromSymbolAsync"; case HIP_API_ID_hipMemcpyHtoA: return "hipMemcpyHtoA"; + case HIP_API_ID_hipMemcpyHtoAAsync: return "hipMemcpyHtoAAsync"; case HIP_API_ID_hipMemcpyHtoD: return "hipMemcpyHtoD"; case HIP_API_ID_hipMemcpyHtoDAsync: return "hipMemcpyHtoDAsync"; case HIP_API_ID_hipMemcpyParam2D: return "hipMemcpyParam2D"; @@ -772,11 +791,13 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipRuntimeGetVersion: return "hipRuntimeGetVersion"; case HIP_API_ID_hipSetDevice: return "hipSetDevice"; case HIP_API_ID_hipSetDeviceFlags: return "hipSetDeviceFlags"; + case 
HIP_API_ID_hipSetValidDevices: return "hipSetValidDevices"; case HIP_API_ID_hipSetupArgument: return "hipSetupArgument"; case HIP_API_ID_hipSignalExternalSemaphoresAsync: return "hipSignalExternalSemaphoresAsync"; case HIP_API_ID_hipStreamAddCallback: return "hipStreamAddCallback"; case HIP_API_ID_hipStreamAttachMemAsync: return "hipStreamAttachMemAsync"; case HIP_API_ID_hipStreamBeginCapture: return "hipStreamBeginCapture"; + case HIP_API_ID_hipStreamBeginCaptureToGraph: return "hipStreamBeginCaptureToGraph"; case HIP_API_ID_hipStreamCreate: return "hipStreamCreate"; case HIP_API_ID_hipStreamCreateWithFlags: return "hipStreamCreateWithFlags"; case HIP_API_ID_hipStreamCreateWithPriority: return "hipStreamCreateWithPriority"; @@ -797,6 +818,8 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipStreamWriteValue32: return "hipStreamWriteValue32"; case HIP_API_ID_hipStreamWriteValue64: return "hipStreamWriteValue64"; case HIP_API_ID_hipTexRefGetAddress: return "hipTexRefGetAddress"; + case HIP_API_ID_hipTexRefGetArray: return "hipTexRefGetArray"; + case HIP_API_ID_hipTexRefGetBorderColor: return "hipTexRefGetBorderColor"; case HIP_API_ID_hipTexRefGetFlags: return "hipTexRefGetFlags"; case HIP_API_ID_hipTexRefGetFormat: return "hipTexRefGetFormat"; case HIP_API_ID_hipTexRefGetMaxAnisotropy: return "hipTexRefGetMaxAnisotropy"; @@ -818,7 +841,6 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipUserObjectRelease: return "hipUserObjectRelease"; case HIP_API_ID_hipUserObjectRetain: return "hipUserObjectRetain"; case HIP_API_ID_hipWaitExternalSemaphoresAsync: return "hipWaitExternalSemaphoresAsync"; - case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError"; }; return "unknown"; }; @@ -892,8 +914,6 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipDriverGetVersion", name) == 0) return HIP_API_ID_hipDriverGetVersion; if (strcmp("hipDrvGraphAddMemcpyNode", name) == 0) return 
HIP_API_ID_hipDrvGraphAddMemcpyNode; if (strcmp("hipDrvGraphAddMemsetNode", name) == 0) return HIP_API_ID_hipDrvGraphAddMemsetNode; - if (strcmp("hipDrvGraphMemcpyNodeGetParams", name) == 0) return HIP_API_ID_hipDrvGraphMemcpyNodeGetParams; - if (strcmp("hipDrvGraphMemcpyNodeSetParams", name) == 0) return HIP_API_ID_hipDrvGraphMemcpyNodeSetParams; if (strcmp("hipDrvMemcpy2DUnaligned", name) == 0) return HIP_API_ID_hipDrvMemcpy2DUnaligned; if (strcmp("hipDrvMemcpy3D", name) == 0) return HIP_API_ID_hipDrvMemcpy3D; if (strcmp("hipDrvMemcpy3DAsync", name) == 0) return HIP_API_ID_hipDrvMemcpy3DAsync; @@ -905,6 +925,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipEventQuery", name) == 0) return HIP_API_ID_hipEventQuery; if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord; if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize; + if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError; if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount; if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel; if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice; @@ -932,8 +953,10 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipGetDevicePropertiesR0000", name) == 0) return HIP_API_ID_hipGetDevicePropertiesR0000; if (strcmp("hipGetDevicePropertiesR0600", name) == 0) return HIP_API_ID_hipGetDevicePropertiesR0600; if (strcmp("hipGetErrorString", name) == 0) return HIP_API_ID_hipGetErrorString; + if (strcmp("hipGetFuncBySymbol", name) == 0) return HIP_API_ID_hipGetFuncBySymbol; if (strcmp("hipGetLastError", name) == 0) return HIP_API_ID_hipGetLastError; if (strcmp("hipGetMipmappedArrayLevel", name) == 0) return HIP_API_ID_hipGetMipmappedArrayLevel; + if (strcmp("hipGetProcAddress", name) == 0) return 
HIP_API_ID_hipGetProcAddress; if (strcmp("hipGetSymbolAddress", name) == 0) return HIP_API_ID_hipGetSymbolAddress; if (strcmp("hipGetSymbolSize", name) == 0) return HIP_API_ID_hipGetSymbolSize; if (strcmp("hipGraphAddChildGraphNode", name) == 0) return HIP_API_ID_hipGraphAddChildGraphNode; @@ -952,6 +975,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipGraphAddMemcpyNodeFromSymbol", name) == 0) return HIP_API_ID_hipGraphAddMemcpyNodeFromSymbol; if (strcmp("hipGraphAddMemcpyNodeToSymbol", name) == 0) return HIP_API_ID_hipGraphAddMemcpyNodeToSymbol; if (strcmp("hipGraphAddMemsetNode", name) == 0) return HIP_API_ID_hipGraphAddMemsetNode; + if (strcmp("hipGraphAddNode", name) == 0) return HIP_API_ID_hipGraphAddNode; if (strcmp("hipGraphChildGraphNodeGetGraph", name) == 0) return HIP_API_ID_hipGraphChildGraphNodeGetGraph; if (strcmp("hipGraphClone", name) == 0) return HIP_API_ID_hipGraphClone; if (strcmp("hipGraphCreate", name) == 0) return HIP_API_ID_hipGraphCreate; @@ -987,6 +1011,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipGraphHostNodeSetParams", name) == 0) return HIP_API_ID_hipGraphHostNodeSetParams; if (strcmp("hipGraphInstantiate", name) == 0) return HIP_API_ID_hipGraphInstantiate; if (strcmp("hipGraphInstantiateWithFlags", name) == 0) return HIP_API_ID_hipGraphInstantiateWithFlags; + if (strcmp("hipGraphInstantiateWithParams", name) == 0) return HIP_API_ID_hipGraphInstantiateWithParams; if (strcmp("hipGraphKernelNodeCopyAttributes", name) == 0) return HIP_API_ID_hipGraphKernelNodeCopyAttributes; if (strcmp("hipGraphKernelNodeGetAttribute", name) == 0) return HIP_API_ID_hipGraphKernelNodeGetAttribute; if (strcmp("hipGraphKernelNodeGetParams", name) == 0) return HIP_API_ID_hipGraphKernelNodeGetParams; @@ -1086,6 +1111,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipMemUnmap", name) == 0) return HIP_API_ID_hipMemUnmap; if (strcmp("hipMemcpy", name) == 0) return 
HIP_API_ID_hipMemcpy; if (strcmp("hipMemcpy2D", name) == 0) return HIP_API_ID_hipMemcpy2D; + if (strcmp("hipMemcpy2DArrayToArray", name) == 0) return HIP_API_ID_hipMemcpy2DArrayToArray; if (strcmp("hipMemcpy2DAsync", name) == 0) return HIP_API_ID_hipMemcpy2DAsync; if (strcmp("hipMemcpy2DFromArray", name) == 0) return HIP_API_ID_hipMemcpy2DFromArray; if (strcmp("hipMemcpy2DFromArrayAsync", name) == 0) return HIP_API_ID_hipMemcpy2DFromArrayAsync; @@ -1094,7 +1120,11 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipMemcpy3D", name) == 0) return HIP_API_ID_hipMemcpy3D; if (strcmp("hipMemcpy3DAsync", name) == 0) return HIP_API_ID_hipMemcpy3DAsync; if (strcmp("hipMemcpyAsync", name) == 0) return HIP_API_ID_hipMemcpyAsync; + if (strcmp("hipMemcpyAtoA", name) == 0) return HIP_API_ID_hipMemcpyAtoA; + if (strcmp("hipMemcpyAtoD", name) == 0) return HIP_API_ID_hipMemcpyAtoD; if (strcmp("hipMemcpyAtoH", name) == 0) return HIP_API_ID_hipMemcpyAtoH; + if (strcmp("hipMemcpyAtoHAsync", name) == 0) return HIP_API_ID_hipMemcpyAtoHAsync; + if (strcmp("hipMemcpyDtoA", name) == 0) return HIP_API_ID_hipMemcpyDtoA; if (strcmp("hipMemcpyDtoD", name) == 0) return HIP_API_ID_hipMemcpyDtoD; if (strcmp("hipMemcpyDtoDAsync", name) == 0) return HIP_API_ID_hipMemcpyDtoDAsync; if (strcmp("hipMemcpyDtoH", name) == 0) return HIP_API_ID_hipMemcpyDtoH; @@ -1103,6 +1133,7 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipMemcpyFromSymbol", name) == 0) return HIP_API_ID_hipMemcpyFromSymbol; if (strcmp("hipMemcpyFromSymbolAsync", name) == 0) return HIP_API_ID_hipMemcpyFromSymbolAsync; if (strcmp("hipMemcpyHtoA", name) == 0) return HIP_API_ID_hipMemcpyHtoA; + if (strcmp("hipMemcpyHtoAAsync", name) == 0) return HIP_API_ID_hipMemcpyHtoAAsync; if (strcmp("hipMemcpyHtoD", name) == 0) return HIP_API_ID_hipMemcpyHtoD; if (strcmp("hipMemcpyHtoDAsync", name) == 0) return HIP_API_ID_hipMemcpyHtoDAsync; if (strcmp("hipMemcpyParam2D", name) == 0) return 
HIP_API_ID_hipMemcpyParam2D; @@ -1154,11 +1185,13 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipRuntimeGetVersion", name) == 0) return HIP_API_ID_hipRuntimeGetVersion; if (strcmp("hipSetDevice", name) == 0) return HIP_API_ID_hipSetDevice; if (strcmp("hipSetDeviceFlags", name) == 0) return HIP_API_ID_hipSetDeviceFlags; + if (strcmp("hipSetValidDevices", name) == 0) return HIP_API_ID_hipSetValidDevices; if (strcmp("hipSetupArgument", name) == 0) return HIP_API_ID_hipSetupArgument; if (strcmp("hipSignalExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipSignalExternalSemaphoresAsync; if (strcmp("hipStreamAddCallback", name) == 0) return HIP_API_ID_hipStreamAddCallback; if (strcmp("hipStreamAttachMemAsync", name) == 0) return HIP_API_ID_hipStreamAttachMemAsync; if (strcmp("hipStreamBeginCapture", name) == 0) return HIP_API_ID_hipStreamBeginCapture; + if (strcmp("hipStreamBeginCaptureToGraph", name) == 0) return HIP_API_ID_hipStreamBeginCaptureToGraph; if (strcmp("hipStreamCreate", name) == 0) return HIP_API_ID_hipStreamCreate; if (strcmp("hipStreamCreateWithFlags", name) == 0) return HIP_API_ID_hipStreamCreateWithFlags; if (strcmp("hipStreamCreateWithPriority", name) == 0) return HIP_API_ID_hipStreamCreateWithPriority; @@ -1179,6 +1212,8 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipStreamWriteValue32", name) == 0) return HIP_API_ID_hipStreamWriteValue32; if (strcmp("hipStreamWriteValue64", name) == 0) return HIP_API_ID_hipStreamWriteValue64; if (strcmp("hipTexRefGetAddress", name) == 0) return HIP_API_ID_hipTexRefGetAddress; + if (strcmp("hipTexRefGetArray", name) == 0) return HIP_API_ID_hipTexRefGetArray; + if (strcmp("hipTexRefGetBorderColor", name) == 0) return HIP_API_ID_hipTexRefGetBorderColor; if (strcmp("hipTexRefGetFlags", name) == 0) return HIP_API_ID_hipTexRefGetFlags; if (strcmp("hipTexRefGetFormat", name) == 0) return HIP_API_ID_hipTexRefGetFormat; if (strcmp("hipTexRefGetMaxAnisotropy", 
name) == 0) return HIP_API_ID_hipTexRefGetMaxAnisotropy; @@ -1200,7 +1235,6 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipUserObjectRelease", name) == 0) return HIP_API_ID_hipUserObjectRelease; if (strcmp("hipUserObjectRetain", name) == 0) return HIP_API_ID_hipUserObjectRetain; if (strcmp("hipWaitExternalSemaphoresAsync", name) == 0) return HIP_API_ID_hipWaitExternalSemaphoresAsync; - if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError; return HIP_API_ID_NONE; } @@ -1519,16 +1553,6 @@ typedef struct hip_api_data_s { HIP_MEMSET_NODE_PARAMS memsetParams__val; hipCtx_t ctx; } hipDrvGraphAddMemsetNode; - struct { - hipGraphNode_t hNode; - HIP_MEMCPY3D* nodeParams; - HIP_MEMCPY3D nodeParams__val; - } hipDrvGraphMemcpyNodeGetParams; - struct { - hipGraphNode_t hNode; - const HIP_MEMCPY3D* nodeParams; - HIP_MEMCPY3D nodeParams__val; - } hipDrvGraphMemcpyNodeSetParams; struct { const hip_Memcpy2D* pCopy; hip_Memcpy2D pCopy__val; @@ -1730,12 +1754,27 @@ typedef struct hip_api_data_s { hipDeviceProp_tR0600 prop__val; int deviceId; } hipGetDevicePropertiesR0600; + struct { + hipFunction_t* functionPtr; + hipFunction_t functionPtr__val; + const void* symbolPtr; + } hipGetFuncBySymbol; struct { hipArray_t* levelArray; hipArray_t levelArray__val; hipMipmappedArray_const_t mipmappedArray; unsigned int level; } hipGetMipmappedArrayLevel; + struct { + const char* symbol; + char symbol__val; + void** pfn; + void* pfn__val; + int hipVersion; + uint64_t flags; + hipDriverProcAddressQueryResult* symbolStatus; + hipDriverProcAddressQueryResult symbolStatus__val; + } hipGetProcAddress; struct { void** devPtr; void* devPtr__val; @@ -1906,6 +1945,16 @@ typedef struct hip_api_data_s { const hipMemsetParams* pMemsetParams; hipMemsetParams pMemsetParams__val; } hipGraphAddMemsetNode; + struct { + hipGraphNode_t* pGraphNode; + hipGraphNode_t pGraphNode__val; + hipGraph_t graph; + const hipGraphNode_t* pDependencies; + hipGraphNode_t 
pDependencies__val; + size_t numDependencies; + hipGraphNodeParams* nodeParams; + hipGraphNodeParams nodeParams__val; + } hipGraphAddNode; struct { hipGraphNode_t node; hipGraph_t* pGraph; @@ -2108,15 +2157,22 @@ typedef struct hip_api_data_s { hipGraph_t graph; unsigned long long flags; } hipGraphInstantiateWithFlags; + struct { + hipGraphExec_t* pGraphExec; + hipGraphExec_t pGraphExec__val; + hipGraph_t graph; + hipGraphInstantiateParams* instantiateParams; + hipGraphInstantiateParams instantiateParams__val; + } hipGraphInstantiateWithParams; struct { hipGraphNode_t hSrc; hipGraphNode_t hDst; } hipGraphKernelNodeCopyAttributes; struct { hipGraphNode_t hNode; - hipKernelNodeAttrID attr; - hipKernelNodeAttrValue* value; - hipKernelNodeAttrValue value__val; + hipLaunchAttributeID attr; + hipLaunchAttributeValue* value; + hipLaunchAttributeValue value__val; } hipGraphKernelNodeGetAttribute; struct { hipGraphNode_t node; @@ -2125,9 +2181,9 @@ typedef struct hip_api_data_s { } hipGraphKernelNodeGetParams; struct { hipGraphNode_t hNode; - hipKernelNodeAttrID attr; - const hipKernelNodeAttrValue* value; - hipKernelNodeAttrValue value__val; + hipLaunchAttributeID attr; + const hipLaunchAttributeValue* value; + hipLaunchAttributeValue value__val; } hipGraphKernelNodeSetAttribute; struct { hipGraphNode_t node; @@ -2702,6 +2758,17 @@ typedef struct hip_api_data_s { size_t height; hipMemcpyKind kind; } hipMemcpy2D; + struct { + hipArray_t dst; + size_t wOffsetDst; + size_t hOffsetDst; + hipArray_const_t src; + size_t wOffsetSrc; + size_t hOffsetSrc; + size_t width; + size_t height; + hipMemcpyKind kind; + } hipMemcpy2DArrayToArray; struct { void* dst; size_t dpitch; @@ -2770,12 +2837,38 @@ typedef struct hip_api_data_s { hipMemcpyKind kind; hipStream_t stream; } hipMemcpyAsync; + struct { + hipArray_t dstArray; + size_t dstOffset; + hipArray_t srcArray; + size_t srcOffset; + size_t ByteCount; + } hipMemcpyAtoA; + struct { + hipDeviceptr_t dstDevice; + hipArray_t srcArray; + 
size_t srcOffset; + size_t ByteCount; + } hipMemcpyAtoD; struct { void* dst; hipArray_t srcArray; size_t srcOffset; size_t count; } hipMemcpyAtoH; + struct { + void* dstHost; + hipArray_t srcArray; + size_t srcOffset; + size_t ByteCount; + hipStream_t stream; + } hipMemcpyAtoHAsync; + struct { + hipArray_t dstArray; + size_t dstOffset; + hipDeviceptr_t srcDevice; + size_t ByteCount; + } hipMemcpyDtoA; struct { hipDeviceptr_t dst; hipDeviceptr_t src; @@ -2827,6 +2920,13 @@ typedef struct hip_api_data_s { const void* srcHost; size_t count; } hipMemcpyHtoA; + struct { + hipArray_t dstArray; + size_t dstOffset; + const void* srcHost; + size_t ByteCount; + hipStream_t stream; + } hipMemcpyHtoAAsync; struct { hipDeviceptr_t dst; void* src; @@ -3142,6 +3242,11 @@ typedef struct hip_api_data_s { struct { unsigned int flags; } hipSetDeviceFlags; + struct { + int* device_arr; + int device_arr__val; + int len; + } hipSetValidDevices; struct { const void* arg; size_t size; @@ -3171,6 +3276,16 @@ typedef struct hip_api_data_s { hipStream_t stream; hipStreamCaptureMode mode; } hipStreamBeginCapture; + struct { + hipStream_t stream; + hipGraph_t graph; + const hipGraphNode_t* dependencies; + hipGraphNode_t dependencies__val; + const hipGraphEdgeData* dependencyData; + hipGraphEdgeData dependencyData__val; + size_t numDependencies; + hipStreamCaptureMode mode; + } hipStreamBeginCaptureToGraph; struct { hipStream_t* stream; hipStream_t stream__val; @@ -3284,6 +3399,18 @@ typedef struct hip_api_data_s { const textureReference* texRef; textureReference texRef__val; } hipTexRefGetAddress; + struct { + hipArray_t* pArray; + hipArray_t pArray__val; + const textureReference* texRef; + textureReference texRef__val; + } hipTexRefGetArray; + struct { + float* pBorderColor; + float pBorderColor__val; + const textureReference* texRef; + textureReference texRef__val; + } hipTexRefGetBorderColor; struct { unsigned int* pFlags; unsigned int pFlags__val; @@ -3729,15 +3856,21 @@ typedef struct 
hip_api_data_s { }; // hipDrvGraphAddMemcpyNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('const HIP_MEMCPY3D*', 'copyParams'), ('hipCtx_t', 'ctx')] #define INIT_hipDrvGraphAddMemcpyNode_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipDrvGraphAddMemcpyNode.phGraphNode = (hipGraphNode_t*)phGraphNode; \ + cb_data.args.hipDrvGraphAddMemcpyNode.hGraph = (hipGraph_t)hGraph; \ + cb_data.args.hipDrvGraphAddMemcpyNode.dependencies = (const hipGraphNode_t*)dependencies; \ + cb_data.args.hipDrvGraphAddMemcpyNode.numDependencies = (size_t)numDependencies; \ + cb_data.args.hipDrvGraphAddMemcpyNode.copyParams = (const HIP_MEMCPY3D*)copyParams; \ + cb_data.args.hipDrvGraphAddMemcpyNode.ctx = (hipCtx_t)ctx; \ }; // hipDrvGraphAddMemsetNode[('hipGraphNode_t*', 'phGraphNode'), ('hipGraph_t', 'hGraph'), ('const hipGraphNode_t*', 'dependencies'), ('size_t', 'numDependencies'), ('const HIP_MEMSET_NODE_PARAMS*', 'memsetParams'), ('hipCtx_t', 'ctx')] #define INIT_hipDrvGraphAddMemsetNode_CB_ARGS_DATA(cb_data) { \ -}; -// hipDrvGraphMemcpyNodeGetParams[('hipGraphNode_t', 'hNode'), ('HIP_MEMCPY3D*', 'nodeParams')] -#define INIT_hipDrvGraphMemcpyNodeGetParams_CB_ARGS_DATA(cb_data) { \ -}; -// hipDrvGraphMemcpyNodeSetParams[('hipGraphNode_t', 'hNode'), ('const HIP_MEMCPY3D*', 'nodeParams')] -#define INIT_hipDrvGraphMemcpyNodeSetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipDrvGraphAddMemsetNode.phGraphNode = (hipGraphNode_t*)phGraphNode; \ + cb_data.args.hipDrvGraphAddMemsetNode.hGraph = (hipGraph_t)hGraph; \ + cb_data.args.hipDrvGraphAddMemsetNode.dependencies = (const hipGraphNode_t*)dependencies; \ + cb_data.args.hipDrvGraphAddMemsetNode.numDependencies = (size_t)numDependencies; \ + cb_data.args.hipDrvGraphAddMemsetNode.memsetParams = (const HIP_MEMSET_NODE_PARAMS*)memsetParams; \ + cb_data.args.hipDrvGraphAddMemsetNode.ctx = (hipCtx_t)ctx; \ }; // hipDrvMemcpy2DUnaligned[('const 
hip_Memcpy2D*', 'pCopy')] #define INIT_hipDrvMemcpy2DUnaligned_CB_ARGS_DATA(cb_data) { \ @@ -3791,6 +3924,9 @@ typedef struct hip_api_data_s { #define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \ }; +// hipExtGetLastError[] +#define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \ +}; // hipExtGetLinkTypeAndHopCount[('int', 'device1'), ('int', 'device2'), ('unsigned int*', 'linktype'), ('unsigned int*', 'hopcount')] #define INIT_hipExtGetLinkTypeAndHopCount_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipExtGetLinkTypeAndHopCount.device1 = (int)device1; \ @@ -3948,6 +4084,11 @@ typedef struct hip_api_data_s { // hipGetErrorString[] #define INIT_hipGetErrorString_CB_ARGS_DATA(cb_data) { \ }; +// hipGetFuncBySymbol[('hipFunction_t*', 'functionPtr'), ('const void*', 'symbolPtr')] +#define INIT_hipGetFuncBySymbol_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGetFuncBySymbol.functionPtr = (hipFunction_t*)functionPtr; \ + cb_data.args.hipGetFuncBySymbol.symbolPtr = (const void*)symbolPtr; \ +}; // hipGetLastError[] #define INIT_hipGetLastError_CB_ARGS_DATA(cb_data) { \ }; @@ -3957,6 +4098,14 @@ typedef struct hip_api_data_s { cb_data.args.hipGetMipmappedArrayLevel.mipmappedArray = (hipMipmappedArray_const_t)mipmappedArray; \ cb_data.args.hipGetMipmappedArrayLevel.level = (unsigned int)level; \ }; +// hipGetProcAddress[('const char*', 'symbol'), ('void**', 'pfn'), ('int', 'hipVersion'), ('uint64_t', 'flags'), ('hipDriverProcAddressQueryResult*', 'symbolStatus')] +#define INIT_hipGetProcAddress_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGetProcAddress.symbol = (symbol) ? 
strdup(symbol) : NULL; \ + cb_data.args.hipGetProcAddress.pfn = (void**)pfn; \ + cb_data.args.hipGetProcAddress.hipVersion = (int)hipVersion; \ + cb_data.args.hipGetProcAddress.flags = (uint64_t)flags; \ + cb_data.args.hipGetProcAddress.symbolStatus = (hipDriverProcAddressQueryResult*)symbolStatus; \ +}; // hipGetSymbolAddress[('void**', 'devPtr'), ('const void*', 'symbol')] #define INIT_hipGetSymbolAddress_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipGetSymbolAddress.devPtr = (void**)devPtr; \ @@ -4007,9 +4156,19 @@ typedef struct hip_api_data_s { }; // hipGraphAddExternalSemaphoresSignalNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')] #define INIT_hipGraphAddExternalSemaphoresSignalNode_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphAddExternalSemaphoresSignalNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \ + cb_data.args.hipGraphAddExternalSemaphoresSignalNode.graph = (hipGraph_t)graph; \ + cb_data.args.hipGraphAddExternalSemaphoresSignalNode.pDependencies = (const hipGraphNode_t*)pDependencies; \ + cb_data.args.hipGraphAddExternalSemaphoresSignalNode.numDependencies = (size_t)numDependencies; \ + cb_data.args.hipGraphAddExternalSemaphoresSignalNode.nodeParams = (const hipExternalSemaphoreSignalNodeParams*)nodeParams; \ }; // hipGraphAddExternalSemaphoresWaitNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')] #define INIT_hipGraphAddExternalSemaphoresWaitNode_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphAddExternalSemaphoresWaitNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \ + cb_data.args.hipGraphAddExternalSemaphoresWaitNode.graph = (hipGraph_t)graph; \ + cb_data.args.hipGraphAddExternalSemaphoresWaitNode.pDependencies = (const hipGraphNode_t*)pDependencies; 
\ + cb_data.args.hipGraphAddExternalSemaphoresWaitNode.numDependencies = (size_t)numDependencies; \ + cb_data.args.hipGraphAddExternalSemaphoresWaitNode.nodeParams = (const hipExternalSemaphoreWaitNodeParams*)nodeParams; \ }; // hipGraphAddHostNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('const hipHostNodeParams*', 'pNodeParams')] #define INIT_hipGraphAddHostNode_CB_ARGS_DATA(cb_data) { \ @@ -4094,6 +4253,14 @@ typedef struct hip_api_data_s { cb_data.args.hipGraphAddMemsetNode.numDependencies = (size_t)numDependencies; \ cb_data.args.hipGraphAddMemsetNode.pMemsetParams = (const hipMemsetParams*)pMemsetParams; \ }; +// hipGraphAddNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipGraphNodeParams*', 'nodeParams')] +#define INIT_hipGraphAddNode_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphAddNode.pGraphNode = (hipGraphNode_t*)pGraphNode; \ + cb_data.args.hipGraphAddNode.graph = (hipGraph_t)graph; \ + cb_data.args.hipGraphAddNode.pDependencies = (const hipGraphNode_t*)pDependencies; \ + cb_data.args.hipGraphAddNode.numDependencies = (size_t)numDependencies; \ + cb_data.args.hipGraphAddNode.nodeParams = (hipGraphNodeParams*)nodeParams; \ +}; // hipGraphChildGraphNodeGetGraph[('hipGraphNode_t', 'node'), ('hipGraph_t*', 'pGraph')] #define INIT_hipGraphChildGraphNodeGetGraph_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipGraphChildGraphNodeGetGraph.node = (hipGraphNode_t)node; \ @@ -4167,9 +4334,15 @@ typedef struct hip_api_data_s { }; // hipGraphExecExternalSemaphoresSignalNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')] #define INIT_hipGraphExecExternalSemaphoresSignalNodeSetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.hGraphExec = 
(hipGraphExec_t)hGraphExec; \ + cb_data.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.hNode = (hipGraphNode_t)hNode; \ + cb_data.args.hipGraphExecExternalSemaphoresSignalNodeSetParams.nodeParams = (const hipExternalSemaphoreSignalNodeParams*)nodeParams; \ }; // hipGraphExecExternalSemaphoresWaitNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')] #define INIT_hipGraphExecExternalSemaphoresWaitNodeSetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.hGraphExec = (hipGraphExec_t)hGraphExec; \ + cb_data.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.hNode = (hipGraphNode_t)hNode; \ + cb_data.args.hipGraphExecExternalSemaphoresWaitNodeSetParams.nodeParams = (const hipExternalSemaphoreWaitNodeParams*)nodeParams; \ }; // hipGraphExecHostNodeSetParams[('hipGraphExec_t', 'hGraphExec'), ('hipGraphNode_t', 'node'), ('const hipHostNodeParams*', 'pNodeParams')] #define INIT_hipGraphExecHostNodeSetParams_CB_ARGS_DATA(cb_data) { \ @@ -4233,15 +4406,23 @@ typedef struct hip_api_data_s { }; // hipGraphExternalSemaphoresSignalNodeGetParams[('hipGraphNode_t', 'hNode'), ('hipExternalSemaphoreSignalNodeParams*', 'params_out')] #define INIT_hipGraphExternalSemaphoresSignalNodeGetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphExternalSemaphoresSignalNodeGetParams.hNode = (hipGraphNode_t)hNode; \ + cb_data.args.hipGraphExternalSemaphoresSignalNodeGetParams.params_out = (hipExternalSemaphoreSignalNodeParams*)params_out; \ }; // hipGraphExternalSemaphoresSignalNodeSetParams[('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreSignalNodeParams*', 'nodeParams')] #define INIT_hipGraphExternalSemaphoresSignalNodeSetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphExternalSemaphoresSignalNodeSetParams.hNode = (hipGraphNode_t)hNode; \ + cb_data.args.hipGraphExternalSemaphoresSignalNodeSetParams.nodeParams = (const 
hipExternalSemaphoreSignalNodeParams*)nodeParams; \ }; // hipGraphExternalSemaphoresWaitNodeGetParams[('hipGraphNode_t', 'hNode'), ('hipExternalSemaphoreWaitNodeParams*', 'params_out')] #define INIT_hipGraphExternalSemaphoresWaitNodeGetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphExternalSemaphoresWaitNodeGetParams.hNode = (hipGraphNode_t)hNode; \ + cb_data.args.hipGraphExternalSemaphoresWaitNodeGetParams.params_out = (hipExternalSemaphoreWaitNodeParams*)params_out; \ }; // hipGraphExternalSemaphoresWaitNodeSetParams[('hipGraphNode_t', 'hNode'), ('const hipExternalSemaphoreWaitNodeParams*', 'nodeParams')] #define INIT_hipGraphExternalSemaphoresWaitNodeSetParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphExternalSemaphoresWaitNodeSetParams.hNode = (hipGraphNode_t)hNode; \ + cb_data.args.hipGraphExternalSemaphoresWaitNodeSetParams.nodeParams = (const hipExternalSemaphoreWaitNodeParams*)nodeParams; \ }; // hipGraphGetEdges[('hipGraph_t', 'graph'), ('hipGraphNode_t*', 'from'), ('hipGraphNode_t*', 'to'), ('size_t*', 'numEdges')] #define INIT_hipGraphGetEdges_CB_ARGS_DATA(cb_data) { \ @@ -4286,27 +4467,27 @@ typedef struct hip_api_data_s { cb_data.args.hipGraphInstantiateWithFlags.graph = (hipGraph_t)graph; \ cb_data.args.hipGraphInstantiateWithFlags.flags = (unsigned long long)flags; \ }; +// hipGraphInstantiateWithParams[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'), ('hipGraphInstantiateParams*', 'instantiateParams')] +#define INIT_hipGraphInstantiateWithParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipGraphInstantiateWithParams.pGraphExec = (hipGraphExec_t*)pGraphExec; \ + cb_data.args.hipGraphInstantiateWithParams.graph = (hipGraph_t)graph; \ + cb_data.args.hipGraphInstantiateWithParams.instantiateParams = (hipGraphInstantiateParams*)instantiateParams; \ +}; // hipGraphKernelNodeCopyAttributes[('hipGraphNode_t', 'hSrc'), ('hipGraphNode_t', 'hDst')] #define INIT_hipGraphKernelNodeCopyAttributes_CB_ARGS_DATA(cb_data) { \ 
cb_data.args.hipGraphKernelNodeCopyAttributes.hSrc = (hipGraphNode_t)hSrc; \ cb_data.args.hipGraphKernelNodeCopyAttributes.hDst = (hipGraphNode_t)hDst; \ }; -// hipGraphKernelNodeGetAttribute[('hipGraphNode_t', 'hNode'), ('hipKernelNodeAttrID', 'attr'), ('hipKernelNodeAttrValue*', 'value')] +// hipGraphKernelNodeGetAttribute[('hipGraphNode_t', 'hNode'), ('hipLaunchAttributeID', 'attr'), ('hipLaunchAttributeValue*', 'value')] #define INIT_hipGraphKernelNodeGetAttribute_CB_ARGS_DATA(cb_data) { \ - cb_data.args.hipGraphKernelNodeGetAttribute.hNode = (hipGraphNode_t)hNode; \ - cb_data.args.hipGraphKernelNodeGetAttribute.attr = (hipKernelNodeAttrID)attr; \ - cb_data.args.hipGraphKernelNodeGetAttribute.value = (hipKernelNodeAttrValue*)value; \ }; // hipGraphKernelNodeGetParams[('hipGraphNode_t', 'node'), ('hipKernelNodeParams*', 'pNodeParams')] #define INIT_hipGraphKernelNodeGetParams_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipGraphKernelNodeGetParams.node = (hipGraphNode_t)node; \ cb_data.args.hipGraphKernelNodeGetParams.pNodeParams = (hipKernelNodeParams*)pNodeParams; \ }; -// hipGraphKernelNodeSetAttribute[('hipGraphNode_t', 'hNode'), ('hipKernelNodeAttrID', 'attr'), ('const hipKernelNodeAttrValue*', 'value')] +// hipGraphKernelNodeSetAttribute[('hipGraphNode_t', 'hNode'), ('hipLaunchAttributeID', 'attr'), ('const hipLaunchAttributeValue*', 'value')] #define INIT_hipGraphKernelNodeSetAttribute_CB_ARGS_DATA(cb_data) { \ - cb_data.args.hipGraphKernelNodeSetAttribute.hNode = (hipGraphNode_t)hNode; \ - cb_data.args.hipGraphKernelNodeSetAttribute.attr = (hipKernelNodeAttrID)attr; \ - cb_data.args.hipGraphKernelNodeSetAttribute.value = (const hipKernelNodeAttrValue*)value; \ }; // hipGraphKernelNodeSetParams[('hipGraphNode_t', 'node'), ('const hipKernelNodeParams*', 'pNodeParams')] #define INIT_hipGraphKernelNodeSetParams_CB_ARGS_DATA(cb_data) { \ @@ -4891,6 +5072,18 @@ typedef struct hip_api_data_s { cb_data.args.hipMemcpy2D.height = (size_t)height; \ 
cb_data.args.hipMemcpy2D.kind = (hipMemcpyKind)kind; \ }; +// hipMemcpy2DArrayToArray[('hipArray_t', 'dst'), ('size_t', 'wOffsetDst'), ('size_t', 'hOffsetDst'), ('hipArray_const_t', 'src'), ('size_t', 'wOffsetSrc'), ('size_t', 'hOffsetSrc'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')] +#define INIT_hipMemcpy2DArrayToArray_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipMemcpy2DArrayToArray.dst = (hipArray_t)dst; \ + cb_data.args.hipMemcpy2DArrayToArray.wOffsetDst = (size_t)wOffsetDst; \ + cb_data.args.hipMemcpy2DArrayToArray.hOffsetDst = (size_t)hOffsetDst; \ + cb_data.args.hipMemcpy2DArrayToArray.src = (hipArray_const_t)src; \ + cb_data.args.hipMemcpy2DArrayToArray.wOffsetSrc = (size_t)wOffsetSrc; \ + cb_data.args.hipMemcpy2DArrayToArray.hOffsetSrc = (size_t)hOffsetSrc; \ + cb_data.args.hipMemcpy2DArrayToArray.width = (size_t)width; \ + cb_data.args.hipMemcpy2DArrayToArray.height = (size_t)height; \ + cb_data.args.hipMemcpy2DArrayToArray.kind = (hipMemcpyKind)kind; \ +}; // hipMemcpy2DAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')] #define INIT_hipMemcpy2DAsync_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipMemcpy2DAsync.dst = (void*)dst; \ @@ -4965,6 +5158,21 @@ typedef struct hip_api_data_s { cb_data.args.hipMemcpyAsync.kind = (hipMemcpyKind)kind; \ cb_data.args.hipMemcpyAsync.stream = (hipStream_t)stream; \ }; +// hipMemcpyAtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')] +#define INIT_hipMemcpyAtoA_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipMemcpyAtoA.dstArray = (hipArray_t)dstArray; \ + cb_data.args.hipMemcpyAtoA.dstOffset = (size_t)dstOffset; \ + cb_data.args.hipMemcpyAtoA.srcArray = (hipArray_t)srcArray; \ + cb_data.args.hipMemcpyAtoA.srcOffset = (size_t)srcOffset; \ + cb_data.args.hipMemcpyAtoA.ByteCount = 
(size_t)ByteCount; \ +}; +// hipMemcpyAtoD[('hipDeviceptr_t', 'dstDevice'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')] +#define INIT_hipMemcpyAtoD_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipMemcpyAtoD.dstDevice = (hipDeviceptr_t)dstDevice; \ + cb_data.args.hipMemcpyAtoD.srcArray = (hipArray_t)srcArray; \ + cb_data.args.hipMemcpyAtoD.srcOffset = (size_t)srcOffset; \ + cb_data.args.hipMemcpyAtoD.ByteCount = (size_t)ByteCount; \ +}; // hipMemcpyAtoH[('void*', 'dst'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'count')] #define INIT_hipMemcpyAtoH_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipMemcpyAtoH.dst = (void*)dstHost; \ @@ -4972,6 +5180,21 @@ typedef struct hip_api_data_s { cb_data.args.hipMemcpyAtoH.srcOffset = (size_t)srcOffset; \ cb_data.args.hipMemcpyAtoH.count = (size_t)ByteCount; \ }; +// hipMemcpyAtoHAsync[('void*', 'dstHost'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')] +#define INIT_hipMemcpyAtoHAsync_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipMemcpyAtoHAsync.dstHost = (void*)dstHost; \ + cb_data.args.hipMemcpyAtoHAsync.srcArray = (hipArray_t)srcArray; \ + cb_data.args.hipMemcpyAtoHAsync.srcOffset = (size_t)srcOffset; \ + cb_data.args.hipMemcpyAtoHAsync.ByteCount = (size_t)ByteCount; \ + cb_data.args.hipMemcpyAtoHAsync.stream = (hipStream_t)stream; \ +}; +// hipMemcpyDtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipDeviceptr_t', 'srcDevice'), ('size_t', 'ByteCount')] +#define INIT_hipMemcpyDtoA_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipMemcpyDtoA.dstArray = (hipArray_t)dstArray; \ + cb_data.args.hipMemcpyDtoA.dstOffset = (size_t)dstOffset; \ + cb_data.args.hipMemcpyDtoA.srcDevice = (hipDeviceptr_t)srcDevice; \ + cb_data.args.hipMemcpyDtoA.ByteCount = (size_t)ByteCount; \ +}; // hipMemcpyDtoD[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')] #define INIT_hipMemcpyDtoD_CB_ARGS_DATA(cb_data) { \ 
cb_data.args.hipMemcpyDtoD.dst = (hipDeviceptr_t)dstDevice; \ @@ -5031,6 +5254,14 @@ typedef struct hip_api_data_s { cb_data.args.hipMemcpyHtoA.srcHost = (const void*)srcHost; \ cb_data.args.hipMemcpyHtoA.count = (size_t)ByteCount; \ }; +// hipMemcpyHtoAAsync[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')] +#define INIT_hipMemcpyHtoAAsync_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipMemcpyHtoAAsync.dstArray = (hipArray_t)dstArray; \ + cb_data.args.hipMemcpyHtoAAsync.dstOffset = (size_t)dstOffset; \ + cb_data.args.hipMemcpyHtoAAsync.srcHost = (const void*)srcHost; \ + cb_data.args.hipMemcpyHtoAAsync.ByteCount = (size_t)ByteCount; \ + cb_data.args.hipMemcpyHtoAAsync.stream = (hipStream_t)stream; \ +}; // hipMemcpyHtoD[('hipDeviceptr_t', 'dst'), ('void*', 'src'), ('size_t', 'sizeBytes')] #define INIT_hipMemcpyHtoD_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipMemcpyHtoD.dst = (hipDeviceptr_t)dstDevice; \ @@ -5369,6 +5600,11 @@ typedef struct hip_api_data_s { #define INIT_hipSetDeviceFlags_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipSetDeviceFlags.flags = (unsigned int)flags; \ }; +// hipSetValidDevices[('int*', 'device_arr'), ('int', 'len')] +#define INIT_hipSetValidDevices_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipSetValidDevices.device_arr = (int*)device_arr; \ + cb_data.args.hipSetValidDevices.len = (int)len; \ +}; // hipSetupArgument[('const void*', 'arg'), ('size_t', 'size'), ('size_t', 'offset')] #define INIT_hipSetupArgument_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipSetupArgument.arg = (const void*)arg; \ @@ -5401,6 +5637,15 @@ typedef struct hip_api_data_s { cb_data.args.hipStreamBeginCapture.stream = (hipStream_t)stream; \ cb_data.args.hipStreamBeginCapture.mode = (hipStreamCaptureMode)mode; \ }; +// hipStreamBeginCaptureToGraph[('hipStream_t', 'stream'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'dependencies'), ('const hipGraphEdgeData*', 'dependencyData'), ('size_t', 
'numDependencies'), ('hipStreamCaptureMode', 'mode')] +#define INIT_hipStreamBeginCaptureToGraph_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipStreamBeginCaptureToGraph.stream = (hipStream_t)stream; \ + cb_data.args.hipStreamBeginCaptureToGraph.graph = (hipGraph_t)graph; \ + cb_data.args.hipStreamBeginCaptureToGraph.dependencies = (const hipGraphNode_t*)dependencies; \ + cb_data.args.hipStreamBeginCaptureToGraph.dependencyData = (const hipGraphEdgeData*)dependencyData; \ + cb_data.args.hipStreamBeginCaptureToGraph.numDependencies = (size_t)numDependencies; \ + cb_data.args.hipStreamBeginCaptureToGraph.mode = (hipStreamCaptureMode)mode; \ +}; // hipStreamCreate[('hipStream_t*', 'stream')] #define INIT_hipStreamCreate_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipStreamCreate.stream = (hipStream_t*)stream; \ @@ -5516,6 +5761,16 @@ typedef struct hip_api_data_s { cb_data.args.hipTexRefGetAddress.dev_ptr = (hipDeviceptr_t*)dptr; \ cb_data.args.hipTexRefGetAddress.texRef = (const textureReference*)texRef; \ }; +// hipTexRefGetArray[('hipArray_t*', 'pArray'), ('const textureReference*', 'texRef')] +#define INIT_hipTexRefGetArray_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipTexRefGetArray.pArray = (hipArray_t*)pArray; \ + cb_data.args.hipTexRefGetArray.texRef = (const textureReference*)texRef; \ +}; +// hipTexRefGetBorderColor[('float*', 'pBorderColor'), ('const textureReference*', 'texRef')] +#define INIT_hipTexRefGetBorderColor_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipTexRefGetBorderColor.pBorderColor = (float*)pBorderColor; \ + cb_data.args.hipTexRefGetBorderColor.texRef = (const textureReference*)texRef; \ +}; // hipTexRefGetFlags[('unsigned int*', 'pFlags'), ('const textureReference*', 'texRef')] #define INIT_hipTexRefGetFlags_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipTexRefGetFlags.pFlags = (unsigned int*)pFlags; \ @@ -5534,6 +5789,8 @@ typedef struct hip_api_data_s { }; // hipTexRefGetMipMappedArray[('hipMipmappedArray_t*', 'pArray'), ('const textureReference*', 
'texRef')] #define INIT_hipTexRefGetMipMappedArray_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipTexRefGetMipMappedArray.pArray = (hipMipmappedArray_t*)pArray; \ + cb_data.args.hipTexRefGetMipMappedArray.texRef = (const textureReference*)texRef; \ }; // hipTexRefGetMipmapLevelBias[('float*', 'pbias'), ('const textureReference*', 'texRef')] #define INIT_hipTexRefGetMipmapLevelBias_CB_ARGS_DATA(cb_data) { \ @@ -5633,9 +5890,6 @@ typedef struct hip_api_data_s { cb_data.args.hipWaitExternalSemaphoresAsync.numExtSems = (unsigned int)numExtSems; \ cb_data.args.hipWaitExternalSemaphoresAsync.stream = (hipStream_t)stream; \ }; -// hipExtGetLastError[] -#define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \ -}; #define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data) // Macros for non-public API primitives @@ -5663,20 +5917,6 @@ typedef struct hip_api_data_s { #define INIT_hipGetTextureObjectTextureDesc_CB_ARGS_DATA(cb_data) {}; // hipGetTextureReference() #define INIT_hipGetTextureReference_CB_ARGS_DATA(cb_data) {}; -// hipMemcpy2DArrayToArray() -#define INIT_hipMemcpy2DArrayToArray_CB_ARGS_DATA(cb_data) {}; -// hipMemcpyAtoA() -#define INIT_hipMemcpyAtoA_CB_ARGS_DATA(cb_data) {}; -// hipMemcpyAtoD() -#define INIT_hipMemcpyAtoD_CB_ARGS_DATA(cb_data) {}; -// hipMemcpyAtoHAsync() -#define INIT_hipMemcpyAtoHAsync_CB_ARGS_DATA(cb_data) {}; -// hipMemcpyDtoA() -#define INIT_hipMemcpyDtoA_CB_ARGS_DATA(cb_data) {}; -// hipMemcpyHtoAAsync() -#define INIT_hipMemcpyHtoAAsync_CB_ARGS_DATA(cb_data) {}; -// hipSetValidDevices() -#define INIT_hipSetValidDevices_CB_ARGS_DATA(cb_data) {}; // hipTexObjectCreate() #define INIT_hipTexObjectCreate_CB_ARGS_DATA(cb_data) {}; // hipTexObjectDestroy() @@ -5689,16 +5929,10 @@ typedef struct hip_api_data_s { #define INIT_hipTexObjectGetTextureDesc_CB_ARGS_DATA(cb_data) {}; // hipTexRefGetAddressMode() #define INIT_hipTexRefGetAddressMode_CB_ARGS_DATA(cb_data) {}; -// hipTexRefGetArray() -#define 
INIT_hipTexRefGetArray_CB_ARGS_DATA(cb_data) {}; -// hipTexRefGetBorderColor() -#define INIT_hipTexRefGetBorderColor_CB_ARGS_DATA(cb_data) {}; // hipTexRefGetFilterMode() #define INIT_hipTexRefGetFilterMode_CB_ARGS_DATA(cb_data) {}; // hipTexRefGetMipmapFilterMode() #define INIT_hipTexRefGetMipmapFilterMode_CB_ARGS_DATA(cb_data) {}; -// hipTexRefGetMipmappedArray() -#define INIT_hipTexRefGetMipmappedArray_CB_ARGS_DATA(cb_data) {}; // hipTexRefSetAddressMode() #define INIT_hipTexRefSetAddressMode_CB_ARGS_DATA(cb_data) {}; // hipTexRefSetFilterMode() @@ -5968,14 +6202,6 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipDrvGraphAddMemsetNode.dependencies) data->args.hipDrvGraphAddMemsetNode.dependencies__val = *(data->args.hipDrvGraphAddMemsetNode.dependencies); if (data->args.hipDrvGraphAddMemsetNode.memsetParams) data->args.hipDrvGraphAddMemsetNode.memsetParams__val = *(data->args.hipDrvGraphAddMemsetNode.memsetParams); break; -// hipDrvGraphMemcpyNodeGetParams[('hipGraphNode_t', 'hNode'), ('HIP_MEMCPY3D*', 'nodeParams')] - case HIP_API_ID_hipDrvGraphMemcpyNodeGetParams: - if (data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams) data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams__val = *(data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams); - break; -// hipDrvGraphMemcpyNodeSetParams[('hipGraphNode_t', 'hNode'), ('const HIP_MEMCPY3D*', 'nodeParams')] - case HIP_API_ID_hipDrvGraphMemcpyNodeSetParams: - if (data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams) data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams__val = *(data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams); - break; // hipDrvMemcpy2DUnaligned[('const hip_Memcpy2D*', 'pCopy')] case HIP_API_ID_hipDrvMemcpy2DUnaligned: if (data->args.hipDrvMemcpy2DUnaligned.pCopy) data->args.hipDrvMemcpy2DUnaligned.pCopy__val = *(data->args.hipDrvMemcpy2DUnaligned.pCopy); @@ -6017,6 +6243,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // 
hipEventSynchronize[('hipEvent_t', 'event')] case HIP_API_ID_hipEventSynchronize: break; +// hipExtGetLastError[] + case HIP_API_ID_hipExtGetLastError: + break; // hipExtGetLinkTypeAndHopCount[('int', 'device1'), ('int', 'device2'), ('unsigned int*', 'linktype'), ('unsigned int*', 'hopcount')] case HIP_API_ID_hipExtGetLinkTypeAndHopCount: if (data->args.hipExtGetLinkTypeAndHopCount.linktype) data->args.hipExtGetLinkTypeAndHopCount.linktype__val = *(data->args.hipExtGetLinkTypeAndHopCount.linktype); @@ -6122,16 +6351,23 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipGetErrorString[] case HIP_API_ID_hipGetErrorString: break; +// hipGetFuncBySymbol[('hipFunction_t*', 'functionPtr'), ('const void*', 'symbolPtr')] + case HIP_API_ID_hipGetFuncBySymbol: + if (data->args.hipGetFuncBySymbol.functionPtr) data->args.hipGetFuncBySymbol.functionPtr__val = *(data->args.hipGetFuncBySymbol.functionPtr); + break; // hipGetLastError[] case HIP_API_ID_hipGetLastError: break; -// hipExtGetLastError[] - case HIP_API_ID_hipExtGetLastError: - break; // hipGetMipmappedArrayLevel[('hipArray_t*', 'levelArray'), ('hipMipmappedArray_const_t', 'mipmappedArray'), ('unsigned int', 'level')] case HIP_API_ID_hipGetMipmappedArrayLevel: if (data->args.hipGetMipmappedArrayLevel.levelArray) data->args.hipGetMipmappedArrayLevel.levelArray__val = *(data->args.hipGetMipmappedArrayLevel.levelArray); break; +// hipGetProcAddress[('const char*', 'symbol'), ('void**', 'pfn'), ('int', 'hipVersion'), ('uint64_t', 'flags'), ('hipDriverProcAddressQueryResult*', 'symbolStatus')] + case HIP_API_ID_hipGetProcAddress: + if (data->args.hipGetProcAddress.symbol) data->args.hipGetProcAddress.symbol__val = *(data->args.hipGetProcAddress.symbol); + if (data->args.hipGetProcAddress.pfn) data->args.hipGetProcAddress.pfn__val = *(data->args.hipGetProcAddress.pfn); + if (data->args.hipGetProcAddress.symbolStatus) data->args.hipGetProcAddress.symbolStatus__val = 
*(data->args.hipGetProcAddress.symbolStatus); + break; // hipGetSymbolAddress[('void**', 'devPtr'), ('const void*', 'symbol')] case HIP_API_ID_hipGetSymbolAddress: if (data->args.hipGetSymbolAddress.devPtr) data->args.hipGetSymbolAddress.devPtr__val = *(data->args.hipGetSymbolAddress.devPtr); @@ -6227,6 +6463,12 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipGraphAddMemsetNode.pDependencies) data->args.hipGraphAddMemsetNode.pDependencies__val = *(data->args.hipGraphAddMemsetNode.pDependencies); if (data->args.hipGraphAddMemsetNode.pMemsetParams) data->args.hipGraphAddMemsetNode.pMemsetParams__val = *(data->args.hipGraphAddMemsetNode.pMemsetParams); break; +// hipGraphAddNode[('hipGraphNode_t*', 'pGraphNode'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'pDependencies'), ('size_t', 'numDependencies'), ('hipGraphNodeParams*', 'nodeParams')] + case HIP_API_ID_hipGraphAddNode: + if (data->args.hipGraphAddNode.pGraphNode) data->args.hipGraphAddNode.pGraphNode__val = *(data->args.hipGraphAddNode.pGraphNode); + if (data->args.hipGraphAddNode.pDependencies) data->args.hipGraphAddNode.pDependencies__val = *(data->args.hipGraphAddNode.pDependencies); + if (data->args.hipGraphAddNode.nodeParams) data->args.hipGraphAddNode.nodeParams__val = *(data->args.hipGraphAddNode.nodeParams); + break; // hipGraphChildGraphNodeGetGraph[('hipGraphNode_t', 'node'), ('hipGraph_t*', 'pGraph')] case HIP_API_ID_hipGraphChildGraphNodeGetGraph: if (data->args.hipGraphChildGraphNodeGetGraph.pGraph) data->args.hipGraphChildGraphNodeGetGraph.pGraph__val = *(data->args.hipGraphChildGraphNodeGetGraph.pGraph); @@ -6363,10 +6605,15 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { case HIP_API_ID_hipGraphInstantiateWithFlags: if (data->args.hipGraphInstantiateWithFlags.pGraphExec) data->args.hipGraphInstantiateWithFlags.pGraphExec__val = *(data->args.hipGraphInstantiateWithFlags.pGraphExec); break; +// 
hipGraphInstantiateWithParams[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'), ('hipGraphInstantiateParams*', 'instantiateParams')] + case HIP_API_ID_hipGraphInstantiateWithParams: + if (data->args.hipGraphInstantiateWithParams.pGraphExec) data->args.hipGraphInstantiateWithParams.pGraphExec__val = *(data->args.hipGraphInstantiateWithParams.pGraphExec); + if (data->args.hipGraphInstantiateWithParams.instantiateParams) data->args.hipGraphInstantiateWithParams.instantiateParams__val = *(data->args.hipGraphInstantiateWithParams.instantiateParams); + break; // hipGraphKernelNodeCopyAttributes[('hipGraphNode_t', 'hSrc'), ('hipGraphNode_t', 'hDst')] case HIP_API_ID_hipGraphKernelNodeCopyAttributes: break; -// hipGraphKernelNodeGetAttribute[('hipGraphNode_t', 'hNode'), ('hipKernelNodeAttrID', 'attr'), ('hipKernelNodeAttrValue*', 'value')] +// hipGraphKernelNodeGetAttribute[('hipGraphNode_t', 'hNode'), ('hipLaunchAttributeID', 'attr'), ('hipLaunchAttributeValue*', 'value')] case HIP_API_ID_hipGraphKernelNodeGetAttribute: if (data->args.hipGraphKernelNodeGetAttribute.value) data->args.hipGraphKernelNodeGetAttribute.value__val = *(data->args.hipGraphKernelNodeGetAttribute.value); break; @@ -6374,7 +6621,7 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { case HIP_API_ID_hipGraphKernelNodeGetParams: if (data->args.hipGraphKernelNodeGetParams.pNodeParams) data->args.hipGraphKernelNodeGetParams.pNodeParams__val = *(data->args.hipGraphKernelNodeGetParams.pNodeParams); break; -// hipGraphKernelNodeSetAttribute[('hipGraphNode_t', 'hNode'), ('hipKernelNodeAttrID', 'attr'), ('const hipKernelNodeAttrValue*', 'value')] +// hipGraphKernelNodeSetAttribute[('hipGraphNode_t', 'hNode'), ('hipLaunchAttributeID', 'attr'), ('const hipLaunchAttributeValue*', 'value')] case HIP_API_ID_hipGraphKernelNodeSetAttribute: if (data->args.hipGraphKernelNodeSetAttribute.value) data->args.hipGraphKernelNodeSetAttribute.value__val = 
*(data->args.hipGraphKernelNodeSetAttribute.value); break; @@ -6748,6 +6995,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipMemcpy2D[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')] case HIP_API_ID_hipMemcpy2D: break; +// hipMemcpy2DArrayToArray[('hipArray_t', 'dst'), ('size_t', 'wOffsetDst'), ('size_t', 'hOffsetDst'), ('hipArray_const_t', 'src'), ('size_t', 'wOffsetSrc'), ('size_t', 'hOffsetSrc'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind')] + case HIP_API_ID_hipMemcpy2DArrayToArray: + break; // hipMemcpy2DAsync[('void*', 'dst'), ('size_t', 'dpitch'), ('const void*', 'src'), ('size_t', 'spitch'), ('size_t', 'width'), ('size_t', 'height'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')] case HIP_API_ID_hipMemcpy2DAsync: break; @@ -6774,9 +7024,21 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipMemcpyAsync[('void*', 'dst'), ('const void*', 'src'), ('size_t', 'sizeBytes'), ('hipMemcpyKind', 'kind'), ('hipStream_t', 'stream')] case HIP_API_ID_hipMemcpyAsync: break; +// hipMemcpyAtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')] + case HIP_API_ID_hipMemcpyAtoA: + break; +// hipMemcpyAtoD[('hipDeviceptr_t', 'dstDevice'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount')] + case HIP_API_ID_hipMemcpyAtoD: + break; // hipMemcpyAtoH[('void*', 'dst'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'count')] case HIP_API_ID_hipMemcpyAtoH: break; +// hipMemcpyAtoHAsync[('void*', 'dstHost'), ('hipArray_t', 'srcArray'), ('size_t', 'srcOffset'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')] + case HIP_API_ID_hipMemcpyAtoHAsync: + break; +// hipMemcpyDtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('hipDeviceptr_t', 'srcDevice'), 
('size_t', 'ByteCount')] + case HIP_API_ID_hipMemcpyDtoA: + break; // hipMemcpyDtoD[('hipDeviceptr_t', 'dst'), ('hipDeviceptr_t', 'src'), ('size_t', 'sizeBytes')] case HIP_API_ID_hipMemcpyDtoD: break; @@ -6801,6 +7063,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipMemcpyHtoA[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'count')] case HIP_API_ID_hipMemcpyHtoA: break; +// hipMemcpyHtoAAsync[('hipArray_t', 'dstArray'), ('size_t', 'dstOffset'), ('const void*', 'srcHost'), ('size_t', 'ByteCount'), ('hipStream_t', 'stream')] + case HIP_API_ID_hipMemcpyHtoAAsync: + break; // hipMemcpyHtoD[('hipDeviceptr_t', 'dst'), ('void*', 'src'), ('size_t', 'sizeBytes')] case HIP_API_ID_hipMemcpyHtoD: break; @@ -6988,6 +7253,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipSetDeviceFlags[('unsigned int', 'flags')] case HIP_API_ID_hipSetDeviceFlags: break; +// hipSetValidDevices[('int*', 'device_arr'), ('int', 'len')] + case HIP_API_ID_hipSetValidDevices: + if (data->args.hipSetValidDevices.device_arr) data->args.hipSetValidDevices.device_arr__val = *(data->args.hipSetValidDevices.device_arr); + break; // hipSetupArgument[('const void*', 'arg'), ('size_t', 'size'), ('size_t', 'offset')] case HIP_API_ID_hipSetupArgument: break; @@ -7005,6 +7274,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipStreamBeginCapture[('hipStream_t', 'stream'), ('hipStreamCaptureMode', 'mode')] case HIP_API_ID_hipStreamBeginCapture: break; +// hipStreamBeginCaptureToGraph[('hipStream_t', 'stream'), ('hipGraph_t', 'graph'), ('const hipGraphNode_t*', 'dependencies'), ('const hipGraphEdgeData*', 'dependencyData'), ('size_t', 'numDependencies'), ('hipStreamCaptureMode', 'mode')] + case HIP_API_ID_hipStreamBeginCaptureToGraph: + if (data->args.hipStreamBeginCaptureToGraph.dependencies) data->args.hipStreamBeginCaptureToGraph.dependencies__val = 
*(data->args.hipStreamBeginCaptureToGraph.dependencies); + if (data->args.hipStreamBeginCaptureToGraph.dependencyData) data->args.hipStreamBeginCaptureToGraph.dependencyData__val = *(data->args.hipStreamBeginCaptureToGraph.dependencyData); + break; // hipStreamCreate[('hipStream_t*', 'stream')] case HIP_API_ID_hipStreamCreate: if (data->args.hipStreamCreate.stream) data->args.hipStreamCreate.stream__val = *(data->args.hipStreamCreate.stream); @@ -7083,6 +7357,16 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipTexRefGetAddress.dev_ptr) data->args.hipTexRefGetAddress.dev_ptr__val = *(data->args.hipTexRefGetAddress.dev_ptr); if (data->args.hipTexRefGetAddress.texRef) data->args.hipTexRefGetAddress.texRef__val = *(data->args.hipTexRefGetAddress.texRef); break; +// hipTexRefGetArray[('hipArray_t*', 'pArray'), ('const textureReference*', 'texRef')] + case HIP_API_ID_hipTexRefGetArray: + if (data->args.hipTexRefGetArray.pArray) data->args.hipTexRefGetArray.pArray__val = *(data->args.hipTexRefGetArray.pArray); + if (data->args.hipTexRefGetArray.texRef) data->args.hipTexRefGetArray.texRef__val = *(data->args.hipTexRefGetArray.texRef); + break; +// hipTexRefGetBorderColor[('float*', 'pBorderColor'), ('const textureReference*', 'texRef')] + case HIP_API_ID_hipTexRefGetBorderColor: + if (data->args.hipTexRefGetBorderColor.pBorderColor) data->args.hipTexRefGetBorderColor.pBorderColor__val = *(data->args.hipTexRefGetBorderColor.pBorderColor); + if (data->args.hipTexRefGetBorderColor.texRef) data->args.hipTexRefGetBorderColor.texRef__val = *(data->args.hipTexRefGetBorderColor.texRef); + break; // hipTexRefGetFlags[('unsigned int*', 'pFlags'), ('const textureReference*', 'texRef')] case HIP_API_ID_hipTexRefGetFlags: if (data->args.hipTexRefGetFlags.pFlags) data->args.hipTexRefGetFlags.pFlags__val = *(data->args.hipTexRefGetFlags.pFlags); @@ -7636,20 +7920,6 @@ static inline const char* hipApiString(hip_api_id_t id, const 
hip_api_data_t* da oss << ", ctx="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphAddMemsetNode.ctx); oss << ")"; break; - case HIP_API_ID_hipDrvGraphMemcpyNodeGetParams: - oss << "hipDrvGraphMemcpyNodeGetParams("; - oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeGetParams.hNode); - if (data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams == NULL) oss << ", nodeParams=NULL"; - else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeGetParams.nodeParams__val); } - oss << ")"; - break; - case HIP_API_ID_hipDrvGraphMemcpyNodeSetParams: - oss << "hipDrvGraphMemcpyNodeSetParams("; - oss << "hNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeSetParams.hNode); - if (data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams == NULL) oss << ", nodeParams=NULL"; - else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipDrvGraphMemcpyNodeSetParams.nodeParams__val); } - oss << ")"; - break; case HIP_API_ID_hipDrvMemcpy2DUnaligned: oss << "hipDrvMemcpy2DUnaligned("; if (data->args.hipDrvMemcpy2DUnaligned.pCopy == NULL) oss << "pCopy=NULL"; @@ -7721,6 +7991,10 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event); oss << ")"; break; + case HIP_API_ID_hipExtGetLastError: + oss << "hipExtGetLastError("; + oss << ")"; + break; case HIP_API_ID_hipExtGetLinkTypeAndHopCount: oss << "hipExtGetLinkTypeAndHopCount("; oss << "device1="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtGetLinkTypeAndHopCount.device1); @@ -7929,12 +8203,15 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << "hipGetErrorString("; oss << ")"; break; - case HIP_API_ID_hipGetLastError: - oss << "hipGetLastError("; + case 
HIP_API_ID_hipGetFuncBySymbol: + oss << "hipGetFuncBySymbol("; + if (data->args.hipGetFuncBySymbol.functionPtr == NULL) oss << "functionPtr=NULL"; + else { oss << "functionPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetFuncBySymbol.functionPtr__val); } + oss << ", symbolPtr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetFuncBySymbol.symbolPtr); oss << ")"; break; - case HIP_API_ID_hipExtGetLastError: - oss << "hipExtGetLastError("; + case HIP_API_ID_hipGetLastError: + oss << "hipGetLastError("; oss << ")"; break; case HIP_API_ID_hipGetMipmappedArrayLevel: @@ -7945,6 +8222,18 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetMipmappedArrayLevel.level); oss << ")"; break; + case HIP_API_ID_hipGetProcAddress: + oss << "hipGetProcAddress("; + if (data->args.hipGetProcAddress.symbol == NULL) oss << "symbol=NULL"; + else { oss << "symbol="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.symbol__val); } + if (data->args.hipGetProcAddress.pfn == NULL) oss << ", pfn=NULL"; + else { oss << ", pfn="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.pfn__val); } + oss << ", hipVersion="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.hipVersion); + oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.flags); + if (data->args.hipGetProcAddress.symbolStatus == NULL) oss << ", symbolStatus=NULL"; + else { oss << ", symbolStatus="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGetProcAddress.symbolStatus__val); } + oss << ")"; + break; case HIP_API_ID_hipGetSymbolAddress: oss << "hipGetSymbolAddress("; if (data->args.hipGetSymbolAddress.devPtr == NULL) oss << "devPtr=NULL"; @@ -8151,6 +8440,18 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da 
else { oss << ", pMemsetParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddMemsetNode.pMemsetParams__val); } oss << ")"; break; + case HIP_API_ID_hipGraphAddNode: + oss << "hipGraphAddNode("; + if (data->args.hipGraphAddNode.pGraphNode == NULL) oss << "pGraphNode=NULL"; + else { oss << "pGraphNode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.pGraphNode__val); } + oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.graph); + if (data->args.hipGraphAddNode.pDependencies == NULL) oss << ", pDependencies=NULL"; + else { oss << ", pDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.pDependencies__val); } + oss << ", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.numDependencies); + if (data->args.hipGraphAddNode.nodeParams == NULL) oss << ", nodeParams=NULL"; + else { oss << ", nodeParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphAddNode.nodeParams__val); } + oss << ")"; + break; case HIP_API_ID_hipGraphChildGraphNodeGetGraph: oss << "hipGraphChildGraphNodeGetGraph("; oss << "node="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphChildGraphNodeGetGraph.node); @@ -8423,6 +8724,15 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithFlags.flags); oss << ")"; break; + case HIP_API_ID_hipGraphInstantiateWithParams: + oss << "hipGraphInstantiateWithParams("; + if (data->args.hipGraphInstantiateWithParams.pGraphExec == NULL) oss << "pGraphExec=NULL"; + else { oss << "pGraphExec="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithParams.pGraphExec__val); } + oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithParams.graph); + if 
(data->args.hipGraphInstantiateWithParams.instantiateParams == NULL) oss << ", instantiateParams=NULL"; + else { oss << ", instantiateParams="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphInstantiateWithParams.instantiateParams__val); } + oss << ")"; + break; case HIP_API_ID_hipGraphKernelNodeCopyAttributes: oss << "hipGraphKernelNodeCopyAttributes("; oss << "hSrc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipGraphKernelNodeCopyAttributes.hSrc); @@ -9215,6 +9525,19 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2D.kind); oss << ")"; break; + case HIP_API_ID_hipMemcpy2DArrayToArray: + oss << "hipMemcpy2DArrayToArray("; + oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.dst); + oss << ", wOffsetDst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.wOffsetDst); + oss << ", hOffsetDst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.hOffsetDst); + oss << ", src="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.src); + oss << ", wOffsetSrc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.wOffsetSrc); + oss << ", hOffsetSrc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.hOffsetSrc); + oss << ", width="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.width); + oss << ", height="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.height); + oss << ", kind="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpy2DArrayToArray.kind); + oss << ")"; + break; case HIP_API_ID_hipMemcpy2DAsync: oss << "hipMemcpy2DAsync("; oss << "dst="; roctracer::hip_support::detail::operator<<(oss, 
data->args.hipMemcpy2DAsync.dst); @@ -9299,6 +9622,23 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAsync.stream); oss << ")"; break; + case HIP_API_ID_hipMemcpyAtoA: + oss << "hipMemcpyAtoA("; + oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.dstArray); + oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.dstOffset); + oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.srcArray); + oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.srcOffset); + oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoA.ByteCount); + oss << ")"; + break; + case HIP_API_ID_hipMemcpyAtoD: + oss << "hipMemcpyAtoD("; + oss << "dstDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.dstDevice); + oss << ", srcArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.srcArray); + oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.srcOffset); + oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoD.ByteCount); + oss << ")"; + break; case HIP_API_ID_hipMemcpyAtoH: oss << "hipMemcpyAtoH("; oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoH.dst); @@ -9307,6 +9647,23 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoH.count); oss << ")"; break; + case HIP_API_ID_hipMemcpyAtoHAsync: + oss << "hipMemcpyAtoHAsync("; + oss << "dstHost="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.dstHost); + oss << ", srcArray="; 
roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.srcArray); + oss << ", srcOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.srcOffset); + oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.ByteCount); + oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyAtoHAsync.stream); + oss << ")"; + break; + case HIP_API_ID_hipMemcpyDtoA: + oss << "hipMemcpyDtoA("; + oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.dstArray); + oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.dstOffset); + oss << ", srcDevice="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.srcDevice); + oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoA.ByteCount); + oss << ")"; + break; case HIP_API_ID_hipMemcpyDtoD: oss << "hipMemcpyDtoD("; oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyDtoD.dst); @@ -9374,6 +9731,15 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", count="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoA.count); oss << ")"; break; + case HIP_API_ID_hipMemcpyHtoAAsync: + oss << "hipMemcpyHtoAAsync("; + oss << "dstArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.dstArray); + oss << ", dstOffset="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.dstOffset); + oss << ", srcHost="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.srcHost); + oss << ", ByteCount="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.ByteCount); + oss << ", stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoAAsync.stream); + oss << ")"; + break; 
case HIP_API_ID_hipMemcpyHtoD: oss << "hipMemcpyHtoD("; oss << "dst="; roctracer::hip_support::detail::operator<<(oss, data->args.hipMemcpyHtoD.dst); @@ -9797,6 +10163,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << "flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetDeviceFlags.flags); oss << ")"; break; + case HIP_API_ID_hipSetValidDevices: + oss << "hipSetValidDevices("; + if (data->args.hipSetValidDevices.device_arr == NULL) oss << "device_arr=NULL"; + else { oss << "device_arr="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetValidDevices.device_arr__val); } + oss << ", len="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetValidDevices.len); + oss << ")"; + break; case HIP_API_ID_hipSetupArgument: oss << "hipSetupArgument("; oss << "arg="; roctracer::hip_support::detail::operator<<(oss, data->args.hipSetupArgument.arg); @@ -9836,6 +10209,18 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", mode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCapture.mode); oss << ")"; break; + case HIP_API_ID_hipStreamBeginCaptureToGraph: + oss << "hipStreamBeginCaptureToGraph("; + oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.stream); + oss << ", graph="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.graph); + if (data->args.hipStreamBeginCaptureToGraph.dependencies == NULL) oss << ", dependencies=NULL"; + else { oss << ", dependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.dependencies__val); } + if (data->args.hipStreamBeginCaptureToGraph.dependencyData == NULL) oss << ", dependencyData=NULL"; + else { oss << ", dependencyData="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.dependencyData__val); } + oss << 
", numDependencies="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.numDependencies); + oss << ", mode="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamBeginCaptureToGraph.mode); + oss << ")"; + break; case HIP_API_ID_hipStreamCreate: oss << "hipStreamCreate("; if (data->args.hipStreamCreate.stream == NULL) oss << "stream=NULL"; @@ -9989,6 +10374,22 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetAddress.texRef__val); } oss << ")"; break; + case HIP_API_ID_hipTexRefGetArray: + oss << "hipTexRefGetArray("; + if (data->args.hipTexRefGetArray.pArray == NULL) oss << "pArray=NULL"; + else { oss << "pArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetArray.pArray__val); } + if (data->args.hipTexRefGetArray.texRef == NULL) oss << ", texRef=NULL"; + else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetArray.texRef__val); } + oss << ")"; + break; + case HIP_API_ID_hipTexRefGetBorderColor: + oss << "hipTexRefGetBorderColor("; + if (data->args.hipTexRefGetBorderColor.pBorderColor == NULL) oss << "pBorderColor=NULL"; + else { oss << "pBorderColor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetBorderColor.pBorderColor__val); } + if (data->args.hipTexRefGetBorderColor.texRef == NULL) oss << ", texRef=NULL"; + else { oss << ", texRef="; roctracer::hip_support::detail::operator<<(oss, data->args.hipTexRefGetBorderColor.texRef__val); } + oss << ")"; + break; case HIP_API_ID_hipTexRefGetFlags: oss << "hipTexRefGetFlags("; if (data->args.hipTexRefGetFlags.pFlags == NULL) oss << "pFlags=NULL"; diff --git a/third_party/amd/backend/include/hip/amd_detail/hip_runtime_prof.h b/third_party/amd/backend/include/hip/amd_detail/hip_runtime_prof.h index d201ab517c9b..307e75c21e76 100644 --- 
a/third_party/amd/backend/include/hip/amd_detail/hip_runtime_prof.h +++ b/third_party/amd/backend/include/hip/amd_detail/hip_runtime_prof.h @@ -34,6 +34,7 @@ enum HipVdiOpId { // Types of ROCclr commands enum HipVdiCommandKind { kHipVdiCommandKernel = 0x11F0, + kHipVdiCommandTask = 0x11F1, kHipVdiMemcpyDeviceToHost = 0x11F3, kHipHipVdiMemcpyHostToDevice = 0x11F4, kHipVdiMemcpyDeviceToDevice = 0x11F5, @@ -41,7 +42,7 @@ enum HipVdiCommandKind { kHipVdiMemcpyHostToDeviceRect = 0x1202, kHipVdiMemcpyDeviceToDeviceRect = 0x1203, kHipVdiFillMemory = 0x1207, -}; +}; /** * @brief Initializes activity callback diff --git a/third_party/amd/backend/include/hip/amd_detail/host_defines.h b/third_party/amd/backend/include/hip/amd_detail/host_defines.h index 0fad2b47042b..e7e8364969f7 100644 --- a/third_party/amd/backend/include/hip/amd_detail/host_defines.h +++ b/third_party/amd/backend/include/hip/amd_detail/host_defines.h @@ -127,6 +127,10 @@ template struct is_trivial : public integral_constant { }; + + +template struct conditional { using type = T; }; +template struct conditional { using type = F; }; } typedef __hip_internal::uint8_t __hip_uint8_t; typedef __hip_internal::uint16_t __hip_uint16_t; diff --git a/third_party/amd/backend/include/hip/hip_ext.h b/third_party/amd/backend/include/hip/hip_ext.h index 5d5d9b6fa26b..319f5694d021 100644 --- a/third_party/amd/backend/include/hip/hip_ext.h +++ b/third_party/amd/backend/include/hip/hip_ext.h @@ -64,6 +64,8 @@ THE SOFTWARE. * Currently, timing between startEvent and stopEvent does not include the time it takes to perform * a system scope release/cache flush - only the time it takes to issues writes to cache. * + * @note For this HIP API, the flag 'hipExtAnyOrderLaunch' is not supported on AMD GFX9xx boards. 
+ * */ HIP_PUBLIC_API extern "C" hipError_t hipExtModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, @@ -78,6 +80,7 @@ HIP_PUBLIC_API * @brief This HIP API is deprecated, please use hipExtModuleLaunchKernel() instead. * */ +DEPRECATED("use hipExtModuleLaunchKernel instead") HIP_PUBLIC_API extern "C" hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, @@ -85,8 +88,7 @@ extern "C" hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalW uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, hipEvent_t startEvent __dparm(NULL), - hipEvent_t stopEvent __dparm(NULL)) - __attribute__((deprecated("use hipExtModuleLaunchKernel instead"))); + hipEvent_t stopEvent __dparm(NULL)); #if defined(__cplusplus) diff --git a/third_party/amd/backend/include/hip/hip_fp8.h b/third_party/amd/backend/include/hip/hip_fp8.h new file mode 100644 index 000000000000..82f47afcba08 --- /dev/null +++ b/third_party/amd/backend/include/hip/hip_fp8.h @@ -0,0 +1,33 @@ +/* +Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HIP_FP8_H +#define HIP_INCLUDE_HIP_HIP_FP8_H + +#include + +#if defined(__HIP_PLATFORM_AMD__) && !defined(__HIP_PLATFORM_NVIDIA__) +// We only have fnuz defs for now, which are not supported by other platforms +#include +#endif + +#endif // HIP_INCLUDE_HIP_HIP_FP8_H diff --git a/third_party/amd/backend/include/hip/hip_runtime_api.h b/third_party/amd/backend/include/hip/hip_runtime_api.h index 498173bbb158..0323d77d5117 100644 --- a/third_party/amd/backend/include/hip/hip_runtime_api.h +++ b/third_party/amd/backend/include/hip/hip_runtime_api.h @@ -102,7 +102,7 @@ typedef struct hipDeviceProp_t { char luid[8]; ///< 8-byte unique identifier. Only valid on windows unsigned int luidDeviceNodeMask; ///< LUID node mask size_t totalGlobalMem; ///< Size of global memory region (in bytes). - size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory per block (in bytes). int regsPerBlock; ///< Registers per block. int warpSize; ///< Warp size. size_t memPitch; ///< Maximum pitch in bytes allowed by memory copies @@ -111,7 +111,8 @@ typedef struct hipDeviceProp_t { int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. int maxGridSize[3]; ///< Max grid dimensions (XYZ). int clockRate; ///< Max clock frequency of the multiProcessors in khz. - size_t totalConstMem; ///< Size of shared memory region (in bytes). + size_t totalConstMem; ///< Size of shared constant memory region on the device + ///< (in bytes). int major; ///< Major compute capability. On HCC, this is an approximation and features may ///< differ from CUDA CC. 
See the arch feature flags for portable ways to query ///< feature caps. @@ -538,6 +539,12 @@ typedef enum hipDeviceAttribute_t { // Extended attributes for vendors } hipDeviceAttribute_t; +typedef enum hipDriverProcAddressQueryResult { + HIP_GET_PROC_ADDRESS_SUCCESS = 0, + HIP_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND = 1, + HIP_GET_PROC_ADDRESS_VERSION_NOT_SUFFICIENT = 2 +} hipDriverProcAddressQueryResult; + enum hipComputeMode { hipComputeModeDefault = 0, hipComputeModeExclusive = 1, @@ -740,6 +747,9 @@ enum hipLimit_t { /** Memory allocated will be uncached. */ #define hipDeviceMallocUncached 0x3 +/** Memory allocated will be contiguous. */ +#define hipDeviceMallocContiguous 0x4 + //Flags that can be used with hipHostRegister. /** Memory is Mapped and Portable.*/ #define hipHostRegisterDefault 0x0 @@ -798,6 +808,8 @@ enum hipLimit_t { /** Implicit stream per application thread.*/ #define hipStreamPerThread ((hipStream_t)2) +#define hipStreamLegacy ((hipStream_t)1) + // Indicates that the external memory object is a dedicated resource #define hipExternalMemoryDedicated 0x1 /** @@ -973,7 +985,8 @@ typedef struct hipMemPoolProps { * Windows-specific LPSECURITYATTRIBUTES required when @p hipMemHandleTypeWin32 is specified */ void* win32SecurityAttributes; - unsigned char reserved[64]; ///< Reserved for future use, must be 0 + size_t maxSize; ///< Maximum pool size. 
When set to 0, defaults to a system dependent value + unsigned char reserved[56]; ///< Reserved for future use, must be 0 } hipMemPoolProps; /** * Opaque data structure for exporting a pool allocation @@ -1269,13 +1282,7 @@ typedef struct hipMemAllocNodeParams { void* dptr; ///< Returned device address of the allocation } hipMemAllocNodeParams; -/** - * Kernel node attributeID - */ -typedef enum hipKernelNodeAttrID { - hipKernelNodeAttributeAccessPolicyWindow = 1, - hipKernelNodeAttributeCooperative = 2, -} hipKernelNodeAttrID; + typedef enum hipAccessProperty { hipAccessPropertyNormal = 0, hipAccessPropertyStreaming = 1, @@ -1288,10 +1295,39 @@ typedef struct hipAccessPolicyWindow { hipAccessProperty missProp; size_t num_bytes; } hipAccessPolicyWindow; -typedef union hipKernelNodeAttrValue { - hipAccessPolicyWindow accessPolicyWindow; - int cooperative; -} hipKernelNodeAttrValue; + +/** + * Launch Attribute ID + */ +typedef enum hipLaunchAttributeID { + hipLaunchAttributeAccessPolicyWindow = 1, /**< Valid for Streams, graph nodes, launches*/ + hipLaunchAttributeCooperative = 2, /**< Valid for graph nodes, launches */ + hipLaunchAttributePriority = 8, /**< Valid for graph node, streams, launches */ +} hipLaunchAttributeID; + +/** + * Launch Attribute Value + */ +typedef union hipLaunchAttributeValue { + hipAccessPolicyWindow accessPolicyWindow; /**< Value of launch attribute:: + hipLaunchAttributeAccessPolicyWindow. */ + int cooperative; /**< Value of launch attribute ::hipLaunchAttributeCooperative */ + int priority; /**< Value of launch attribute :: hipLaunchAttributePriority. Execution + priority of kernel. 
*/ +} hipLaunchAttributeValue; + +/** + * Kernel node attributeID + */ +#define hipKernelNodeAttrID hipLaunchAttributeID +#define hipKernelNodeAttributeAccessPolicyWindow hipLaunchAttributeAccessPolicyWindow +#define hipKernelNodeAttributeCooperative hipLaunchAttributeCooperative +#define hipKernelNodeAttributePriority hipLaunchAttributePriority + +/** + * Kernel node attribute value + */ +#define hipKernelNodeAttrValue hipLaunchAttributeValue /** * Memset node params @@ -1383,6 +1419,34 @@ enum hipGraphDebugDotFlags { hipGraphDebugDotFlagsHandles = 1 << 10 /**< Adds node handles and every kernel function handle to output */ }; + +/** +* hipGraphInstantiateWithParams results +*/ +typedef enum hipGraphInstantiateResult { + hipGraphInstantiateSuccess = 0, /**< Instantiation Success */ + hipGraphInstantiateError = 1, /**< Instantiation failed for an + unexpected reason which is described in the return value of the function */ + hipGraphInstantiateInvalidStructure = 2, /**< Instantiation failed due + to invalid structure, such as cycles */ + hipGraphInstantiateNodeOperationNotSupported = 3, /**< Instantiation for device launch failed + because the graph contained an unsupported operation */ + hipGraphInstantiateMultipleDevicesNotSupported = 4, /**< Instantiation for device launch failed + due to the nodes belonging to different contexts */ +}hipGraphInstantiateResult; + +/** + * Graph Instantiation parameters +*/ +typedef struct hipGraphInstantiateParams { + hipGraphNode_t errNode_out; /**< The node which caused instantiation to fail, if any*/ + unsigned long long flags; /**< Instantiation flags */ + hipGraphInstantiateResult result_out; /**< Whether instantiation was successful. 
+ If it failed, the reason why */ + hipStream_t uploadStream; /**< Upload stream */ +} hipGraphInstantiateParams; + + /** * Memory allocation properties */ @@ -1557,6 +1621,44 @@ typedef struct hipGraphNodeParams { long long reserved2; } hipGraphNodeParams; + +/** + * This port activates when the kernel has finished executing. + */ +#define hipGraphKernelNodePortDefault 0 + +/** + * This port activates when all blocks of the kernel have begun execution. + */ +#define hipGraphKernelNodePortLaunchCompletion 2 + +/** + * This port activates when all blocks of the kernel have performed + * hipTriggerProgrammaticLaunchCompletion() or have terminated. + * It must be used with edge type hipGraphDependencyTypeProgrammatic. + */ +#define hipGraphKernelNodePortProgrammatic 1 + +typedef enum hipGraphDependencyType { + hipGraphDependencyTypeDefault = 0, + hipGraphDependencyTypeProgrammatic = 1 +}hipGraphDependencyType; + +typedef struct hipGraphEdgeData { + unsigned char + from_port; ///< This indicates when the dependency is triggered from the upstream node on the + ///< edge. The meaning is specific to the node type. A value of 0 in all cases + ///< means full completion of the upstream node, with memory visibility to the + ///< downstream node or portion thereof (indicated by to_port). Only kernel nodes + ///< define non-zero ports. A kernel node can use the following output port types: + ///< hipGraphKernelNodePortDefault, hipGraphKernelNodePortProgrammatic, or + ///< hipGraphKernelNodePortLaunchCompletion. + unsigned char reserved[5]; ///< These bytes are unused and must be zeroed + unsigned char + to_port; ///< Currently no node types define non-zero ports. This field must be set to zero. + unsigned char type; ///< This should be populated with a value from hipGraphDependencyType +} hipGraphEdgeData; + // Doxygen end group GlobalDefs /** * @} @@ -1585,6 +1687,7 @@ typedef struct hipGraphNodeParams { */ // TODO-ctx - more description on error codes. 
hipError_t hipInit(unsigned int flags); + /** * @brief Returns the approximate HIP driver version. * @@ -1755,6 +1858,18 @@ hipError_t hipDeviceReset(void); * @see #hipGetDevice, #hipGetDeviceCount */ hipError_t hipSetDevice(int deviceId); +/** + * @brief Set a list of devices that can be used. + * + * @param[in] device_arr List of devices to try + * @param[in] len Number of devices in specified list + * + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue + * + * @see #hipGetDevice, #hipGetDeviceCount. #hipSetDevice. #hipGetDeviceProperties. #hipSetDeviceFlags. #hipChooseDevice + * + * */ +hipError_t hipSetValidDevices(int* device_arr, int len); /** * @brief Return the default device id for the calling host thread. * @@ -2100,7 +2215,7 @@ hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event); /** * @brief Opens an interprocess event handles. * - * Opens an interprocess event handle exported from another process with cudaIpcGetEventHandle. The returned + * Opens an interprocess event handle exported from another process with hipIpcGetEventHandle. The returned * hipEvent_t behaves like a locally created event with the hipEventDisableTiming flag specified. This event * need be freed with hipEventDestroy. Operations on the imported event after the exported event has been freed * with hipEventDestroy will result in undefined behavior. If the function is called within the same process where @@ -2276,7 +2391,7 @@ hipError_t hipDrvGetErrorString(hipError_t hipError, const char** errorString); * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory - * used by the stream, applicaiton must call hipStreamDestroy. + * used by the stream, application must call hipStreamDestroy. 
* * @return #hipSuccess, #hipErrorInvalidValue * @@ -2293,7 +2408,7 @@ hipError_t hipStreamCreate(hipStream_t* stream); * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to * reference the newly created stream in subsequent hipStream* commands. The stream is allocated on * the heap and will remain allocated even if the handle goes out-of-scope. To release the memory - * used by the stream, applicaiton must call hipStreamDestroy. Flags controls behavior of the + * used by the stream, application must call hipStreamDestroy. Flags controls behavior of the * stream. See #hipStreamDefault, #hipStreamNonBlocking. * * @@ -2311,7 +2426,7 @@ hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags); * Create a new asynchronous stream with the specified priority. @p stream returns an opaque handle * that can be used to reference the newly created stream in subsequent hipStream* commands. The * stream is allocated on the heap and will remain allocated even if the handle goes out-of-scope. - * To release the memory used by the stream, applicaiton must call hipStreamDestroy. Flags controls + * To release the memory used by the stream, application must call hipStreamDestroy. Flags controls * behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking. * * @@ -2329,7 +2444,7 @@ hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, * and greatest stream priority respectively. Stream priorities follow a convention where lower numbers * imply greater priorities. The range of meaningful stream priorities is given by * [*greatestPriority, *leastPriority]. If the user attempts to create a stream with a priority value - * that is outside the the meaningful range as specified by this API, the priority is automatically + * that is outside the meaningful range as specified by this API, the priority is automatically * clamped to within the valid range. 
*/ hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority); @@ -2401,8 +2516,8 @@ hipError_t hipStreamSynchronize(hipStream_t stream); * All future work submitted to @p stream will wait until @p event reports completion before * beginning execution. * - * This function only waits for commands in the current stream to complete. Notably,, this function - * does not impliciy wait for commands in the default stream to complete, even if the specified + * This function only waits for commands in the current stream to complete. Notably, this function + * does not implicitly wait for commands in the default stream to complete, even if the specified * stream is created with hipStreamNonBlocking = 0. * * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamCreateWithPriority, hipStreamSynchronize, hipStreamDestroy @@ -2688,7 +2803,7 @@ hipError_t hipEventCreate(hipEvent_t* event); * * If hipEventRecord() has been previously called on this event, then this call will overwrite any * existing state in event. - * + * * If this function is called on an event that is currently being recorded, results are undefined * - either outstanding recording may save state into the event, and the order is not guaranteed. * @@ -2730,7 +2845,6 @@ hipError_t hipEventDestroy(hipEvent_t event); * If hipEventRecord() has not been called on @p event, this function returns #hipSuccess when no * event is captured. * - * This function needs to support hipEventBlockingSync parameter. * * @param[in] event Event on which to wait. * @@ -3252,7 +3366,7 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream, * * Inserts a memory allocation operation into @p stream. * A pointer to the allocated memory is returned immediately in *dptr. - * The allocation must not be accessed until the the allocation operation completes. + * The allocation must not be accessed until the allocation operation completes. 
* The allocation comes from the memory pool associated with the stream's device. * * @note The default memory pool of a device contains device memory from that device. @@ -3504,7 +3618,7 @@ hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool); * * Inserts an allocation operation into @p stream. * A pointer to the allocated memory is returned immediately in @p dev_ptr. - * The allocation must not be accessed until the the allocation operation completes. + * The allocation must not be accessed until the allocation operation completes. * The allocation comes from the specified memory pool. * * @note The specified memory pool may be from a device different than that of the specified @p stream. @@ -3915,6 +4029,68 @@ hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t sizeBytes); * hipMemHostAlloc, hipMemHostGetDevicePointer */ hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes); +/** + * @brief Copies from one 1D array to device memory. + * + * @param[out] dstDevice Destination device pointer + * @param[in] srcArray Source array + * @param[in] srcOffset Offset in bytes of source array + * @param[in] ByteCount Size of memory copy in bytes + * + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * #hipErrorInvalidValue + * + * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, + * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA, + * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD, + * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync, + * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, + * hipMemHostAlloc, hipMemHostGetDevicePointer + */ +hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray, size_t srcOffset, + size_t ByteCount); +/** + * @brief Copies from device memory to a 1D 
array. + * + * @param[out] dstArray Destination array + * @param[in] dstOffset Offset in bytes of destination array + * @param[in] srcDevice Source device pointer + * @param[in] ByteCount Size of memory copy in bytes + * + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * #hipErrorInvalidValue + * + * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, + * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA, + * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD, + * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync, + * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, + * hipMemHostAlloc, hipMemHostGetDevicePointer + */ +hipError_t hipMemcpyDtoA(hipArray_t dstArray, size_t dstOffset, hipDeviceptr_t srcDevice, + size_t ByteCount); + +/** + * @brief Copies from one 1D array to another. 
+ * + * @param[out] dstArray Destination array + * @param[in] dstOffset Offset in bytes of destination array + * @param[in] srcArray Source array + * @param[in] srcOffset Offset in bytes of source array + * @param[in] ByteCount Size of memory copy in bytes + * + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * #hipErrorInvalidValue + * + * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, + * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA, + * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD, + * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync, + * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, + * hipMemHostAlloc, hipMemHostGetDevicePointer + */ +hipError_t hipMemcpyAtoA(hipArray_t dstArray, size_t dstOffset, hipArray_t srcArray, + size_t srcOffset, size_t ByteCount); /** * @brief Copy data from Host to Device asynchronously * @@ -3973,7 +4149,48 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h */ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream); - +/** + * @brief Copies from one 1D array to host memory. 
+ * + * @param[out] dstHost Destination pointer + * @param[in] srcArray Source array + * @param[in] srcOffset Offset in bytes of source array + * @param[in] ByteCount Size of memory copy in bytes + * @param[in] stream Stream identifier + * + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * #hipErrorInvalidValue + * + * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, + * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA, + * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD, + * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync, + * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, + * hipMemHostAlloc, hipMemHostGetDevicePointer + */ +hipError_t hipMemcpyAtoHAsync(void* dstHost, hipArray_t srcArray, size_t srcOffset, + size_t ByteCount, hipStream_t stream); +/** + * @brief Copies from host memory to a 1D array. 
+ * + * @param[out] dstArray Destination array + * @param[in] dstOffset Offset in bytes of destination array + * @param[in] srcHost Source host pointer + * @param[in] ByteCount Size of memory copy in bytes + * @param[in] stream Stream identifier + * + * @return #hipSuccess, #hipErrorDeinitialized, #hipErrorNotInitialized, #hipErrorInvalidContext, + * #hipErrorInvalidValue + * + * @see hipArrayCreate, hipArrayDestroy, hipArrayGetDescriptor, hipMemAlloc, hipMemAllocHost, + * hipMemAllocPitch, hipMemcpy2D, hipMemcpy2DAsync, hipMemcpy2DUnaligned, hipMemcpyAtoA, + * hipMemcpyAtoD, hipMemcpyAtoH, hipMemcpyAtoHAsync, hipMemcpyDtoA, hipMemcpyDtoD, + * hipMemcpyDtoDAsync, hipMemcpyDtoH, hipMemcpyDtoHAsync, hipMemcpyHtoA, hipMemcpyHtoAAsync, + * hipMemcpyHtoDAsync, hipMemFree, hipMemFreeHost, hipMemGetAddressRange, hipMemGetInfo, + * hipMemHostAlloc, hipMemHostGetDevicePointer + */ +hipError_t hipMemcpyHtoAAsync(hipArray_t dstArray, size_t dstOffset, const void* srcHost, + size_t ByteCount, hipStream_t stream); /** * @brief Returns a global pointer from a module. * Returns in *dptr and *bytes the pointer and size of the global of name name located in module hmod. @@ -4002,6 +4219,8 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, */ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol); + + /** * @brief Gets the size of the given symbol on the device. * @@ -4013,14 +4232,38 @@ hipError_t hipGetSymbolAddress(void** devPtr, const void* symbol); */ hipError_t hipGetSymbolSize(size_t* size, const void* symbol); +/** + * @brief Gets the pointer of requested HIP driver function. + * + * @param[in] symbol The Symbol name of the driver function to request. + * @param[out] pfn Output pointer to the requested driver function. + * @param[in] hipVersion The HIP version for the requested driver function symbol. + * HIP version is defined as 100*version_major + version_minor. 
For example, in HIP 6.1, the + * hipVersion is 601, for the symbol function "hipGetDeviceProperties", the specified hipVersion 601 + * is greater or equal to the version 600, the symbol function will be handled properly as backward + * compatible function. + * + * @param[in] flags Currently only default flag is supported. + * @param[out] symbolStatus Optional enumeration for returned status of searching for symbol driver + * function based on the input hipVersion. + * + * Returns hipSuccess if the returned pfn is addressed to the pointer of found driver function. + * + * @return #hipSuccess, #hipErrorInvalidValue. + */ +hipError_t hipGetProcAddress(const char* symbol, void** pfn, int hipVersion, uint64_t flags, + hipDriverProcAddressQueryResult* symbolStatus); + /** * @brief Copies data to the given symbol on the device. * Symbol HIP APIs allow a kernel to define a device-side data symbol which can be accessed on * the host side. The symbol can be in __constant or device space. * Note that the symbol name needs to be encased in the HIP_SYMBOL macro. * This also applies to hipMemcpyFromSymbol, hipGetSymbolAddress, and hipGetSymbolSize. - * For detail usage, see the example at - * https://github.com/ROCm/HIP/blob/develop/docs/user_guide/hip_porting_guide.md + * For detailed usage, see the + * memcpyToSymbol example + * in the HIP Porting Guide. + * * * @param[out] symbol pointer to the device symbole * @param[in] src pointer to the source address @@ -4520,6 +4763,27 @@ hipError_t hipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset, co hipError_t hipMemcpy2DToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream __dparm(0)); +/** + * @brief Copies data between host and device. 
+ * + * @param[in] dst Destination memory address + * @param[in] wOffsetDst Destination starting X offset + * @param[in] hOffsetDst Destination starting Y offset + * @param[in] src Source memory address + * @param[in] wOffsetSrc Source starting X offset + * @param[in] hOffsetSrc Source starting Y offset (rows) + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * + * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, + * hipMemcpyAsync + */ +hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst, size_t hOffsetDst, + hipArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, + size_t width, size_t height, hipMemcpyKind kind); /** * @brief Copies data between host and device. * @@ -4734,7 +4998,7 @@ hipError_t hipDeviceDisablePeerAccess(int peerDeviceId); * @param [out] psize - Size of allocation * @param [in] dptr- Device Pointer * - * @returns #hipSuccess, #hipErrorInvalidDevicePointer + * @returns #hipSuccess, #hipErrorNotFound * * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, * hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice @@ -5225,6 +5489,16 @@ hipError_t hipFuncGetAttributes(struct hipFuncAttributes* attr, const void* func * @returns #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidDeviceFunction */ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunction_t hfunc); +/** + * @brief Gets pointer to device entry function that matches entry function symbolPtr. 
+ * + * @param [out] functionPtr Device entry function + * @param [in] symbolPtr Pointer to device entry function to search for + * + * @returns #hipSuccess, #hipErrorInvalidDeviceFunction + * + */ +hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr); /** * @brief returns the handle of the texture reference with the name from the module. * @@ -5646,12 +5920,26 @@ hipError_t hipLaunchKernel(const void* function_address, /** * @brief Enqueues a host function call in a stream. * - * @param [in] stream - stream to enqueue work to. - * @param [in] fn - function to call once operations enqueued preceeding are complete. + * @param [in] stream - The stream to enqueue work in. + * @param [in] fn - The function to call once enqueued preceding operations are complete. * @param [in] userData - User-specified data to be passed to the function. + * * @returns #hipSuccess, #hipErrorInvalidResourceHandle, #hipErrorInvalidValue, * #hipErrorNotSupported - * @warning : This API is marked as beta, meaning, while this is feature complete, + * + * The host function to call in this API will be executed after the preceding operations in + * the stream are complete. The function is a blocking operation that blocks operations in the + * stream that follow it, until the function is returned. + * Event synchronization and internal callback functions make sure enqueued operations will + * execute in order, in the stream. + * + * The host function must not make any HIP API calls. The host function is non-reentrant. It must + * not perform synchronization with any operation that may depend on other processing execution + * but is not enqueued to run earlier in the stream. + * + * Host functions that are enqueued respectively in different non-blocking streams can run concurrently. + * + * @warning This API is marked as beta, meaning, while this is feature complete, * it is still open to changes and may have outstanding issues. 
*/ hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData); @@ -6181,7 +6469,7 @@ hipError_t hipGetTextureAlignmentOffset( DEPRECATED(DEPRECATED_MSG) hipError_t hipUnbindTexture(const textureReference* tex); /** - * @brief Gets the the address for a texture reference. + * @brief Gets the address for a texture reference. * * @param [out] dev_ptr Pointer of device address. * @param [in] texRef Pointer of texture reference. @@ -6564,6 +6852,30 @@ int hipGetStreamDeviceId(hipStream_t stream); */ hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode); +/** +* @brief Begins graph capture on a stream to an existing graph. +* +* @param [in] stream - Stream to initiate capture. +* @param [in] graph - Graph to capture into. +* @param [in] dependencies - Dependencies of the first node captured in the stream. Can be NULL if +* numDependencies is 0. +* @param [in] dependencyData - Optional array of data associated with each dependency. +* @param [in] numDependencies - Number of dependencies. +* @param [in] mode - Controls the interaction of this capture sequence with other API calls that +are not safe. +* +* @returns #hipSuccess, #hipErrorInvalidValue +* +* @warning : param "const hipGraphEdgeData* dependencyData" is currently not supported and has to be +passed as nullptr. This API is marked as beta, meaning, while this is feature complete, it is still +open to changes and may have outstanding issues. +* +*/ +hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph, + const hipGraphNode_t* dependencies, + const hipGraphEdgeData* dependencyData, + size_t numDependencies, hipStreamCaptureMode mode); + /** * @brief Ends capture on a stream, returning the captured graph. 
* @@ -6902,6 +7214,19 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph, hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph, unsigned long long flags); +/** + * @brief Creates an executable graph from a graph. + * + * @param [out] pGraphExec - pointer to instantiated executable graph that is created. + * @param [in] graph - instance of graph to instantiate. + * @param [in] instantiateParams - Graph Instantiate Params + * @returns #hipSuccess, #hipErrorInvalidValue + * + * @warning : This API is marked as beta, meaning, while this is feature complete, + * it is still open to changes and may have outstanding issues. + */ +hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph, + hipGraphInstantiateParams *instantiateParams); /** * @brief launches an executable graph in a stream * @@ -6926,6 +7251,22 @@ hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream); */ hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream); +/** + * @brief Creates a kernel execution node and adds it to a graph. + * + * @param [out] pGraphNode - pointer to graph node to create. + * @param [in] graph - instance of graph to add the created node. + * @param [in] pDependencies - pointer to the dependencies on the kernel execution node. + * @param [in] numDependencies - the number of the dependencies. + * @param [in] nodeParams - pointer to the parameters for the node. + * @returns #hipSuccess, #hipErrorInvalidValue. + * @warning : This API is marked as beta, meaning, while this is feature complete, + * it is still open to changes and may have outstanding issues. 
+ */ +hipError_t hipGraphAddNode(hipGraphNode_t *pGraphNode, hipGraph_t graph, + const hipGraphNode_t *pDependencies, size_t numDependencies, + hipGraphNodeParams *nodeParams); + /** * @brief Destroys an executable graph * @@ -8906,6 +9247,7 @@ static inline hipError_t hipMallocManaged(T** devPtr, size_t size, return hipMallocManaged((void**)devPtr, size, flags); } + #endif #endif // doxygen end HIP API diff --git a/third_party/amd/backend/include/hip/hip_version.h b/third_party/amd/backend/include/hip/hip_version.h index 0c64f38b1f01..bab5288f806f 100644 --- a/third_party/amd/backend/include/hip/hip_version.h +++ b/third_party/amd/backend/include/hip/hip_version.h @@ -4,9 +4,9 @@ #define HIP_VERSION_H #define HIP_VERSION_MAJOR 6 -#define HIP_VERSION_MINOR 1 -#define HIP_VERSION_PATCH 40091 -#define HIP_VERSION_GITHASH "a8dbc0c19" +#define HIP_VERSION_MINOR 2 +#define HIP_VERSION_PATCH 41134 +#define HIP_VERSION_GITHASH "65d174c3e" #define HIP_VERSION_BUILD_ID 0 #define HIP_VERSION_BUILD_NAME "" #define HIP_VERSION (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH) diff --git a/third_party/amd/backend/include/hip/hiprtc.h b/third_party/amd/backend/include/hip/hiprtc.h index 88e9094d848c..e10acbfe09c0 100644 --- a/third_party/amd/backend/include/hip/hiprtc.h +++ b/third_party/amd/backend/include/hip/hiprtc.h @@ -67,32 +67,32 @@ typedef enum hiprtcResult { */ typedef enum hiprtcJIT_option { - HIPRTC_JIT_MAX_REGISTERS = 0, ///< Maximum registers may be used in a thread, passed to compiler - HIPRTC_JIT_THREADS_PER_BLOCK, ///< Number of thread per block - HIPRTC_JIT_WALL_TIME, ///< Value for total wall clock time - HIPRTC_JIT_INFO_LOG_BUFFER, ///< Pointer to the buffer with logged information - HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES, ///< Size of the buffer in bytes for logged info - HIPRTC_JIT_ERROR_LOG_BUFFER, ///< Pointer to the buffer with logged error(s) - HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, ///< Size of the buffer in bytes for 
logged error(s) - HIPRTC_JIT_OPTIMIZATION_LEVEL, ///< Value of optimization level for generated codes - HIPRTC_JIT_TARGET_FROM_HIPCONTEXT, ///< The target context, which is the default - HIPRTC_JIT_TARGET, ///< JIT target - HIPRTC_JIT_FALLBACK_STRATEGY, ///< Fallback strategy - HIPRTC_JIT_GENERATE_DEBUG_INFO, ///< Generate debug information - HIPRTC_JIT_LOG_VERBOSE, ///< Generate log verbose - HIPRTC_JIT_GENERATE_LINE_INFO, ///< Generate line number information - HIPRTC_JIT_CACHE_MODE, ///< Set cache mode - HIPRTC_JIT_NEW_SM3X_OPT, ///< @deprecated New SM3X option. - HIPRTC_JIT_FAST_COMPILE, ///< Set fast compile - HIPRTC_JIT_GLOBAL_SYMBOL_NAMES, ///< Array of device symbol names to be relocated to the host - HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS, ///< Array of host addresses to be relocated to the device - HIPRTC_JIT_GLOBAL_SYMBOL_COUNT, ///< Number of symbol count. - HIPRTC_JIT_LTO, ///< @deprecated Enable link-time optimization for device code - HIPRTC_JIT_FTZ, ///< @deprecated Set single-precision denormals. 
- HIPRTC_JIT_PREC_DIV, ///< @deprecated Set single-precision floating-point division and + HIPRTC_JIT_MAX_REGISTERS = 0, ///< CUDA Only Maximum registers may be used in a thread, passed to compiler + HIPRTC_JIT_THREADS_PER_BLOCK, ///< CUDA Only Number of thread per block + HIPRTC_JIT_WALL_TIME, ///< CUDA Only Value for total wall clock time + HIPRTC_JIT_INFO_LOG_BUFFER, ///< CUDA Only Pointer to the buffer with logged information + HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES, ///< CUDA Only Size of the buffer in bytes for logged info + HIPRTC_JIT_ERROR_LOG_BUFFER, ///< CUDA Only Pointer to the buffer with logged error(s) + HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, ///< CUDA Only Size of the buffer in bytes for logged error(s) + HIPRTC_JIT_OPTIMIZATION_LEVEL, ///< Value of optimization level for generated codes, acceptable options -O0, -O1, -O2, -O3 + HIPRTC_JIT_TARGET_FROM_HIPCONTEXT, ///< CUDA Only The target context, which is the default + HIPRTC_JIT_TARGET, ///< CUDA Only JIT target + HIPRTC_JIT_FALLBACK_STRATEGY, ///< CUDA Only Fallback strategy + HIPRTC_JIT_GENERATE_DEBUG_INFO, ///< CUDA Only Generate debug information + HIPRTC_JIT_LOG_VERBOSE, ///< CUDA Only Generate log verbose + HIPRTC_JIT_GENERATE_LINE_INFO, ///< CUDA Only Generate line number information + HIPRTC_JIT_CACHE_MODE, ///< CUDA Only Set cache mode + HIPRTC_JIT_NEW_SM3X_OPT, ///< @deprecated CUDA Only New SM3X option. + HIPRTC_JIT_FAST_COMPILE, ///< CUDA Only Set fast compile + HIPRTC_JIT_GLOBAL_SYMBOL_NAMES, ///< CUDA Only Array of device symbol names to be relocated to the host + HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS, ///< CUDA Only Array of host addresses to be relocated to the device + HIPRTC_JIT_GLOBAL_SYMBOL_COUNT, ///< CUDA Only Number of symbol count. + HIPRTC_JIT_LTO, ///< @deprecated CUDA Only Enable link-time optimization for device code + HIPRTC_JIT_FTZ, ///< @deprecated CUDA Only Set single-precision denormals. 
+ HIPRTC_JIT_PREC_DIV, ///< @deprecated CUDA Only Set single-precision floating-point division and ///< reciprocals - HIPRTC_JIT_PREC_SQRT, ///< @deprecated Set single-precision floating-point square root - HIPRTC_JIT_FMA, ///< @deprecated Enable floating-point multiplies and adds/subtracts operations + HIPRTC_JIT_PREC_SQRT, ///< @deprecated CUDA Only Set single-precision floating-point square root + HIPRTC_JIT_FMA, ///< @deprecated CUDA Only Enable floating-point multiplies and adds/subtracts operations HIPRTC_JIT_NUM_OPTIONS, ///< Number of options HIPRTC_JIT_IR_TO_ISA_OPT_EXT = 10000, ///< Linker options to be passed on to compiler /// @note Only supported for the AMD platform. diff --git a/third_party/amd/backend/include/hsa/amd_hsa_elf.h b/third_party/amd/backend/include/hsa/amd_hsa_elf.h index 51aa389a0681..74f15d7d7ab6 100644 --- a/third_party/amd/backend/include/hsa/amd_hsa_elf.h +++ b/third_party/amd/backend/include/hsa/amd_hsa_elf.h @@ -75,7 +75,8 @@ enum { ELFABIVERSION_AMDGPU_HSA_V2 = 0, ELFABIVERSION_AMDGPU_HSA_V3 = 1, ELFABIVERSION_AMDGPU_HSA_V4 = 2, - ELFABIVERSION_AMDGPU_HSA_V5 = 3 + ELFABIVERSION_AMDGPU_HSA_V5 = 3, + ELFABIVERSION_AMDGPU_HSA_V6 = 4, }; // AMDGPU specific e_flags. @@ -87,6 +88,7 @@ enum : unsigned { EF_AMDGPU_MACH_NONE = 0x000, // AMDGCN-based processors. 
+ // clang-format off EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020, EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021, EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022, @@ -127,13 +129,25 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045, EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046, EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047, + EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, + EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4F = 0x04f, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X50 = 0x050, + EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC = 0x051, + EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, + EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, + EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055, + // clang-format on // First/last AMDGCN-based processors. EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600, - EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX942, + EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC, // Indicates if the "xnack" target feature is enabled for all code contained // in the object. @@ -159,8 +173,7 @@ enum : unsigned { // XNACK selection mask for EF_AMDGPU_FEATURE_XNACK_* values. // - // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, - // ELFABIVERSION_AMDGPU_HSA_V5. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. EF_AMDGPU_FEATURE_XNACK_V4 = 0x300, // XNACK is not supported. EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4 = 0x000, @@ -173,8 +186,7 @@ enum : unsigned { // SRAMECC selection mask for EF_AMDGPU_FEATURE_SRAMECC_* values. // - // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4, - // ELFABIVERSION_AMDGPU_HSA_V5. + // Only valid for ELFOSABI_AMDGPU_HSA and ELFABIVERSION_AMDGPU_HSA_V4. 
EF_AMDGPU_FEATURE_SRAMECC_V4 = 0xc00, // SRAMECC is not supported. EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4 = 0x000, @@ -184,6 +196,21 @@ enum : unsigned { EF_AMDGPU_FEATURE_SRAMECC_OFF_V4 = 0x800, // SRAMECC is on. EF_AMDGPU_FEATURE_SRAMECC_ON_V4 = 0xc00, + + // Generic target versioning. This is contained in the most significant byte of EFLAGS. + EF_AMDGPU_GENERIC_VERSION = 0xff000000, + EF_AMDGPU_GENERIC_VERSION_OFFSET = 24, + EF_AMDGPU_GENERIC_VERSION_MIN = 1, + EF_AMDGPU_GENERIC_VERSION_MAX = 0xff, +}; + +// ELF Relocation types for AMDGPU. +enum : unsigned { + R_AMDGPU_ABS32_LO = 1, + R_AMDGPU_ABS32_HI = 2, + R_AMDGPU_ABS64 = 3, + R_AMDGPU_ABS32 = 6, + R_AMDGPU_RELATIVE64 = 13, }; } // end namespace ELF @@ -245,14 +272,14 @@ typedef enum { // ELF Symbol Flag Enumeration Values. #define STF_AMDGPU_HSA_CONST AMDGPU_HSA_SYMBOL_FLAG_CONST -// AMD GPU Relocation Type Enumeration Values. -#define R_AMDGPU_NONE 0 -#define R_AMDGPU_32_LOW 1 -#define R_AMDGPU_32_HIGH 2 -#define R_AMDGPU_64 3 -#define R_AMDGPU_INIT_SAMPLER 4 -#define R_AMDGPU_INIT_IMAGE 5 -#define R_AMDGPU_RELATIVE64 13 +// Legacy/V1 AMD GPU Relocation Type Enumeration Values. +#define R_AMDGPU_V1_NONE 0 +#define R_AMDGPU_V1_32_LOW 1 +#define R_AMDGPU_V1_32_HIGH 2 +#define R_AMDGPU_V1_64 3 +#define R_AMDGPU_V1_INIT_SAMPLER 4 +#define R_AMDGPU_V1_INIT_IMAGE 5 +#define R_AMDGPU_V1_RELATIVE64 13 // AMD GPU Note Type Enumeration Values. #define NT_AMD_HSA_CODE_OBJECT_VERSION 1 diff --git a/third_party/amd/backend/include/hsa/hsa.h b/third_party/amd/backend/include/hsa/hsa.h index 9520bd870c9c..1ad714c44c2d 100644 --- a/third_party/amd/backend/include/hsa/hsa.h +++ b/third_party/amd/backend/include/hsa/hsa.h @@ -598,10 +598,14 @@ typedef enum { * AqlProfile extension. */ HSA_EXTENSION_AMD_AQLPROFILE = 0x202, + /** + * PC Sampling extension. + */ + HSA_EXTENSION_AMD_PC_SAMPLING = 0x203, /** * Last AMD extension. 
*/ - HSA_AMD_LAST_EXTENSION = 0x202 + HSA_AMD_LAST_EXTENSION = 0x203 } hsa_extension_t; /** @@ -5656,7 +5660,12 @@ typedef enum { * undefined if the symbol is not an indirect function. The type of this * attribute is uint32_t. */ - HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16 + HSA_CODE_SYMBOL_INFO_INDIRECT_FUNCTION_CALL_CONVENTION = 16, + /** + * Wavefront size used by the kernel. The value of this attribute is either + * 32 or 64. The type of this attribute is uint32_t. + */ + HSA_CODE_SYMBOL_INFO_KERNEL_WAVEFRONT_SIZE = 19 } hsa_code_symbol_info_t; /** diff --git a/third_party/amd/backend/include/hsa/hsa_api_trace.h b/third_party/amd/backend/include/hsa/hsa_api_trace.h index e46c777af13f..2a0f59df3b82 100644 --- a/third_party/amd/backend/include/hsa/hsa_api_trace.h +++ b/third_party/amd/backend/include/hsa/hsa_api_trace.h @@ -44,39 +44,26 @@ #define HSA_RUNTIME_INC_HSA_API_TRACE_H #include "hsa.h" +#include "hsa_api_trace_version.h" #ifdef AMD_INTERNAL_BUILD #include "hsa_ext_image.h" #include "hsa_ext_amd.h" #include "hsa_ext_finalize.h" #include "hsa_amd_tool.h" +#include "hsa_ven_amd_pc_sampling.h" #else #include "inc/hsa_ext_image.h" #include "inc/hsa_ext_amd.h" #include "inc/hsa_ext_finalize.h" #include "inc/hsa_amd_tool.h" +#include "inc/hsa_ven_amd_pc_sampling.h" #endif #include #include #include -// Major Ids of the Api tables exported by Hsa Core Runtime -#define HSA_API_TABLE_MAJOR_VERSION 0x03 -#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 -#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 -#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 - -// Step Ids of the Api tables exported by Hsa Core Runtime -#define HSA_API_TABLE_STEP_VERSION 0x00 -#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x01 -#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 
-#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 -#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 -#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +// Table MAJOR_VERSION and STEP_VERSION defines have moved to hsa_api_trace_version.h // Min function used to copy Api Tables static inline uint32_t Min(const uint32_t a, const uint32_t b) { @@ -191,6 +178,19 @@ struct ImageExtTable { decltype(hsa_ext_image_create_with_layout)* hsa_ext_image_create_with_layout_fn; }; +// Table to export HSA PC Sampling Extension Apis +struct PcSamplingExtTable { + ApiTableVersion version; + decltype(hsa_ven_amd_pcs_iterate_configuration)* hsa_ven_amd_pcs_iterate_configuration_fn; + decltype(hsa_ven_amd_pcs_create)* hsa_ven_amd_pcs_create_fn; + decltype(hsa_ven_amd_pcs_create_from_id)* hsa_ven_amd_pcs_create_from_id_fn; + decltype(hsa_ven_amd_pcs_destroy)* hsa_ven_amd_pcs_destroy_fn; + decltype(hsa_ven_amd_pcs_start)* hsa_ven_amd_pcs_start_fn; + decltype(hsa_ven_amd_pcs_stop)* hsa_ven_amd_pcs_stop_fn; + decltype(hsa_ven_amd_pcs_flush)* hsa_ven_amd_pcs_flush_fn; +}; + + // Table to export AMD Extension Apis struct AmdExtTable { ApiTableVersion version; @@ -263,6 +263,8 @@ struct AmdExtTable { decltype(hsa_amd_vmem_get_alloc_properties_from_handle)* hsa_amd_vmem_get_alloc_properties_from_handle_fn; decltype(hsa_amd_agent_set_async_scratch_limit)* hsa_amd_agent_set_async_scratch_limit_fn; + decltype(hsa_amd_queue_get_info)* hsa_amd_queue_get_info_fn; + decltype(hsa_amd_vmem_address_reserve_align)* hsa_amd_vmem_address_reserve_align_fn; }; // Table to export HSA Core Runtime Apis @@ -464,6 +466,9 @@ struct HsaApiTable { // Table of function pointers for tools to use ToolsApiTable* tools_; + + // Table of function pointers to AMD PC Sampling Extension + PcSamplingExtTable* pc_sampling_ext_; }; // Structure containing instances of different api tables @@ -474,6 +479,7 @@ struct HsaApiTableContainer { FinalizerExtTable finalizer_ext; ImageExtTable image_ext; ToolsApiTable tools; + 
PcSamplingExtTable pc_sampling_ext; // Default initialization of a container instance HsaApiTableContainer() { @@ -505,6 +511,11 @@ struct HsaApiTableContainer { tools.version.minor_id = sizeof(ToolsApiTable); tools.version.step_id = HSA_TOOLS_API_TABLE_STEP_VERSION; root.tools_ = &tools; + + pc_sampling_ext.version.major_id = HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION; + pc_sampling_ext.version.minor_id = sizeof(PcSamplingExtTable); + pc_sampling_ext.version.step_id = HSA_PC_SAMPLING_API_TABLE_STEP_VERSION; + root.pc_sampling_ext_ = &pc_sampling_ext; } }; @@ -562,5 +573,7 @@ static void inline copyTables(const HsaApiTable* src, HsaApiTable* dest) { copyElement(&dest->image_ext_->version, &src->image_ext_->version); if ((offsetof(HsaApiTable, tools_) < dest->version.minor_id)) copyElement(&dest->tools_->version, &src->tools_->version); + if ((offsetof(HsaApiTable, pc_sampling_ext_) < dest->version.minor_id)) + copyElement(&dest->pc_sampling_ext_->version, &src->pc_sampling_ext_->version); } #endif diff --git a/third_party/amd/backend/include/hsa/hsa_api_trace_version.h b/third_party/amd/backend/include/hsa/hsa_api_trace_version.h new file mode 100644 index 000000000000..3393a776207b --- /dev/null +++ b/third_party/amd/backend/include/hsa/hsa_api_trace_version.h @@ -0,0 +1,68 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H +#define HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H + +// CODE IN THIS FILE **MUST** BE C-COMPATIBLE + +// Major Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_MAJOR_VERSION 0x03 +#define HSA_CORE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AMD_EXT_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_FINALIZER_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_IMAGE_API_TABLE_MAJOR_VERSION 0x02 +#define HSA_AQLPROFILE_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_TOOLS_API_TABLE_MAJOR_VERSION 0x01 +#define HSA_PC_SAMPLING_API_TABLE_MAJOR_VERSION 0x01 + +// Step Ids of the Api tables exported by Hsa Core Runtime +#define HSA_API_TABLE_STEP_VERSION 0x01 +#define HSA_CORE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AMD_EXT_API_TABLE_STEP_VERSION 0x03 +#define HSA_FINALIZER_API_TABLE_STEP_VERSION 0x00 +#define HSA_IMAGE_API_TABLE_STEP_VERSION 0x00 +#define HSA_AQLPROFILE_API_TABLE_STEP_VERSION 0x00 +#define HSA_TOOLS_API_TABLE_STEP_VERSION 0x00 +#define HSA_PC_SAMPLING_API_TABLE_STEP_VERSION 0x00 + +#endif // HSA_RUNTIME_INC_HSA_API_TRACE_VERSION_H diff --git a/third_party/amd/backend/include/hsa/hsa_ext_amd.h b/third_party/amd/backend/include/hsa/hsa_ext_amd.h index 187bcd958707..f9f60edeb9d0 100644 --- a/third_party/amd/backend/include/hsa/hsa_ext_amd.h +++ b/third_party/amd/backend/include/hsa/hsa_ext_amd.h @@ -47,16 +47,19 @@ #include "hsa.h" #include "hsa_ext_image.h" +#include "hsa_ven_amd_pc_sampling.h" -/* +/** * - 1.0 - initial version * - 1.1 - dmabuf export * - 1.2 - hsa_amd_memory_async_copy_on_engine * - 1.3 - HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED pool * - 1.4 - Virtual Memory API + * - 1.5 - hsa_amd_agent_info: HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES + * - 1.6 - Virtual Memory API: hsa_amd_vmem_address_reserve_align */ #define HSA_AMD_INTERFACE_VERSION_MAJOR 1 -#define HSA_AMD_INTERFACE_VERSION_MINOR 4 
+#define HSA_AMD_INTERFACE_VERSION_MINOR 6 #ifdef __cplusplus extern "C" { @@ -221,6 +224,11 @@ enum { * Exceeded number of VGPRs available on this agent */ HSA_STATUS_ERROR_OUT_OF_REGISTERS = 45, + + /** + * Resource is busy or temporarily unavailable + */ + HSA_STATUS_ERROR_RESOURCE_BUSY = 46, }; /** @@ -1176,7 +1184,11 @@ typedef enum hsa_amd_memory_pool_flag_s { * connection. Atomic memory operations on these memory buffers are not * guaranteed to be visible at system scope. */ - HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1, + HSA_AMD_MEMORY_POOL_PCIE_FLAG = (1 << 0), + /** + * Allocates physically contiguous memory + */ + HSA_AMD_MEMORY_POOL_CONTIGUOUS_FLAG = (1 << 1), } hsa_amd_memory_pool_flag_t; @@ -2783,7 +2795,7 @@ hsa_status_t hsa_amd_portable_export_dmabuf(const void* ptr, size_t size, int* d */ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); -/* +/** * @brief Allocate a reserved address range * * Reserve a virtual address range. The size must be a multiple of the system page size. @@ -2803,11 +2815,39 @@ hsa_status_t hsa_amd_portable_close_dmabuf(int dmabuf); * * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address * range of this size. + * + * Note that this API will be deprecated in a future release and replaced by + * hsa_amd_vmem_address_reserve_align */ hsa_status_t hsa_amd_vmem_address_reserve(void** va, size_t size, uint64_t address, uint64_t flags); -/* +/** + * @brief Allocate a reserved address range + * + * Reserve a virtual address range. The size must be a multiple of the system page size. + * If it is not possible to allocate the address specified by @p address, then @p va will be + * a different address range. + * Address range should be released by calling hsa_amd_vmem_address_free. + * + * @param[out] va virtual address allocated + * @param[in] size of address range requested + * @param[in] address requested + * @param[in] alignment requested. 0 for default. 
Must be >= page-size and a power of 2 + * @param[in] flags currently unsupported + * + * @retval ::HSA_STATUS_SUCCESS Address range allocated successfully + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Insufficient resources to allocate an address + * range of this size. + */ +hsa_status_t hsa_amd_vmem_address_reserve_align(void** va, size_t size, uint64_t address, + uint64_t alignment, uint64_t flags); + +/** * @brief Free a reserved address range * * Free a previously allocated address range. The size must match the size of a previously @@ -2841,7 +2881,7 @@ typedef enum { MEMORY_TYPE_PINNED, } hsa_amd_memory_type_t; -/* +/** * @brief Create a virtual memory handle * * Create a virtual memory handle within this pool @@ -2870,7 +2910,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, hsa_amd_memory_type_t type, uint64_t flags, hsa_amd_vmem_alloc_handle_t* memory_handle); -/* +/** * @brief Release a virtual memory handle * * @param[in] memory handle that was previously allocated @@ -2881,7 +2921,7 @@ hsa_status_t hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size, */ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle); -/* +/** * @brief Map a virtual memory handle * * Map a virtual memory handle to a reserved address range. 
The virtual address requested must be @@ -2907,7 +2947,7 @@ hsa_status_t hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_hand hsa_status_t hsa_amd_vmem_map(void* va, size_t size, size_t in_offset, hsa_amd_vmem_alloc_handle_t memory_handle, uint64_t flags); -/* +/** * @brief Unmap a virtual memory handle * * Unmap previously mapped virtual address range @@ -2930,7 +2970,7 @@ typedef struct hsa_amd_memory_access_desc_s { hsa_agent_t agent_handle; } hsa_amd_memory_access_desc_t; -/* +/** * @brief Make a memory mapping accessible * * Make previously mapped virtual address accessible to specific agents. @p size must be equal to @@ -2959,7 +2999,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, const hsa_amd_memory_access_desc_t* desc, size_t desc_cnt); -/* +/** * @brief Get current access permissions for memory mapping * * Get access permissions for memory mapping for specific agent. @@ -2980,7 +3020,7 @@ hsa_status_t hsa_amd_vmem_set_access(void* va, size_t size, hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, hsa_agent_t agent_handle); -/* +/** * @brief Get an exportable shareable handle * * Get an exportable shareable handle for a memory_handle. This shareabl handle can then be used to @@ -3003,7 +3043,7 @@ hsa_status_t hsa_amd_vmem_get_access(void* va, hsa_access_permission_t* perms, hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, hsa_amd_vmem_alloc_handle_t handle, uint64_t flags); -/* +/** * @brief Import a shareable handle * * Import a shareable handle for a memory handle. Importing a shareable handle that has been closed @@ -3023,7 +3063,7 @@ hsa_status_t hsa_amd_vmem_export_shareable_handle(int* dmabuf_fd, hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, hsa_amd_vmem_alloc_handle_t* handle); -/* +/** * @brief Returns memory handle for mapped memory * * Return a memory handle for previously mapped memory. 
The handle will be the same value of handle @@ -3040,19 +3080,19 @@ hsa_status_t hsa_amd_vmem_import_shareable_handle(int dmabuf_fd, hsa_status_t hsa_amd_vmem_retain_alloc_handle(hsa_amd_vmem_alloc_handle_t* memory_handle, void* addr); -/* -* @brief Returns the current allocation properties of a handle -* -* Returns the allocation properties of an existing handle -* -* @param[in] memory_handle memory handle to be queried -* @param[out] pool memory pool that owns this handle -* @param[out] memory type - -* @retval ::HSA_STATUS_SUCCESS -* -* @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle -*/ +/** + * @brief Returns the current allocation properties of a handle + * + * Returns the allocation properties of an existing handle + * + * @param[in] memory_handle memory handle to be queried + * @param[out] pool memory pool that owns this handle + * @param[out] memory type + + * @retval ::HSA_STATUS_SUCCESS + * + * @retval ::HSA_STATUS_ERROR_INVALID_ALLOCATION Invalid memory_handle + */ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( hsa_amd_vmem_alloc_handle_t memory_handle, hsa_amd_memory_pool_t* pool, hsa_amd_memory_type_t* type); @@ -3084,6 +3124,22 @@ hsa_status_t hsa_amd_vmem_get_alloc_properties_from_handle( */ hsa_status_t HSA_API hsa_amd_agent_set_async_scratch_limit(hsa_agent_t agent, size_t threshold); +typedef enum { + /* + * Returns the agent that owns the underlying HW queue. + * The type of this attribute is hsa_agent_t. + */ + HSA_AMD_QUEUE_INFO_AGENT, + /* + * Returns the doorbell ID of the completion signal of the queue + * The type of this attribute is uint64_t. 
+ */ + HSA_AMD_QUEUE_INFO_DOORBELL_ID, +} hsa_queue_info_attribute_t; + +hsa_status_t hsa_amd_queue_get_info(hsa_queue_t* queue, hsa_queue_info_attribute_t attribute, + void* value); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/third_party/amd/backend/include/hsa/hsa_ven_amd_aqlprofile.h b/third_party/amd/backend/include/hsa/hsa_ven_amd_aqlprofile.h index 32ca6b7320bb..0022c0d8b8b6 100644 --- a/third_party/amd/backend/include/hsa/hsa_ven_amd_aqlprofile.h +++ b/third_party/amd/backend/include/hsa/hsa_ven_amd_aqlprofile.h @@ -149,61 +149,61 @@ hsa_status_t hsa_ven_amd_aqlprofile_validate_event( // All parameters are generic and if not applicable for a specific // profile configuration then error status will be returned. typedef enum { - /* - * Select the target compute unit (wgp) for profiling. - */ + /** + * Select the target compute unit (wgp) for profiling. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_COMPUTE_UNIT_TARGET = 0, - /* - * VMID Mask - */ + /** + * VMID Mask + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_VM_ID_MASK = 1, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_MASK = 2, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK = 3, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_TOKEN_MASK2 = 4, - /* - * Shader engine mask for selection. - */ + /** + * Shader engine mask for selection. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SE_MASK = 5, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SAMPLE_RATE = 6, - /* - * Legacy. Deprecated. - */ + /** + * Legacy. Deprecated. 
+ */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_K_CONCURRENT = 7, - /* - * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) - */ + /** + * Set SIMD Mask (GFX9) or SIMD ID for collection (Navi) + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_SIMD_SELECTION = 8, - /* - * Set true for occupancy collection only. - */ + /** + * Set true for occupancy collection only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_OCCUPANCY_MODE = 9, - /* - * ATT collection max data size, in MB. Shared among shader engines. - */ + /** + * ATT collection max data size, in MB. Shared among shader engines. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_ATT_BUFFER_SIZE = 10, - /* - * Mask of which compute units to generate perfcounters. GFX9 only. - */ + /** + * Mask of which compute units to generate perfcounters. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_MASK = 240, - /* - * Select collection period for perfcounters. GFX9 only. - */ + /** + * Select collection period for perfcounters. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_CTRL = 241, - /* - * Select perfcounter ID (SQ block) for collection. GFX9 only. - */ + /** + * Select perfcounter ID (SQ block) for collection. GFX9 only. + */ HSA_VEN_AMD_AQLPROFILE_PARAMETER_NAME_PERFCOUNTER_NAME = 242, } hsa_ven_amd_aqlprofile_parameter_name_t; @@ -365,11 +365,11 @@ hsa_status_t hsa_ven_amd_aqlprofile_error_string( /** * @brief Callback for iteration of all possible event coordinate IDs and coordinate names. -*/ + */ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_eventname_callback_t)(int id, const char* name); /** * @brief Iterate over all possible event coordinate IDs and their names. -*/ + */ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eventname_callback_t); /** @@ -380,7 +380,7 @@ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_ids(hsa_ven_amd_aqlprofile_eve * @param coordinate The coordinate, in the range [0,extent-1]. * @param name Coordinate name as in _iterate_event_ids. 
* @param userdata Userdata returned from _iterate_event_coord function. -*/ + */ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( int position, int id, @@ -397,7 +397,7 @@ typedef hsa_status_t(*hsa_ven_amd_aqlprofile_coordinate_callback_t)( * @param[in] sample_id aqlprofile_info_data_t.sample_id returned from _aqlprofile_iterate_data. * @param[in] callback Callback function to return the coordinates. * @param[in] userdata Arbitrary data pointer to be sent back to the user via callback. -*/ + */ hsa_status_t hsa_ven_amd_aqlprofile_iterate_event_coord( hsa_agent_t agent, hsa_ven_amd_aqlprofile_event_t event, diff --git a/third_party/amd/backend/include/hsa/hsa_ven_amd_pc_sampling.h b/third_party/amd/backend/include/hsa/hsa_ven_amd_pc_sampling.h new file mode 100644 index 000000000000..019f0ea5c960 --- /dev/null +++ b/third_party/amd/backend/include/hsa/hsa_ven_amd_pc_sampling.h @@ -0,0 +1,416 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_VEN_AMD_PC_SAMPLING_H +#define HSA_VEN_AMD_PC_SAMPLING_H + +#include "hsa.h" + +#ifdef __cplusplus +extern "C" { +#endif /*__cplusplus*/ + + +/** + * @brief HSA AMD Vendor PC Sampling APIs + * EXPERIMENTAL: All PC Sampling APIs are currently in an experimental phase and the APIs may be + * modified extensively in the future + */ + +/** + * @brief PC Sampling sample data for hosttrap sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t reserved0; + uint64_t reserved1; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_hosttrap_v1_t; + +/** + * @brief PC Sampling sample data for stochastic sampling method + */ +typedef struct { + uint64_t pc; + uint64_t exec_mask; + uint32_t workgroup_id_x; + uint32_t 
workgroup_id_y; + uint32_t workgroup_id_z; + uint32_t wave_in_wg : 6; + uint32_t chiplet : 3; // Currently not used + uint32_t reserved : 23; + uint32_t hw_id; + uint32_t perf_snapshot_data; + uint32_t perf_snapshot_data1; + uint32_t perf_snapshot_data2; + uint64_t timestamp; + uint64_t correlation_id; +} perf_sample_snapshot_v1_t; + +/** + * @brief PC Sampling method kinds + */ +typedef enum { + HSA_VEN_AMD_PCS_METHOD_HOSTTRAP_V1, + HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1 +} hsa_ven_amd_pcs_method_kind_t; + +/** + * @brief PC Sampling interval unit type + */ +typedef enum { + HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_CLOCK_CYCLES, + HSA_VEN_AMD_PCS_INTERVAL_UNITS_INSTRUCTIONS +} hsa_ven_amd_pcs_units_t; + +/** + * @brief HSA callback function to perform the copy onto a destination buffer + * + * If data_size is 0, HSA will stop current copy operation and keep remaining data in internal + * buffers. Remaining contents of HSA internal buffers will be included in next + * hsa_ven_amd_pcs_data_ready_callback_t. HSA internal buffers can also be drained by calling + * hsa_ven_amd_pcs_flush. + * + * @param[in] hsa_callback_data private data to pass back to HSA. Provided in + * hsa_ven_amd_pcs_data_ready_callback_t + * + * @param[in] data_size size of destination buffer in bytes. + * @param[in] destination destination buffer + * @retval TBD: but could be used to indicate that there is no more data to be read. + * Or indicate an error and abort of current copy operations + */ +typedef hsa_status_t (*hsa_ven_amd_pcs_data_copy_callback_t)(void* hsa_callback_data, + size_t data_size, void* destination); + +/** + * @brief HSA callback function to indicate that there is data ready to be copied + * + * When the client receives this callback, the client should call back @p data_copy_callback for HSA + * to perform the copy operation into an available buffer.
@p data_copy_callback can be called back + * multiple times with smaller @p data_size to split the copy operation. + * + * This callback must not call ::hsa_ven_amd_pcs_flush. + * + * @param[in] client_callback_data client private data passed in via + * hsa_ven_amd_pcs_create/hsa_ven_amd_pcs_create_from_id + * @param[in] data_size size of data available to be copied + * @param[in] lost_sample_count number of lost samples since last call to + * hsa_ven_amd_pcs_data_ready_callback_t. + * @param[in] data_copy_callback callback function for HSA to perform the actual copy + * @param[in] hsa_callback_data private data to pass back to HSA + */ +typedef void (*hsa_ven_amd_pcs_data_ready_callback_t)( + void* client_callback_data, size_t data_size, size_t lost_sample_count, + hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback, void* hsa_callback_data); + +/** + * @brief Opaque handle representing a sampling session. + * Two sessions having same handle value represent the same session + */ +typedef struct { + uint64_t handle; +} hsa_ven_amd_pcs_t; + +/** + * @brief PC Sampling configuration flag options + */ +typedef enum { + /* The interval for this sampling method have to be a power of 2 */ + HSA_VEN_AMD_PCS_CONFIGURATION_FLAGS_INTERVAL_POWER_OF_2 = (1 << 0) +} hsa_ven_amd_pcs_configuration_flags_t; + +/** + * @brief PC Sampling method information + * Used to provide client with list of supported PC Sampling methods + */ +typedef struct { + hsa_ven_amd_pcs_method_kind_t method; + hsa_ven_amd_pcs_units_t units; + size_t min_interval; + size_t max_interval; + uint64_t flags; +} hsa_ven_amd_pcs_configuration_t; + +/** + * @brief Callback function to iterate through list of supported PC Sampling configurations + * + * @param[in] configuration one entry for supported PC Sampling method and configuration options + * @param[in] callback_data client private callback data that was passed in when calling + * hsa_ven_amd_pcs_iterate_configuration + */ +typedef hsa_status_t 
(*hsa_ven_amd_pcs_iterate_configuration_callback_t)( + const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data); + +/** + * @brief Iterate through list of currently supported PC Sampling configurations for this @p agent + * + * HSA will call back @p configuration_callback for each currently available PC Sampling + * configuration. The list of currently available configurations may not be the complete list of + * configurations supported on the @p agent. The list of currently available configurations may be + * reduced if the @p agent is currently handling other PC sampling sessions. + * + * @param[in] agent target agent + * @param[in] configuration_callback callback function to iterate through list of configurations + * @param[in] callback_data client private callback data + **/ +hsa_status_t hsa_ven_amd_pcs_iterate_configuration( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + +/** + * @brief Create a PC Sampling session on @p agent + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, as described by the + * hsa_ven_amd_pcs_configuration_t configurations passed to the callbacks of + * hsa_ven_amd_pcs_iterate_configuration for this @p agent. + * A successful call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session.
+ * The session will be in a stopped/inactive state after this call + * + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback. This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. 
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, + size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, hsa_ven_amd_pcs_t* pc_sampling); + + +/** + * @brief Creates a PC Sampling session on @p agent. Assumes that the caller provides the + * @p pcs_id generated by the previous call to the underlying driver that reserved PC sampling + * on the @p agent. + * + * Similar to the @ref hsa_ven_amd_pcs_create with the difference that it inherits an existing + * PC sampling session that was previously created in the underlying driver. + * + * Allocate the resources required for a PC Sampling session. The @p method, @p units, @p interval + * parameters must be a legal configuration value, and match the parameters that we used to create + * the underlying PC Sampling session in the underlying driver. + * A successful call may restrict the list of possible PC sampling methods available to subsequent + * calls to hsa_ven_amd_pcs_iterate_configuration on the same agent as agents have limitations + * on what types of PC sampling they can perform concurrently. + * For all successful calls, hsa_ven_amd_pcs_destroy should be called to free this session. + * The session will be in a stopped/inactive state after this call + * + * @param[in] pcs_id ID that uniquely identifies the PC sampling session within underlying driver + * @param[in] agent target agent + * @param[in] method method to use + * @param[in] units sampling units + * @param[in] interval sampling interval in @p units + * @param[in] latency expected latency in microseconds for client to provide a buffer for the data + * copy callback once HSA calls @p data_ready_callback.
This is a performance hint to avoid the + * buffer filling up before the client is notified that data is ready. HSA-runtime will estimate + * how many samples are received within @p latency and call @p data_ready_callback ahead of time so + * that the client has @p latency time to allocate the buffer before the HSA-runtime internal + * buffers are full. The value of latency can be 0. + * @param[in] buffer_size size of client buffer in bytes. @p data_ready_callback will be called once + * HSA-runtime has enough samples to fill @p buffer_size. This needs to be a multiple of size of + * perf_sample_hosttrap_v1_t or size of perf_sample_snapshot_v1_t. + * @param[in] data_ready_callback client callback function that will be called when: + * 1. There is enough samples fill a buffer with @p buffer_size - estimated samples received + * within @p latency period. + * OR + * 2. When hsa_ven_amd_pcs_flush is called. + * @param[in] client_callback_data client private data to be provided back when data_ready_callback + * is called. + * @param[out] pc_sampling PC sampling session handle used to reference this session when calling + * hsa_ven_amd_pcs_start, hsa_ven_amd_pcs_stop, hsa_ven_amd_pcs_destroy + * + * @retval ::HSA_STATUS_SUCCESS session created successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT invalid parameters + * @retval ::HSA_STATUS_ERROR_RESOURCE_BUSY agent currently handling another PC Sampling session and + * cannot handle the type requested. 
+ * @retval ::HSA_STATUS_ERROR_OUT_OF_RESOURCES Failed to allocate resources + * @retval ::HSA_STATUS_ERROR Unexpected error + **/ +hsa_status_t hsa_ven_amd_pcs_create_from_id( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + +/** + * @brief Free a PC Sampling session on @p agent + * + * Free all the resources allocated for a PC Sampling session on @p agent + * Internal buffers for this session will be lost. + * If the session was active, the session will be stopped before it is destroyed. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session destroyed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_destroy(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Start a PC Sampling session + * + * Activate a PC Sampling session that was previously created. + * The session will be in an active state after this call + * If the session was already active, this will result in a no-op and will return HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session started successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + * @retval ::HSA_STATUS_ERROR unexpected error + */ +hsa_status_t hsa_ven_amd_pcs_start(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Stop a PC Sampling session + * + * Stop a session that is currently active + * After a session is stopped HSA may still have some PC Sampling data in its internal buffers. + * The internal buffers can be drained using hsa_ven_amd_pcs_flush.
If the internal + * buffers are not drained and the session is started again, the internal buffers will be available + * on the next data_ready_callback. + * If the session was already inactive, this will result in a no-op and will return + * HSA_STATUS_SUCCESS + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session stopped successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_stop(hsa_ven_amd_pcs_t pc_sampling); + +/** + * @brief Flush internal buffers for a PC Sampling session + * + * Drain internal buffers for a PC Sampling session. If internal buffers have available data, + * this triggers a data_ready_callback. + * + * The function blocks until all PC samples associated with the @p pc_sampling session + * generated prior to the function call have been communicated by invocations of + * @p data_ready_callback having completed execution. + * + * @param[in] pc_sampling PC sampling session handle + * + * @retval ::HSA_STATUS_SUCCESS Session flushed successfully + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT Invalid PC sampling handle + */ +hsa_status_t hsa_ven_amd_pcs_flush(hsa_ven_amd_pcs_t pc_sampling); + +#define hsa_ven_amd_pc_sampling_1_00 + +/** + * @brief The function pointer table for the PC Sampling v1.00 extension. Can be returned by + * ::hsa_system_get_extension_table or ::hsa_system_get_major_extension_table.
+ */ +typedef struct hsa_ven_amd_pc_sampling_1_00_pfn_t { + hsa_status_t (*hsa_ven_amd_pcs_iterate_configuration)( + hsa_agent_t agent, hsa_ven_amd_pcs_iterate_configuration_callback_t configuration_callback, + void* callback_data); + + hsa_status_t (*hsa_ven_amd_pcs_create)(hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, + size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, + void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_create_from_id)( + uint32_t pcs_id, hsa_agent_t agent, hsa_ven_amd_pcs_method_kind_t method, + hsa_ven_amd_pcs_units_t units, size_t interval, size_t latency, size_t buffer_size, + hsa_ven_amd_pcs_data_ready_callback_t data_ready_callback, void* client_callback_data, + hsa_ven_amd_pcs_t* pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_destroy)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_start)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_stop)(hsa_ven_amd_pcs_t pc_sampling); + + hsa_status_t (*hsa_ven_amd_pcs_flush)(hsa_ven_amd_pcs_t pc_sampling); + +} hsa_ven_amd_pc_sampling_1_00_pfn_t; + +#ifdef __cplusplus +} // end extern "C" block +#endif /*__cplusplus*/ + +#endif /* HSA_VEN_AMD_PC_SAMPLING_H */ diff --git a/third_party/amd/backend/include/roctracer/hip_ostream_ops.h b/third_party/amd/backend/include/roctracer/hip_ostream_ops.h index 13ee9ac2d379..eba2592fa305 100644 --- a/third_party/amd/backend/include/roctracer/hip_ostream_ops.h +++ b/third_party/amd/backend/include/roctracer/hip_ostream_ops.h @@ -2795,6 +2795,11 @@ inline static std::ostream& operator<<(std::ostream& out, const hipMemPoolProps& roctracer::hip_support::detail::operator<<(out, 0); std::operator<<(out, ", "); } + if (std::string("hipMemPoolProps::maxSize").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "maxSize="); + 
roctracer::hip_support::detail::operator<<(out, v.maxSize); + std::operator<<(out, ", "); + } if (std::string("hipMemPoolProps::location").find(HIP_structs_regex) != std::string::npos) { std::operator<<(out, "location="); roctracer::hip_support::detail::operator<<(out, v.location); @@ -3229,17 +3234,22 @@ inline static std::ostream& operator<<(std::ostream& out, const hipAccessPolicyW std::operator<<(out, '}'); return out; } -inline static std::ostream& operator<<(std::ostream& out, const hipKernelNodeAttrValue& v) +inline static std::ostream& operator<<(std::ostream& out, const hipLaunchAttributeValue& v) { std::operator<<(out, '{'); HIP_depth_max_cnt++; if (HIP_depth_max == -1 || HIP_depth_max_cnt <= HIP_depth_max) { - if (std::string("hipKernelNodeAttrValue::cooperative").find(HIP_structs_regex) != std::string::npos) { + if (std::string("hipLaunchAttributeValue::priority").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "priority="); + roctracer::hip_support::detail::operator<<(out, v.priority); + std::operator<<(out, ", "); + } + if (std::string("hipLaunchAttributeValue::cooperative").find(HIP_structs_regex) != std::string::npos) { std::operator<<(out, "cooperative="); roctracer::hip_support::detail::operator<<(out, v.cooperative); std::operator<<(out, ", "); } - if (std::string("hipKernelNodeAttrValue::accessPolicyWindow").find(HIP_structs_regex) != std::string::npos) { + if (std::string("hipLaunchAttributeValue::accessPolicyWindow").find(HIP_structs_regex) != std::string::npos) { std::operator<<(out, "accessPolicyWindow="); roctracer::hip_support::detail::operator<<(out, v.accessPolicyWindow); } @@ -3287,6 +3297,35 @@ inline static std::ostream& operator<<(std::ostream& out, const HIP_MEMSET_NODE_ std::operator<<(out, '}'); return out; } +inline static std::ostream& operator<<(std::ostream& out, const hipGraphInstantiateParams& v) +{ + std::operator<<(out, '{'); + HIP_depth_max_cnt++; + if (HIP_depth_max == -1 || HIP_depth_max_cnt <= 
HIP_depth_max) { + if (std::string("hipGraphInstantiateParams::uploadStream").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "uploadStream="); + roctracer::hip_support::detail::operator<<(out, v.uploadStream); + std::operator<<(out, ", "); + } + if (std::string("hipGraphInstantiateParams::result_out").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "result_out="); + roctracer::hip_support::detail::operator<<(out, v.result_out); + std::operator<<(out, ", "); + } + if (std::string("hipGraphInstantiateParams::flags").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "flags="); + roctracer::hip_support::detail::operator<<(out, v.flags); + std::operator<<(out, ", "); + } + if (std::string("hipGraphInstantiateParams::errNode_out").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "errNode_out="); + roctracer::hip_support::detail::operator<<(out, v.errNode_out); + } + }; + HIP_depth_max_cnt--; + std::operator<<(out, '}'); + return out; +} inline static std::ostream& operator<<(std::ostream& out, const hipMemAllocationProp& v) { std::operator<<(out, '{'); @@ -3513,6 +3552,35 @@ inline static std::ostream& operator<<(std::ostream& out, const hipGraphNodePara std::operator<<(out, '}'); return out; } +inline static std::ostream& operator<<(std::ostream& out, const hipGraphEdgeData& v) +{ + std::operator<<(out, '{'); + HIP_depth_max_cnt++; + if (HIP_depth_max == -1 || HIP_depth_max_cnt <= HIP_depth_max) { + if (std::string("hipGraphEdgeData::type").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "type="); + roctracer::hip_support::detail::operator<<(out, v.type); + std::operator<<(out, ", "); + } + if (std::string("hipGraphEdgeData::to_port").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "to_port="); + roctracer::hip_support::detail::operator<<(out, v.to_port); + std::operator<<(out, ", "); + } + if 
(std::string("hipGraphEdgeData::reserved").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "reserved="); + roctracer::hip_support::detail::operator<<(out, 0); + std::operator<<(out, ", "); + } + if (std::string("hipGraphEdgeData::from_port").find(HIP_structs_regex) != std::string::npos) { + std::operator<<(out, "from_port="); + roctracer::hip_support::detail::operator<<(out, v.from_port); + } + }; + HIP_depth_max_cnt--; + std::operator<<(out, '}'); + return out; +} inline static std::ostream& operator<<(std::ostream& out, const hipDeviceProp_tR0000& v) { std::operator<<(out, '{'); @@ -4352,7 +4420,7 @@ inline static std::ostream& operator<<(std::ostream& out, const hipAccessPolicyW return out; } -inline static std::ostream& operator<<(std::ostream& out, const hipKernelNodeAttrValue& v) +inline static std::ostream& operator<<(std::ostream& out, const hipLaunchAttributeValue& v) { roctracer::hip_support::detail::operator<<(out, v); return out; @@ -4364,6 +4432,12 @@ inline static std::ostream& operator<<(std::ostream& out, const HIP_MEMSET_NODE_ return out; } +inline static std::ostream& operator<<(std::ostream& out, const hipGraphInstantiateParams& v) +{ + roctracer::hip_support::detail::operator<<(out, v); + return out; +} + inline static std::ostream& operator<<(std::ostream& out, const hipMemAllocationProp& v) { roctracer::hip_support::detail::operator<<(out, v); @@ -4424,6 +4498,12 @@ inline static std::ostream& operator<<(std::ostream& out, const hipGraphNodePara return out; } +inline static std::ostream& operator<<(std::ostream& out, const hipGraphEdgeData& v) +{ + roctracer::hip_support::detail::operator<<(out, v); + return out; +} + inline static std::ostream& operator<<(std::ostream& out, const hipDeviceProp_tR0000& v) { roctracer::hip_support::detail::operator<<(out, v); diff --git a/third_party/amd/backend/include/roctracer/hsa_ostream_ops.h b/third_party/amd/backend/include/roctracer/hsa_ostream_ops.h index 
353ddc6ba4ca..7dfd39dd099f 100644 --- a/third_party/amd/backend/include/roctracer/hsa_ostream_ops.h +++ b/third_party/amd/backend/include/roctracer/hsa_ostream_ops.h @@ -785,6 +785,236 @@ inline static std::ostream& operator<<(std::ostream& out, const hsa_ext_images_1 std::operator<<(out, '}'); return out; } +inline static std::ostream& operator<<(std::ostream& out, const perf_sample_hosttrap_v1_t& v) +{ + std::operator<<(out, '{'); + HSA_depth_max_cnt++; + if (HSA_depth_max == -1 || HSA_depth_max_cnt <= HSA_depth_max) { + if (std::string("perf_sample_hosttrap_v1_t::correlation_id").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "correlation_id="); + roctracer::hsa_support::detail::operator<<(out, v.correlation_id); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::timestamp").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "timestamp="); + roctracer::hsa_support::detail::operator<<(out, v.timestamp); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::reserved1").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "reserved1="); + roctracer::hsa_support::detail::operator<<(out, v.reserved1); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::reserved0").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "reserved0="); + roctracer::hsa_support::detail::operator<<(out, v.reserved0); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::hw_id").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "hw_id="); + roctracer::hsa_support::detail::operator<<(out, v.hw_id); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::reserved").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "reserved="); + roctracer::hsa_support::detail::operator<<(out, v.reserved); + std::operator<<(out, ", "); + } + if 
(std::string("perf_sample_hosttrap_v1_t::chiplet").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "chiplet="); + roctracer::hsa_support::detail::operator<<(out, v.chiplet); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::wave_in_wg").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "wave_in_wg="); + roctracer::hsa_support::detail::operator<<(out, v.wave_in_wg); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::workgroup_id_z").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "workgroup_id_z="); + roctracer::hsa_support::detail::operator<<(out, v.workgroup_id_z); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::workgroup_id_y").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "workgroup_id_y="); + roctracer::hsa_support::detail::operator<<(out, v.workgroup_id_y); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::workgroup_id_x").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "workgroup_id_x="); + roctracer::hsa_support::detail::operator<<(out, v.workgroup_id_x); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::exec_mask").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "exec_mask="); + roctracer::hsa_support::detail::operator<<(out, v.exec_mask); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_hosttrap_v1_t::pc").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "pc="); + roctracer::hsa_support::detail::operator<<(out, v.pc); + } + }; + HSA_depth_max_cnt--; + std::operator<<(out, '}'); + return out; +} +inline static std::ostream& operator<<(std::ostream& out, const perf_sample_snapshot_v1_t& v) +{ + std::operator<<(out, '{'); + HSA_depth_max_cnt++; + if (HSA_depth_max == -1 || HSA_depth_max_cnt <= HSA_depth_max) { + if 
(std::string("perf_sample_snapshot_v1_t::correlation_id").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "correlation_id="); + roctracer::hsa_support::detail::operator<<(out, v.correlation_id); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::timestamp").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "timestamp="); + roctracer::hsa_support::detail::operator<<(out, v.timestamp); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::perf_snapshot_data2").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "perf_snapshot_data2="); + roctracer::hsa_support::detail::operator<<(out, v.perf_snapshot_data2); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::perf_snapshot_data1").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "perf_snapshot_data1="); + roctracer::hsa_support::detail::operator<<(out, v.perf_snapshot_data1); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::perf_snapshot_data").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "perf_snapshot_data="); + roctracer::hsa_support::detail::operator<<(out, v.perf_snapshot_data); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::hw_id").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "hw_id="); + roctracer::hsa_support::detail::operator<<(out, v.hw_id); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::reserved").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "reserved="); + roctracer::hsa_support::detail::operator<<(out, v.reserved); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::chiplet").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "chiplet="); + roctracer::hsa_support::detail::operator<<(out, v.chiplet); + 
std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::wave_in_wg").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "wave_in_wg="); + roctracer::hsa_support::detail::operator<<(out, v.wave_in_wg); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::workgroup_id_z").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "workgroup_id_z="); + roctracer::hsa_support::detail::operator<<(out, v.workgroup_id_z); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::workgroup_id_y").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "workgroup_id_y="); + roctracer::hsa_support::detail::operator<<(out, v.workgroup_id_y); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::workgroup_id_x").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "workgroup_id_x="); + roctracer::hsa_support::detail::operator<<(out, v.workgroup_id_x); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::exec_mask").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "exec_mask="); + roctracer::hsa_support::detail::operator<<(out, v.exec_mask); + std::operator<<(out, ", "); + } + if (std::string("perf_sample_snapshot_v1_t::pc").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "pc="); + roctracer::hsa_support::detail::operator<<(out, v.pc); + } + }; + HSA_depth_max_cnt--; + std::operator<<(out, '}'); + return out; +} +inline static std::ostream& operator<<(std::ostream& out, const hsa_ven_amd_pcs_t& v) +{ + std::operator<<(out, '{'); + HSA_depth_max_cnt++; + if (HSA_depth_max == -1 || HSA_depth_max_cnt <= HSA_depth_max) { + if (std::string("hsa_ven_amd_pcs_t::handle").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "handle="); + roctracer::hsa_support::detail::operator<<(out, v.handle); + } + }; + HSA_depth_max_cnt--; + 
std::operator<<(out, '}'); + return out; +} +inline static std::ostream& operator<<(std::ostream& out, const hsa_ven_amd_pcs_configuration_t& v) +{ + std::operator<<(out, '{'); + HSA_depth_max_cnt++; + if (HSA_depth_max == -1 || HSA_depth_max_cnt <= HSA_depth_max) { + if (std::string("hsa_ven_amd_pcs_configuration_t::flags").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "flags="); + roctracer::hsa_support::detail::operator<<(out, v.flags); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pcs_configuration_t::max_interval").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "max_interval="); + roctracer::hsa_support::detail::operator<<(out, v.max_interval); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pcs_configuration_t::min_interval").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "min_interval="); + roctracer::hsa_support::detail::operator<<(out, v.min_interval); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pcs_configuration_t::units").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "units="); + roctracer::hsa_support::detail::operator<<(out, v.units); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pcs_configuration_t::method").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "method="); + roctracer::hsa_support::detail::operator<<(out, v.method); + } + }; + HSA_depth_max_cnt--; + std::operator<<(out, '}'); + return out; +} +inline static std::ostream& operator<<(std::ostream& out, const hsa_ven_amd_pc_sampling_1_00_pfn_t& v) +{ + std::operator<<(out, '{'); + HSA_depth_max_cnt++; + if (HSA_depth_max == -1 || HSA_depth_max_cnt <= HSA_depth_max) { + if (std::string("hsa_ven_amd_pc_sampling_1_00_pfn_t::hsa_ven_amd_pcs_flush").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "hsa_ven_amd_pcs_flush="); + roctracer::hsa_support::detail::operator<<(out, 
v.hsa_ven_amd_pcs_flush); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pc_sampling_1_00_pfn_t::hsa_ven_amd_pcs_stop").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "hsa_ven_amd_pcs_stop="); + roctracer::hsa_support::detail::operator<<(out, v.hsa_ven_amd_pcs_stop); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pc_sampling_1_00_pfn_t::hsa_ven_amd_pcs_start").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "hsa_ven_amd_pcs_start="); + roctracer::hsa_support::detail::operator<<(out, v.hsa_ven_amd_pcs_start); + std::operator<<(out, ", "); + } + if (std::string("hsa_ven_amd_pc_sampling_1_00_pfn_t::hsa_ven_amd_pcs_destroy").find(HSA_structs_regex) != std::string::npos) { + std::operator<<(out, "hsa_ven_amd_pcs_destroy="); + roctracer::hsa_support::detail::operator<<(out, v.hsa_ven_amd_pcs_destroy); + } + }; + HSA_depth_max_cnt--; + std::operator<<(out, '}'); + return out; +} inline static std::ostream& operator<<(std::ostream& out, const hsa_amd_vendor_packet_header_t& v) { std::operator<<(out, '{'); @@ -1360,6 +1590,36 @@ inline static std::ostream& operator<<(std::ostream& out, const hsa_ext_images_1 return out; } +inline static std::ostream& operator<<(std::ostream& out, const perf_sample_hosttrap_v1_t& v) +{ + roctracer::hsa_support::detail::operator<<(out, v); + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const perf_sample_snapshot_v1_t& v) +{ + roctracer::hsa_support::detail::operator<<(out, v); + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const hsa_ven_amd_pcs_t& v) +{ + roctracer::hsa_support::detail::operator<<(out, v); + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const hsa_ven_amd_pcs_configuration_t& v) +{ + roctracer::hsa_support::detail::operator<<(out, v); + return out; +} + +inline static std::ostream& operator<<(std::ostream& out, const hsa_ven_amd_pc_sampling_1_00_pfn_t& v) 
+{ + roctracer::hsa_support::detail::operator<<(out, v); + return out; +} + inline static std::ostream& operator<<(std::ostream& out, const hsa_amd_vendor_packet_header_t& v) { roctracer::hsa_support::detail::operator<<(out, v); diff --git a/third_party/amd/backend/include/roctracer/hsa_prof_str.h b/third_party/amd/backend/include/roctracer/hsa_prof_str.h index 28b2bf54d7c4..3747659f7924 100644 --- a/third_party/amd/backend/include/roctracer/hsa_prof_str.h +++ b/third_party/amd/backend/include/roctracer/hsa_prof_str.h @@ -22,9 +22,9 @@ /* HSA API tracing primitives 'CoreApi', header 'hsa.h', 125 funcs - 'AmdExt', header 'hsa_ext_amd.h', 68 funcs + 'AmdExt', header 'hsa_ext_amd.h', 70 funcs 'ImageExt', header 'hsa_ext_image.h', 13 funcs - 'AmdExt', header 'hsa_api_trace.h', 68 funcs + 'AmdExt', header 'hsa_api_trace.h', 70 funcs */ #ifndef HSA_PROF_STR_H_ @@ -229,24 +229,26 @@ enum hsa_api_id_t { HSA_API_ID_hsa_amd_vmem_retain_alloc_handle = 190, HSA_API_ID_hsa_amd_vmem_get_alloc_properties_from_handle = 191, HSA_API_ID_hsa_amd_agent_set_async_scratch_limit = 192, + HSA_API_ID_hsa_amd_queue_get_info = 193, + HSA_API_ID_hsa_amd_vmem_address_reserve_align = 194, /* block: ImageExt API */ - HSA_API_ID_hsa_ext_image_get_capability = 193, - HSA_API_ID_hsa_ext_image_data_get_info = 194, - HSA_API_ID_hsa_ext_image_create = 195, - HSA_API_ID_hsa_ext_image_import = 196, - HSA_API_ID_hsa_ext_image_export = 197, - HSA_API_ID_hsa_ext_image_copy = 198, - HSA_API_ID_hsa_ext_image_clear = 199, - HSA_API_ID_hsa_ext_image_destroy = 200, - HSA_API_ID_hsa_ext_sampler_create = 201, - HSA_API_ID_hsa_ext_sampler_destroy = 202, - HSA_API_ID_hsa_ext_image_get_capability_with_layout = 203, - HSA_API_ID_hsa_ext_image_data_get_info_with_layout = 204, - HSA_API_ID_hsa_ext_image_create_with_layout = 205, + HSA_API_ID_hsa_ext_image_get_capability = 195, + HSA_API_ID_hsa_ext_image_data_get_info = 196, + HSA_API_ID_hsa_ext_image_create = 197, + HSA_API_ID_hsa_ext_image_import = 198, + 
HSA_API_ID_hsa_ext_image_export = 199, + HSA_API_ID_hsa_ext_image_copy = 200, + HSA_API_ID_hsa_ext_image_clear = 201, + HSA_API_ID_hsa_ext_image_destroy = 202, + HSA_API_ID_hsa_ext_sampler_create = 203, + HSA_API_ID_hsa_ext_sampler_destroy = 204, + HSA_API_ID_hsa_ext_image_get_capability_with_layout = 205, + HSA_API_ID_hsa_ext_image_data_get_info_with_layout = 206, + HSA_API_ID_hsa_ext_image_create_with_layout = 207, - HSA_API_ID_DISPATCH = 206, - HSA_API_ID_NUMBER = 207, + HSA_API_ID_DISPATCH = 208, + HSA_API_ID_NUMBER = 209, }; /* Declarations of APIs intended for use only by tools. */ typedef void (*hsa_amd_queue_intercept_packet_writer)(const void*, uint64_t); @@ -261,9 +263,9 @@ struct hsa_api_data_t { uint32_t phase; union { uint64_t uint64_t_retval; - uint32_t uint32_t_retval; - hsa_signal_value_t hsa_signal_value_t_retval; hsa_status_t hsa_status_t_retval; + hsa_signal_value_t hsa_signal_value_t_retval; + uint32_t uint32_t_retval; }; union { /* block: CoreApi API */ @@ -1236,6 +1238,18 @@ struct hsa_api_data_t { hsa_agent_t agent; size_t threshold; } hsa_amd_agent_set_async_scratch_limit; + struct { + hsa_queue_t* queue; + hsa_queue_info_attribute_t attribute; + void* value; + } hsa_amd_queue_get_info; + struct { + void** va; + size_t size; + uint64_t address; + uint64_t alignment; + uint64_t flags; + } hsa_amd_vmem_address_reserve_align; /* block: ImageExt API */ struct { @@ -2888,6 +2902,24 @@ inline std::ostream& operator<< (std::ostream& out, const hsa_api_data_pair_t& d out << ") = " << api_data.hsa_status_t_retval; break; } + case HSA_API_ID_hsa_amd_queue_get_info: { + out << "hsa_amd_queue_get_info("; + out << api_data.args.hsa_amd_queue_get_info.queue << ", "; + out << api_data.args.hsa_amd_queue_get_info.attribute << ", "; + out << api_data.args.hsa_amd_queue_get_info.value; + out << ") = " << api_data.hsa_status_t_retval; + break; + } + case HSA_API_ID_hsa_amd_vmem_address_reserve_align: { + out << "hsa_amd_vmem_address_reserve_align("; + out << 
api_data.args.hsa_amd_vmem_address_reserve_align.va << ", "; + out << api_data.args.hsa_amd_vmem_address_reserve_align.size << ", "; + out << api_data.args.hsa_amd_vmem_address_reserve_align.address << ", "; + out << api_data.args.hsa_amd_vmem_address_reserve_align.alignment << ", "; + out << api_data.args.hsa_amd_vmem_address_reserve_align.flags; + out << ") = " << api_data.hsa_status_t_retval; + break; + } /* block: ImageExt API */ case HSA_API_ID_hsa_ext_image_get_capability: {