From 36c38ad970a769ace00816cd4c9934812a916877 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 21 Oct 2022 04:04:38 +0000 Subject: [PATCH 001/118] wmma_op + unit test --- include/ck/ck.hpp | 11 +- include/ck/utility/amd_wmma.hpp | 136 ++++++++++++++++++++++++ include/ck/utility/data_type.hpp | 5 + test/CMakeLists.txt | 1 + test/wmma_op/CMakeLists.txt | 2 + test/wmma_op/wmma_op.cpp | 176 +++++++++++++++++++++++++++++++ 6 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 include/ck/utility/amd_wmma.hpp create mode 100644 test/wmma_op/CMakeLists.txt create mode 100644 test/wmma_op/wmma_op.cpp diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index ad85e233825..8054fa9cdf8 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -25,7 +25,7 @@ // check GPU target #ifdef __HIP_DEVICE_COMPILE__ #if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ - defined(__gfx90a__) || defined(__gfx1030__)) + defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__)) #error Not supported target #endif #endif @@ -38,6 +38,8 @@ #define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 #elif defined(__gfx1030__) // for GPU code #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 +#elif defined(__gfx1100__) // for GPU code +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000 #endif // FMA instruction @@ -62,6 +64,13 @@ #define CK_USE_AMD_MFMA_BF16_1K_OP #endif +// WMMA instruction +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_USE_AMD_WMMA +#elif defined(__gfx1100__) // for GPU code +#define CK_USE_AMD_WMMA +#endif + // buffer load #define CK_USE_AMD_BUFFER_LOAD 1 diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp new file mode 100644 index 00000000000..f88d3ac87cd --- /dev/null +++ b/include/ck/utility/amd_wmma.hpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef CK_AMD_WMMA_HPP +#define CK_AMD_WMMA_HPP + +#include "data_type.hpp" + +namespace ck { + +// wave32 only +// src: fp16, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_f16_w32; + +template <> +struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> +{ + template + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + +// src: bf16, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_bf16_w32; + +template <> +struct intrin_wmma_f32_16x16x16_bf16_w32<16, 16> +{ + template + __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + +// src: fp16, dst: fp16 +template +struct intrin_wmma_f16_16x16x16_f16_w32; + +template <> +struct intrin_wmma_f16_16x16x16_f16_w32<16, 16> +{ + template + __device__ static void + Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c, const bool opsel) + { + // opsel usage + // false: D0.[0:15] = result + // true : D0.[16:31]= result + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], opsel); + } +}; + +// src: bf16, dst: bf32 +template +struct intrin_wmma_bf16_16x16x16_bf16_w32; + +template <> +struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16> +{ + template + __device__ static void + Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c, const bool opsel) + { + // opsel usage + // false: D0.[0:15] = result + // true : D0.[16:31]= result + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], opsel); + } +}; + +// src: iu8, dst: i32 +template +struct 
intrin_wmma_i32_16x16x16_iu8_w32; + +template <> +struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16> +{ + template + __device__ static void Run(const bool neg_a, + const int8x16_t& reg_a, + const bool neg_b, + const int8x16_t& reg_b, + FloatC& reg_c, + const bool clamp) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32( + neg_a, + bit_cast(reg_a), + neg_b, + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + clamp); + } +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +// src: iu4, dst: i32 +template +struct intrin_wmma_i32_16x16x16_iu4_w32; + +template <> +struct intrin_wmma_i32_16x16x16_iu4_w32<16, 16> +{ + template + __device__ static void Run(const bool neg_a, + const int4x16_t& reg_a, + const bool neg_b, + const int4x16_t& reg_b, + FloatC& reg_c, + const bool clamp) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32( + neg_a, + bit_cast(reg_a), + neg_b, + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + clamp); + } +}; +#endif +} // namespace ck +#endif diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 40ee8b617e2..9fc55423750 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -942,6 +942,11 @@ using int8x16_t = typename vector_type::type; using int8x32_t = typename vector_type::type; using int8x64_t = typename vector_type::type; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +// i4 +using int4x16_t = typename vector_type::type; +#endif + // Convert X to Y template __host__ __device__ constexpr Y type_convert(X x) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e1b0b9c6e67..264a8352392 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -52,3 +52,4 @@ add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) add_subdirectory(normalization) add_subdirectory(data_type) +add_subdirectory(wmma_op) diff --git a/test/wmma_op/CMakeLists.txt b/test/wmma_op/CMakeLists.txt 
new file mode 100644 index 00000000000..e553253c625 --- /dev/null +++ b/test/wmma_op/CMakeLists.txt @@ -0,0 +1,2 @@ +add_test_executable(test_wmma_op wmma_op.cpp) +target_link_libraries(test_wmma_op PRIVATE utility) diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp new file mode 100644 index 00000000000..86d95b56e0b --- /dev/null +++ b/test/wmma_op/wmma_op.cpp @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/amd_wmma.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +__global__ void matmul(const half_t* a, const half_t* b, float* c) +{ + const int lIdx = threadIdx.x; + + // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and + // b a_frag will store one column of the 16x16 matrix tile b_frag will store one row of the + // 16x16 matrix tile + half16_t a_frag = {}; + half16_t b_frag = {}; + // initialize c fragment to 0 + StaticBufferTupleOfVector c_thread_buf_; + + // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in gfx11 + // see https://atlvsp3.amd.com/sp3_gfx11_5_instructions.pdf page 482 + // TODO: remove this dependency in gfx12 https://ontrack-internal.amd.com/browse/DEGFXSP3-101 + const int lane = lIdx % 16; + + for(int ele = 0; ele < 16; ++ele) + { + b_frag[ele] = b[16 * lane + ele]; + } + // follow origin design + for(int ele = 0; ele < 16; ++ele) + { + a_frag[ele] = a[16 * lane + ele]; + } + + // sync threads, similar to mma_sync + __syncthreads(); + intrin_wmma_f32_16x16x16_f16_w32<16, 16>::Run( + a_frag, b_frag, c_thread_buf_.GetVectorTypeReference(Number<0>{})); + __syncthreads(); + // wait for results, similar to mma_sync + static_for<0, 8, 
1>{}([&](auto ele) { + const int r = ele * 2 + (lIdx / 16); + // store results from unpacked c_thread_buf_ output + c[16 * r + lane] = c_thread_buf_[Number{}]; + }); +} + +__global__ void matmul_swizzle_a(const half_t* a, const half_t* b, float* c) +{ + const int lIdx = threadIdx.x; + + half16_t a_frag = {}; + half16_t b_frag = {}; + StaticBufferTupleOfVector c_thread_buf_; + + const int lane = lIdx % 16; + + for(int ele = 0; ele < 16; ++ele) + { + b_frag[ele] = b[16 * lane + ele]; + } + + const int offset_m = (((lane & 1) << 3) | (lane >> 1)); + for(int ele = 0; ele < 16; ++ele) + { + a_frag[ele] = a[16 * offset_m + ele]; + } + + __syncthreads(); + intrin_wmma_f32_16x16x16_f16_w32<16, 16>::Run( + a_frag, b_frag, c_thread_buf_.GetVectorTypeReference(Number<0>{})); + __syncthreads(); + + static_for<0, 8, 1>{}([&](auto ele) { + const int blk = lIdx / 16; + const int r = ele; + c[16 * 8 * blk + 16 * r + lane] = c_thread_buf_[Number{}]; + }); +} +} // namespace ck + +int main(int, char*[]) +{ + std::vector host_a(16 * 16); + std::vector host_b(16 * 16); + std::vector host_c(16 * 16); + std::vector wmma_c(16 * 16); + std::vector wmma_c_swizzle_a(16 * 16); + uint64_t num_element = 256; + + // generate matrix a + for(int i_m = 0; i_m < 16; i_m++) + { + for(int i_k = 0; i_k < 16; i_k++) + { + host_a[i_m * 16 + i_k] = float(i_m + 1) / 99.0 + (float(i_k + 1) / 100); + // host_a[i_m * 16 + i_k] = float(i_k); + } + } + + // generate matrix b + for(int i_n = 0; i_n < 16; i_n++) + { + for(int i_k = 0; i_k < 16; i_k++) + { + host_b[i_n * 16 + i_k] = float(i_n + 1) / 98.0 + (float(i_k + 1) / 100); + // host_b[i_n * 16 + i_k] = 1.0; + } + } + + // run mk_nk_mn gemm on cpu + for(int i_m = 0; i_m < 16; i_m++) + { + for(int i_n = 0; i_n < 16; i_n++) + { + for(int i_k = 0; i_k < 16; i_k++) + { + host_c[i_m * 16 + i_n] += host_a[i_m * 16 + i_k] * host_b[i_n * 16 + i_k]; + } + } + } + + DeviceMem device_a(sizeof(ck::half_t) * num_element); + DeviceMem device_b(sizeof(ck::half_t) * 
num_element); + DeviceMem device_c(sizeof(float) * num_element); + + std::vector fp16_a(16 * 16); + std::vector fp16_b(16 * 16); + // convert fp32 a and b into fp16 on host + for(int i = 0; i < 16 * 16; i++) + { + fp16_a[i] = __float2half_rn(host_a[i]); + fp16_b[i] = __float2half_rn(host_b[i]); + } + + device_a.ToDevice(fp16_a.data()); + device_b.ToDevice(fp16_b.data()); + + // run single wave wmma on GPU + ck::matmul<<<1, 32>>>(static_cast(device_a.GetDeviceBuffer()), + static_cast(device_b.GetDeviceBuffer()), + static_cast(device_c.GetDeviceBuffer())); + + device_c.FromDevice(wmma_c.data()); + + bool res = ck::utils::check_err(wmma_c, host_c, "Error: Incorrect results!", 1e-2); + + // run single wave wmma_swizzle_a on GPU + ck::matmul_swizzle_a<<<1, 32>>>(static_cast(device_a.GetDeviceBuffer()), + static_cast(device_b.GetDeviceBuffer()), + static_cast(device_c.GetDeviceBuffer())); + device_c.FromDevice(wmma_c_swizzle_a.data()); + + bool res_swizzle_a = + ck::utils::check_err(wmma_c_swizzle_a, host_c, "Error: Incorrect results!", 1e-2); + + if(res && res_swizzle_a) + { + std::cout << "test single wave wmma: Pass" << std::endl; + return 0; + } + else + { + std::cout << "test single wave wmma: Fail" << std::endl; + return -1; + } +} From 7dca8463152ec077ad21e3edbe7122f5438dddb5 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 21 Oct 2022 07:46:45 +0000 Subject: [PATCH 002/118] add arch limitation to wmma test --- test/wmma_op/wmma_op.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp index 86d95b56e0b..9a7c4316c2c 100644 --- a/test/wmma_op/wmma_op.cpp +++ b/test/wmma_op/wmma_op.cpp @@ -16,6 +16,7 @@ namespace ck { __global__ void matmul(const half_t* a, const half_t* b, float* c) { +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) const int lIdx = threadIdx.x; // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and 
@@ -52,10 +53,16 @@ __global__ void matmul(const half_t* a, const half_t* b, float* c) // store results from unpacked c_thread_buf_ output c[16 * r + lane] = c_thread_buf_[Number{}]; }); +#else + ignore = a; + ignore = b; + ignore = c; +#endif // end of if (defined(__gfx1100__)) } __global__ void matmul_swizzle_a(const half_t* a, const half_t* b, float* c) { +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) const int lIdx = threadIdx.x; half16_t a_frag = {}; @@ -85,6 +92,11 @@ __global__ void matmul_swizzle_a(const half_t* a, const half_t* b, float* c) const int r = ele; c[16 * 8 * blk + 16 * r + lane] = c_thread_buf_[Number{}]; }); +#else + ignore = a; + ignore = b; + ignore = c; +#endif // end of if (defined(__gfx1100__)) } } // namespace ck @@ -152,16 +164,20 @@ int main(int, char*[]) device_c.FromDevice(wmma_c.data()); - bool res = ck::utils::check_err(wmma_c, host_c, "Error: Incorrect results!", 1e-2); - // run single wave wmma_swizzle_a on GPU ck::matmul_swizzle_a<<<1, 32>>>(static_cast(device_a.GetDeviceBuffer()), static_cast(device_b.GetDeviceBuffer()), static_cast(device_c.GetDeviceBuffer())); device_c.FromDevice(wmma_c_swizzle_a.data()); - bool res_swizzle_a = + // result check + bool res = true; + bool res_swizzle_a = true; +#if(defined(__gfx1100__)) + res = ck::utils::check_err(wmma_c, host_c, "Error: Incorrect results!", 1e-2); + res_swizzle_a = ck::utils::check_err(wmma_c_swizzle_a, host_c, "Error: Incorrect results!", 1e-2); +#endif // end of if (defined(__gfx1100__)) if(res && res_swizzle_a) { From 049cc8afcf880293d4a1bc0ee264f0c681b1b171 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 21 Oct 2022 08:46:11 +0000 Subject: [PATCH 003/118] change arch limitation --- include/ck/utility/amd_wmma.hpp | 2 +- test/CMakeLists.txt | 4 +++- test/wmma_op/wmma_op.cpp | 16 +--------------- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index f88d3ac87cd..efb0923ab72 
100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -5,7 +5,7 @@ #define CK_AMD_WMMA_HPP #include "data_type.hpp" - +// TODO: Add arch limitation namespace ck { // wave32 only diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 264a8352392..90308fa59ff 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -52,4 +52,6 @@ add_subdirectory(block_to_ctile_map) add_subdirectory(softmax) add_subdirectory(normalization) add_subdirectory(data_type) -add_subdirectory(wmma_op) +if(GPU_TARGETS MATCHES "gfx1100") + add_subdirectory(wmma_op) +endif() diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp index 9a7c4316c2c..7acea2ef105 100644 --- a/test/wmma_op/wmma_op.cpp +++ b/test/wmma_op/wmma_op.cpp @@ -16,7 +16,6 @@ namespace ck { __global__ void matmul(const half_t* a, const half_t* b, float* c) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) const int lIdx = threadIdx.x; // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and @@ -53,16 +52,10 @@ __global__ void matmul(const half_t* a, const half_t* b, float* c) // store results from unpacked c_thread_buf_ output c[16 * r + lane] = c_thread_buf_[Number{}]; }); -#else - ignore = a; - ignore = b; - ignore = c; -#endif // end of if (defined(__gfx1100__)) } __global__ void matmul_swizzle_a(const half_t* a, const half_t* b, float* c) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) const int lIdx = threadIdx.x; half16_t a_frag = {}; @@ -92,11 +85,6 @@ __global__ void matmul_swizzle_a(const half_t* a, const half_t* b, float* c) const int r = ele; c[16 * 8 * blk + 16 * r + lane] = c_thread_buf_[Number{}]; }); -#else - ignore = a; - ignore = b; - ignore = c; -#endif // end of if (defined(__gfx1100__)) } } // namespace ck @@ -173,11 +161,9 @@ int main(int, char*[]) // result check bool res = true; bool res_swizzle_a = true; -#if(defined(__gfx1100__)) - res = ck::utils::check_err(wmma_c, host_c, 
"Error: Incorrect results!", 1e-2); + res = ck::utils::check_err(wmma_c, host_c, "Error: Incorrect results!", 1e-2); res_swizzle_a = ck::utils::check_err(wmma_c_swizzle_a, host_c, "Error: Incorrect results!", 1e-2); -#endif // end of if (defined(__gfx1100__)) if(res && res_swizzle_a) { From 790e21ecc7edeeff122e788ef8e36684b6c3b99b Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 28 Oct 2022 16:10:55 +0000 Subject: [PATCH 004/118] Refactor + Add all type unit test(int4 compile failed) --- include/ck/utility/amd_wmma.hpp | 50 ++--- test/wmma_op/wmma_op.cpp | 216 +++++-------------- test/wmma_op/wmma_op_util.hpp | 357 ++++++++++++++++++++++++++++++++ 3 files changed, 428 insertions(+), 195 deletions(-) create mode 100644 test/wmma_op/wmma_op_util.hpp diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index efb0923ab72..ee3759d7e48 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -41,58 +41,51 @@ struct intrin_wmma_f32_16x16x16_bf16_w32<16, 16> }; // src: fp16, dst: fp16 -template +template struct intrin_wmma_f16_16x16x16_f16_w32; -template <> -struct intrin_wmma_f16_16x16x16_f16_w32<16, 16> +template +struct intrin_wmma_f16_16x16x16_f16_w32<16, 16, Opsel> { template - __device__ static void - Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c, const bool opsel) + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) { // opsel usage // false: D0.[0:15] = result // true : D0.[16:31]= result reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], opsel); + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); } }; -// src: bf16, dst: bf32 -template +// src: bf16, dst: bf16 +template struct intrin_wmma_bf16_16x16x16_bf16_w32; -template <> -struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16> +template +struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16, Opsel> { template - 
__device__ static void - Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c, const bool opsel) + __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c) { // opsel usage // false: D0.[0:15] = result // true : D0.[16:31]= result reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}], opsel); + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); } }; // src: iu8, dst: i32 -template +template struct intrin_wmma_i32_16x16x16_iu8_w32; -template <> -struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16> +template +struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16, neg_a, neg_b, clamp> { template - __device__ static void Run(const bool neg_a, - const int8x16_t& reg_a, - const bool neg_b, - const int8x16_t& reg_b, - FloatC& reg_c, - const bool clamp) + __device__ static void Run(const int8x16_t& reg_a, const int8x16_t& reg_b, FloatC& reg_c) { reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32( @@ -107,19 +100,14 @@ struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16> #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 // src: iu4, dst: i32 -template +template struct intrin_wmma_i32_16x16x16_iu4_w32; -template <> -struct intrin_wmma_i32_16x16x16_iu4_w32<16, 16> +template +struct intrin_wmma_i32_16x16x16_iu4_w32<16, 16, neg_a, neg_b, clamp> { template - __device__ static void Run(const bool neg_a, - const int4x16_t& reg_a, - const bool neg_b, - const int4x16_t& reg_b, - FloatC& reg_c, - const bool clamp) + __device__ static void Run(const int4x16_t& reg_a, const int4x16_t& reg_b, FloatC& reg_c) { reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32( diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp index 7acea2ef105..34ebf41a3ca 100644 --- a/test/wmma_op/wmma_op.cpp +++ b/test/wmma_op/wmma_op.cpp @@ -1,178 +1,66 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced 
Micro Devices, Inc. All rights reserved. +#include +#include #include #include -#include -#include +#include +#include #include "ck/ck.hpp" -#include "ck/utility/amd_wmma.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" - -namespace ck { -__global__ void matmul(const half_t* a, const half_t* b, float* c) +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "test/wmma_op/wmma_op_util.hpp" + +template +bool run_test() { - const int lIdx = threadIdx.x; - - // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and - // b a_frag will store one column of the 16x16 matrix tile b_frag will store one row of the - // 16x16 matrix tile - half16_t a_frag = {}; - half16_t b_frag = {}; - // initialize c fragment to 0 - StaticBufferTupleOfVector c_thread_buf_; - - // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in gfx11 - // see https://atlvsp3.amd.com/sp3_gfx11_5_instructions.pdf page 482 - // TODO: remove this dependency in gfx12 https://ontrack-internal.amd.com/browse/DEGFXSP3-101 - const int lane = lIdx % 16; - - for(int ele = 0; ele < 16; ++ele) - { - b_frag[ele] = b[16 * lane + ele]; - } - // follow origin design - for(int ele = 0; ele < 16; ++ele) - { - a_frag[ele] = a[16 * lane + ele]; - } - - // sync threads, similar to mma_sync - __syncthreads(); - intrin_wmma_f32_16x16x16_f16_w32<16, 16>::Run( - a_frag, b_frag, c_thread_buf_.GetVectorTypeReference(Number<0>{})); - __syncthreads(); - // wait for results, similar to mma_sync - static_for<0, 8, 1>{}([&](auto ele) { - const int r = ele * 2 + (lIdx / 16); - // store results from unpacked c_thread_buf_ output - c[16 * r + lane] = c_thread_buf_[Number{}]; + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = 
ck::tensor_layout::gemm::ColumnMajor; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + bool pass = true; + + const auto matmul_default = ck::wmma_op_util::matmul; + const auto matmul_swizzle_a = + ck::wmma_op_util::matmul_swizzle_a; + + const auto wmma_kernel_container = std::make_tuple(matmul_default, matmul_swizzle_a); + + ck::static_for<0, 2, 1>{}([&](auto i) { + pass &= + ck::wmma_op_util::TestWmma{}>(wmma_kernel_container)), + SrcType, + SrcType, + DstType, + GPUAccType, + CPUAccType, + decltype(Row{}), + decltype(Col{}), + decltype(Row{}), + PassThrough, + PassThrough, + PassThrough, + AccNum>{}(std::get{}>(wmma_kernel_container)); }); -} - -__global__ void matmul_swizzle_a(const half_t* a, const half_t* b, float* c) -{ - const int lIdx = threadIdx.x; - - half16_t a_frag = {}; - half16_t b_frag = {}; - StaticBufferTupleOfVector c_thread_buf_; - - const int lane = lIdx % 16; - - for(int ele = 0; ele < 16; ++ele) - { - b_frag[ele] = b[16 * lane + ele]; - } - - const int offset_m = (((lane & 1) << 3) | (lane >> 1)); - for(int ele = 0; ele < 16; ++ele) - { - a_frag[ele] = a[16 * offset_m + ele]; - } - __syncthreads(); - intrin_wmma_f32_16x16x16_f16_w32<16, 16>::Run( - a_frag, b_frag, c_thread_buf_.GetVectorTypeReference(Number<0>{})); - __syncthreads(); - - static_for<0, 8, 1>{}([&](auto ele) { - const int blk = lIdx / 16; - const int r = ele; - c[16 * 8 * blk + 16 * r + lane] = c_thread_buf_[Number{}]; - }); + return pass ? 
1 : 0; } -} // namespace ck - int main(int, char*[]) { - std::vector host_a(16 * 16); - std::vector host_b(16 * 16); - std::vector host_c(16 * 16); - std::vector wmma_c(16 * 16); - std::vector wmma_c_swizzle_a(16 * 16); - uint64_t num_element = 256; - - // generate matrix a - for(int i_m = 0; i_m < 16; i_m++) - { - for(int i_k = 0; i_k < 16; i_k++) - { - host_a[i_m * 16 + i_k] = float(i_m + 1) / 99.0 + (float(i_k + 1) / 100); - // host_a[i_m * 16 + i_k] = float(i_k); - } - } - - // generate matrix b - for(int i_n = 0; i_n < 16; i_n++) - { - for(int i_k = 0; i_k < 16; i_k++) - { - host_b[i_n * 16 + i_k] = float(i_n + 1) / 98.0 + (float(i_k + 1) / 100); - // host_b[i_n * 16 + i_k] = 1.0; - } - } - - // run mk_nk_mn gemm on cpu - for(int i_m = 0; i_m < 16; i_m++) - { - for(int i_n = 0; i_n < 16; i_n++) - { - for(int i_k = 0; i_k < 16; i_k++) - { - host_c[i_m * 16 + i_n] += host_a[i_m * 16 + i_k] * host_b[i_n * 16 + i_k]; - } - } - } - - DeviceMem device_a(sizeof(ck::half_t) * num_element); - DeviceMem device_b(sizeof(ck::half_t) * num_element); - DeviceMem device_c(sizeof(float) * num_element); - - std::vector fp16_a(16 * 16); - std::vector fp16_b(16 * 16); - // convert fp32 a and b into fp16 on host - for(int i = 0; i < 16 * 16; i++) - { - fp16_a[i] = __float2half_rn(host_a[i]); - fp16_b[i] = __float2half_rn(host_b[i]); - } - - device_a.ToDevice(fp16_a.data()); - device_b.ToDevice(fp16_b.data()); - - // run single wave wmma on GPU - ck::matmul<<<1, 32>>>(static_cast(device_a.GetDeviceBuffer()), - static_cast(device_b.GetDeviceBuffer()), - static_cast(device_c.GetDeviceBuffer())); - - device_c.FromDevice(wmma_c.data()); - - // run single wave wmma_swizzle_a on GPU - ck::matmul_swizzle_a<<<1, 32>>>(static_cast(device_a.GetDeviceBuffer()), - static_cast(device_b.GetDeviceBuffer()), - static_cast(device_c.GetDeviceBuffer())); - device_c.FromDevice(wmma_c_swizzle_a.data()); - - // result check - bool res = true; - bool res_swizzle_a = true; - res = 
ck::utils::check_err(wmma_c, host_c, "Error: Incorrect results!", 1e-2); - res_swizzle_a = - ck::utils::check_err(wmma_c_swizzle_a, host_c, "Error: Incorrect results!", 1e-2); - - if(res && res_swizzle_a) - { - std::cout << "test single wave wmma: Pass" << std::endl; - return 0; - } - else - { - std::cout << "test single wave wmma: Fail" << std::endl; - return -1; - } + bool pass = true; + // clang-format off + // |SrcType |DstType |GPUAccType |CPUAccType |AccNum + pass &= run_test(); + pass &= run_test(); + pass &= run_test(); + pass &= run_test(); + // clang-format on + + std::cout << "TestGemm ..... " << (pass ? "SUCCESS" : "FAILURE") << std::endl; + return pass ? 0 : 1; } diff --git a/test/wmma_op/wmma_op_util.hpp b/test/wmma_op/wmma_op_util.hpp new file mode 100644 index 00000000000..4740e020020 --- /dev/null +++ b/test/wmma_op/wmma_op_util.hpp @@ -0,0 +1,357 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/utility/amd_wmma.hpp" + +namespace ck { +namespace wmma_op_util { + +template +__device__ void builtin_wmma_naive_selector(const src_vec&, const src_vec&, acc_vec&) +{ +} + +template <> +__device__ void +builtin_wmma_naive_selector>( + const half16_t& reg_a, + const half16_t& reg_b, + StaticBufferTupleOfVector& reg_c) +{ + intrin_wmma_f32_16x16x16_f16_w32<16, 16>::Run( + reg_a, reg_b, reg_c.GetVectorTypeReference(Number<0>{})); +} + +template <> +__device__ void +builtin_wmma_naive_selector>( + const half16_t& reg_a, + const half16_t& reg_b, + StaticBufferTupleOfVector& reg_c) +{ + 
intrin_wmma_f16_16x16x16_f16_w32<16, 16, 0>::Run( + reg_a, reg_b, reg_c.GetVectorTypeReference(Number<0>{})); +} + +template <> +__device__ void builtin_wmma_naive_selector< + bhalf16_t, + StaticBufferTupleOfVector>( + const bhalf16_t& reg_a, + const bhalf16_t& reg_b, + StaticBufferTupleOfVector& reg_c) +{ + intrin_wmma_bf16_16x16x16_bf16_w32<16, 16, 0>::Run( + reg_a, reg_b, reg_c.GetVectorTypeReference(Number<0>{})); +} + +template <> +__device__ void +builtin_wmma_naive_selector>( + const int8x16_t& reg_a, + const int8x16_t& reg_b, + StaticBufferTupleOfVector& reg_c) +{ + intrin_wmma_i32_16x16x16_iu8_w32<16, 16, true, true, false>::Run( + reg_a, reg_b, reg_c.GetVectorTypeReference(Number<0>{})); +} + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +template <> +__device__ void +builtin_wmma_naive_selector>( + const int4x16_t& reg_a, + const int4x16_t& reg_b, + StaticBufferTupleOfVector& reg_c) +{ + intrin_wmma_i32_16x16x16_iu4_w32<16, 16, true, true, false>::Run( + reg_a, reg_b, reg_c.GetVectorTypeReference(Number<0>{})); +} +#endif + +template +__global__ void matmul(const src_t* a, const src_t* b, dst_t* c) +{ + const int lIdx = threadIdx.x; + // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and + // b a_frag will store one column of the 16x16 matrix tile b_frag will store one row of the + // 16x16 matrix tile + using src_vec = typename vector_type::type; + src_vec a_frag = {}; + src_vec b_frag = {}; + // initialize c fragment to 0 + using acc_vec = StaticBufferTupleOfVector; + acc_vec c_thread_buf_; + + // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in gfx11 + // see https://atlvsp3.amd.com/sp3_gfx11_5_instructions.pdf page 482 + // TODO: remove this dependency in gfx12 https://ontrack-internal.amd.com/browse/DEGFXSP3-101 + const int lane = lIdx % 16; + + for(int ele = 0; ele < 16; ++ele) + { + b_frag[ele] = b[16 * lane + ele]; + } + // follow origin design + for(int ele = 0; ele < 16; ++ele) + { + 
a_frag[ele] = a[16 * lane + ele]; + } + + // sync threads, similar to mma_sync + __syncthreads(); + builtin_wmma_naive_selector(a_frag, b_frag, c_thread_buf_); + __syncthreads(); + // wait for results, similar to mma_sync + static_for<0, 8, 1>{}([&](auto ele) { + const int r = ele * 2 + (lIdx / 16); + // store results from unpacked c_thread_buf_ output + c[16 * r + lane] = ck::type_convert(c_thread_buf_[Number{}]); + }); +} + +template +__global__ void matmul_swizzle_a(const src_t* a, const src_t* b, dst_t* c) +{ + const int lIdx = threadIdx.x; + + using src_vec = typename vector_type::type; + src_vec a_frag = {}; + src_vec b_frag = {}; + using acc_vec = StaticBufferTupleOfVector; + acc_vec c_thread_buf_; + + const int lane = lIdx % 16; + + for(int ele = 0; ele < 16; ++ele) + { + b_frag[ele] = b[16 * lane + ele]; + } + + const int offset_m = (((lane & 1) << 3) | (lane >> 1)); + for(int ele = 0; ele < 16; ++ele) + { + a_frag[ele] = a[16 * offset_m + ele]; + } + + __syncthreads(); + builtin_wmma_naive_selector(a_frag, b_frag, c_thread_buf_); + __syncthreads(); + + static_for<0, 8, 1>{}([&](auto ele) { + const int blk = lIdx / 16; + const int r = ele; + c[16 * 8 * blk + 16 * r + lane] = + ck::type_convert(c_thread_buf_[Number{}]); + }); +} + +struct GemmParams +{ + GemmParams() : M(16), N(16), K(16), StrideA(16), StrideB(16), StrideC(16), alpha(1), beta(0) {} + + ck::index_t M; + ck::index_t N; + ck::index_t K; + + ck::index_t StrideA; + ck::index_t StrideB; + ck::index_t StrideC; + + float alpha; + float beta; +}; + +template +void RunHostGEMM(const Tensor& A, + const Tensor& B, + Tensor& C, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) +{ + auto ref_gemm = GemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); +} + +template +bool RunDeviceGEMM(KernelType kernel, + 
const Tensor& A, + const Tensor& B, + Tensor& C) +{ + DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize()); + DeviceMem b_n_k_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize()); + + a_m_k_device_buf.ToDevice(A.mData.data()); + b_n_k_device_buf.ToDevice(B.mData.data()); + kernel<<<1, 32>>>(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_n_k_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer())); + c_m_n_device_buf.FromDevice(C.mData.data()); + + return true; +} + +template +struct TestWmma +{ + auto PrepareGemmTensor(const ck::wmma_op_util::GemmParams& params) + { + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1, stride})); + } + }; + + Tensor a_m_k( + f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{})); + Tensor b_n_k( + f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{})); + Tensor c_m_n_host_result( + f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); + Tensor c_m_n_device_result( + f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{})); + + auto f_generate_tensor_value = [](auto& tensor, auto type) { + using dataType = decltype(type); + + tensor.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + }; + + f_generate_tensor_value(a_m_k, ADataType{}); + f_generate_tensor_value(b_n_k, BDataType{}); + + return std::make_tuple(a_m_k, b_n_k, c_m_n_host_result, c_m_n_device_result); + } + + auto operator()(const DeviceWmma& wmma_kernel) + { + std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name + << ", CLayout = " << CLayout{}.name << std::endl; + + // 
Arrange + ck::wmma_op_util::GemmParams params; + params.M = 16; + params.N = 16; + params.K = 16; + params.StrideA = 16; + params.StrideB = 16; + params.StrideC = 16; + + auto host_tensors = PrepareGemmTensor(params); + + const Tensor& a = std::get<0>(host_tensors); + const Tensor& b = std::get<1>(host_tensors); + Tensor& c_host = std::get<2>(host_tensors); + Tensor& c_device = std::get<3>(host_tensors); + + auto a_element_op = AElementwiseOperation{}; + auto b_element_op = BElementwiseOperation{}; + auto c_element_op = CElementwiseOperation{}; + + using ReferenceGemmInstance = + ck::tensor_operation::host::ReferenceGemm; + ck::wmma_op_util::RunHostGEMM( + a, b, c_host, a_element_op, b_element_op, c_element_op); + + // Act + bool is_supported = ck::wmma_op_util::RunDeviceGEMM(wmma_kernel, a, b, c_device); + + if(is_supported) + { + // Assert + bool res = false; + if(std::is_same::value) + { + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; + } + else if(std::is_same::value) + { + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; + } + else if(std::is_same::value) + { + // 0.5 Pixel Error Tolerance is introduced by Accumulator difference. + // BF16 WMMA Accumulator is in BF16 Type while On Host-side Accumulator is Float. + res = ck::utils::check_err( + c_device.mData, c_host.mData, "Error: Incorrect results!", 0, 1.0); + std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; + } + else if(std::is_same::value) + { + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl; + } + else if(std::is_same::value) + { + res = ck::utils::check_err(c_device.mData, c_host.mData); + std::cout << (res ? 
"SUCCESS" : "FAILURE") << std::endl; + } + else + { + std::cout << "UNSUPPORTED CDataType" << std::endl; + } + + return res; + } + else + { + return true; + } + } +}; + +} // namespace wmma_op_util +} // namespace ck From 24faa1fc91095bfd26a8fd51a89dd93c499853bd Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 28 Oct 2022 16:23:22 +0000 Subject: [PATCH 005/118] Add f32_16x16x16_bf16 unit test --- test/wmma_op/wmma_op.cpp | 6 +++++- test/wmma_op/wmma_op_util.hpp | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/wmma_op/wmma_op.cpp b/test/wmma_op/wmma_op.cpp index 34ebf41a3ca..ebf99af4aff 100644 --- a/test/wmma_op/wmma_op.cpp +++ b/test/wmma_op/wmma_op.cpp @@ -55,10 +55,14 @@ int main(int, char*[]) bool pass = true; // clang-format off // |SrcType |DstType |GPUAccType |CPUAccType |AccNum - pass &= run_test(); + pass &= run_test(); + pass &= run_test(); pass &= run_test(); pass &= run_test(); pass &= run_test(); +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + pass &= run_test(); +#endif // clang-format on std::cout << "TestGemm ..... " << (pass ? 
"SUCCESS" : "FAILURE") << std::endl; diff --git a/test/wmma_op/wmma_op_util.hpp b/test/wmma_op/wmma_op_util.hpp index 4740e020020..ef3f831abde 100644 --- a/test/wmma_op/wmma_op_util.hpp +++ b/test/wmma_op/wmma_op_util.hpp @@ -32,6 +32,18 @@ builtin_wmma_naive_selector{})); } +template <> +__device__ void +builtin_wmma_naive_selector>( + const bhalf16_t& reg_a, + const bhalf16_t& reg_b, + StaticBufferTupleOfVector& reg_c) +{ + intrin_wmma_f32_16x16x16_bf16_w32<16, 16>::Run( + reg_a, reg_b, reg_c.GetVectorTypeReference(Number<0>{})); +} + template <> __device__ void builtin_wmma_naive_selector Date: Wed, 16 Nov 2022 04:23:22 +0000 Subject: [PATCH 006/118] tempsave --- example/01_gemm/gemm_wmma_fp16.cpp | 39 + .../gpu/block/blockwise_gemm_wmma.hpp | 0 .../gpu/device/impl/device_gemm_wmma.hpp | 565 ++++++++++++ .../gpu/grid/gridwise_gemm_wmma_v1r1.hpp | 815 ++++++++++++++++++ .../tensor_operation/gpu/warp/wmma_gemm.hpp | 383 ++++++++ 5 files changed, 1802 insertions(+) create mode 100644 example/01_gemm/gemm_wmma_fp16.cpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp create mode 100644 include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp new file mode 100644 index 00000000000..d76ff09a4d9 --- /dev/null +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = float; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MWMMA|NMMMA| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>; +// clang-format on + + +using ReferenceGemmInstance = 
ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp new file mode 100644 index 00000000000..f3515f407b2 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemmWmma : public DeviceGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto M1Number = Number{}; + + static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same::value) + { + 
return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeCGridDescriptor_M0_N_M1(index_t M, index_t N, index_t StrideC) + { + assert(M % M1 == 0); + + const index_t M0 = M / M1; + + const auto c_grid_desc_m_n = 
[&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + static_assert(false, "Padding Gemm Not implemented"); + /* Not implemented yet. + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + */ + } + else + { + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M0, M1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); + using CGridDesc_M0_N_M1 = decltype(MakeCGridDescriptor_M0_N_M1(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_m0nm1_wmma_v1r1< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M0_N_M1, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + 
ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + NumPrefetch, + LoopSched, + PipelineVer>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m0_n_m1_{}, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + a_grid_desc_k0_m_k1_ = DeviceGemmWmma::MakeAGridDescriptor_K0_M_K1(M, K, StrideA); + b_grid_desc_k0_n_k1_ = DeviceGemmWmma::MakeBGridDescriptor_K0_N_K1(K, N, StrideB); + c_grid_desc_m0_n_m1_ = DeviceGemmWmma::MakeCGridDescriptor_M0_N_M1(M, N, StrideC); + + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m0_n_m1_, M01, N01); + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m0_n_m1_, + block_2_ctile_map_)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ = + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n_m1_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; 
+ BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M0_N_M1 c_grid_desc_m0_n_m1_; + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceGemmWmma::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m0_n_m1_{ " << arg.c_grid_desc_m0_n_m1_.GetLength(I0) + << ", " << arg.c_grid_desc_m0_n_m1_.GetLength(I1) << ", " + << arg.c_grid_desc_m0_n_m1_.GetLength(I2) << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n_m1_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_k0mk1_k0nk1_m0nm1_wmma_v1r1 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m0_n_m1_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_wmma_v1r1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + true>; // Last Option is W/O + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_wmma_v1r1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } 
+ + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx1100") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v)) + { + return false; + } + } + else + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n_m1_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + 
str << "DeviceGemmWmma" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << MPerXDL << ", " + << NPerXDL << ", " + << MXdlPerWave << ", " + << NXdlPerWave + << ">" + << " NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp new file mode 100644 index 00000000000..778cc96265a --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#define DISABLE_C_SHUFFLE +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_wmma_v1r1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ 
p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC* __restrict__ p_c0_grid, + const FloatC* __restrict__ p_c1_grid, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, +#ifndef DISABLE_C_SHUFFLE + const C0GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + const C1GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, +#endif + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_c_grid, + p_c0_grid, + p_c1_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, +#ifndef DISABLE_C_SHUFFLE + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, +#endif + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_c0_grid; + ignore = p_c1_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + ignore = c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + ignore = 
c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx1100__)) +} + +template < + index_t BlockSize, + typename FloatAB, + typename FloatAcc, + typename FloatC, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, + typename AGridDesc_K0_M_K1, + typename BGridDesc_K0_N_K1, + typename CGridDesc_M_N, + typename C0GridDesc_M_N, + typename C1GridDesc_M_N, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + index_t MPerBlock, + index_t NPerBlock, + index_t K0PerBlock, + index_t MPerWmma, + index_t NPerWmma, + index_t K1Value, + index_t MWmmaPerWave, + index_t NWmmaPerWave, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + index_t ABlockTransferSrcVectorDim, + index_t ABlockTransferSrcScalarPerVector, + index_t ABlockTransferDstScalarPerVector_K1, + bool AThreadTransferSrcResetCoordinateAfterRun, + bool ABlockLdsExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + index_t BBlockTransferSrcVectorDim, + index_t BBlockTransferSrcScalarPerVector, + index_t BBlockTransferDstScalarPerVector_K1, + bool BThreadTransferSrcResetCoordinateAfterRun, + bool BBlockLdsExtraN, + index_t CShuffleMWmmaPerWavePerShuffle, + index_t CShuffleNWmmaPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma, + index_t CBlockTransferScalarPerVector_NWaveNPerWmma, + index_t NumGemmKPrefetchStage = 1, + PipelineVersion PipelineVer = PipelineVersion::v1> +struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto 
I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K10_K1PerInst() + { + constexpr auto inst_max_size = 16 / sizeof(FloatAB); + constexpr auto k1perinst = (K1 {}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + // May have static err + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K10, k1perinst), k1perinst); + } + }(); + + return a_block_desc_k0_m_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K10_K1PerInst() + { + constexpr auto inst_max_size = 16 / sizeof(FloatAB); + constexpr auto k1perinst = (K1 {}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K10, k1perinst), k1perinst); + } + }(); + + return b_block_desc_k0_n_k1; + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma() + { + constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); + constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); + + constexpr auto + c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + return c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + 
constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = + math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = + GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma(); + + constexpr auto c_block_size = + c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatC)); + } + + /* CheckValidity returns false unless all of the following hold: the A, B and C grid descriptors agree on M, N, K0 and K1; M, N and K0 are exact multiples of MPerBlock, NPerBlock and K0PerBlock; GridwiseGemmPipe::IsSupported accepts K0 / K0PerBlock; and block_2_ctile_map.CheckValidity accepts the C descriptor. */ // block_id to matrix tile idx (m0, n0) mapping is controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong!
K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerWmma * MWmmaPerWave) == 0) && + (NPerBlock % (NWmmaPerWave * NPerWmma)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + /* Takes the full K extent: the loop count is K / (K0PerBlock * K1) using integer division, and the result of GridwiseGemmPipe::CalculateHasMainLoop on that count is returned. */ __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / (K0PerBlock * K1); + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + /* Re-expresses the 2-D (M, N) C descriptor as a 6-D descriptor: dim 0 is unmerged into output dims 0-2 and dim 1 into output dims 3-5. MBlock = M / MPerBlock and NBlock = N / NPerBlock use integer division. NOTE(review): the template argument lists (template parameters, Number arguments) are missing from this copy of the text, so the inner unmerge lengths cannot be read here. 
*/ template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma( + const CGridDesc_M_N_& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); + constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); + + const auto c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple( + MBlock, Number{}, Number{})), + make_unmerge_transform(make_tuple( + NBlock, Number{},
Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + } + + /* The M01 and N01 arguments are accepted but unused (their names are commented out); only the C descriptor is forwarded. */ // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + using CGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma = + remove_cvref_t; +#ifndef DISABLE_C_SHUFFLE + using C0GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma = + remove_cvref_t; + + using C1GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma = + remove_cvref_t; +#endif + using DefaultBlock2CTileMap = + remove_cvref_t; + + /* Run: per-workgroup GEMM body. It wraps the raw grid pointers in dynamic buffers, maps the block id to a C tile (returning early when ValidCTileIndex rejects it), hands the A/B copies and the blockwise GEMM to GridwiseGemmPipe with K0 / K0PerBlock as the loop count, then, as the in-code comments describe, copies the accumulators VGPR to LDS (p_shared) and LDS to global. NOTE(review): the c0/c1 descriptor parameters and buffers exist only when DISABLE_C_SHUFFLE is not defined, yet the write-out section further down uses them without a matching guard. 
*/ template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC* __restrict__ p_c0_grid, + const FloatC* __restrict__ p_c1_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma& + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, +#ifndef DISABLE_C_SHUFFLE + const C0GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma& + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + const C1GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma& + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, +#endif + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const
Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetElementSpaceSize()); +#ifndef DISABLE_C_SHUFFLE + auto c0_grid_buf = make_dynamic_buffer( + p_c0_grid, + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetElementSpaceSize()); + auto c1_grid_buf = make_dynamic_buffer( + p_c1_grid, + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetElementSpaceSize()); +#endif + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple( + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetLength(I0), + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetLength(I3)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k10_k11 = GetABlockDescriptor_K0PerBlock_MPerBlock_K10_K1PerInst(); + + /* NOTE(review): the next declaration reuses the name a_block_desc_k0_m_k10_k11 that was declared just above, and the code that follows refers to a_block_desc_k0_m_k1 and b_block_desc_k0_n_k1, which are not declared in the visible 
part of Run -- confirm the intended names. */ // B matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k10_k11 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K10_K1PerInst(); + + // A matrix blockwise copy + auto a_blockwise_copy = +
ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_k0_m_k1), + decltype(a_block_desc_k0_m_k1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_grid_desc_k0_m_k1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_k0_m_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc_k0_n_k1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_k0_n_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + auto blockwise_gemm = + BlockwiseGemmWmmaops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( +
static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_k0_n_k1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + + // gridwise GEMM pipeline + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + + GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + a_block_desc_k0_m_k1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_k0_n_k1, + b_block_desc_k0_n_k1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + K0BlockMainLoop); + + // shuffle C and write out + { + static_assert(MWmmaPerWave % CShuffleMWmmaPerWavePerShuffle == 0 && + NWmmaPerWave % CShuffleNWmmaPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); + constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it!
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = + GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma(); + + auto c_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma + .GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + make_tuple(make_freeze_transform(I0), // freeze mblock + make_pass_through_transform( + Number{}), // M0 (MWmmaPerWave) per + // shuffle + make_unmerge_transform( + make_tuple(M1, M2, M3, M4)), // M1 = MWave, M2 * M3 * M4 = MPerWmma + make_freeze_transform(I0), // freeze nblock + make_pass_through_transform( + Number{}), // N0 (NWmmaPerWave) per + // shuffle + make_unmerge_transform( + make_tuple(N1, N2))), // splits the N dimension into (N1, N2); presumably N1 = NWave and N2 = NPerWmma -- confirm + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + 
make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<2, 4, 5,
6>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<3, 7>{}) + + ); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r3< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMWmmaPerWavePerShuffle, + MWave * MPerWmma, + 1, +
CShuffleNWmmaPerWavePerShuffle, + NWave * NPerWmma>, // BlockSliceLengths, + CBlockTransferClusterLengths_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma, + Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder, + FloatC, // typename Src0Data, + FloatC, // typename Src1Data, + FloatC, // typename Src2Data, + FloatC, // typename DstData, + decltype( + c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), + decltype( + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), + decltype( + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), + decltype( + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), + Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder, + 5, // index_t VectorDim, + CBlockTransferScalarPerVector_NWaveNPerWmma, // index_t ScalarPerVector, + true, // bool ThreadTransferSrc0ResetCoordinateAfterRun, + false, // bool ThreadTransferSrc1ResetCoordinateAfterRun, + false, // bool ThreadTransferSrc2ResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + make_multi_index(0, 0, 0, 0, 0, 0), + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_element_op}; + + constexpr auto mwmmaperwave_forward_step = + make_multi_index(0, CShuffleMWmmaPerWavePerShuffle, 0, 0, 0, 0); + constexpr auto nwmmaperwave_forward_step = + make_multi_index(0, 0, 0, 0, CShuffleNWmmaPerWavePerShuffle, 0); +
constexpr auto nwmmaperwave_backward_step = + make_multi_index(0, 0, 0, 0, -CShuffleNWmmaPerWavePerShuffle, 0); + + static_for<0, MWmmaPerWave, CShuffleMWmmaPerWavePerShuffle>{}([&](auto mwmmaperwave_iter) { + constexpr auto mwmmaperwave = mwmmaperwave_iter; + + static_for<0, + NWmmaPerWave, + CShuffleNWmmaPerWavePerShuffle>{}([&](auto nwmmaperwave_iter) { + constexpr bool nwmmaperwave_forward_sweep = + (mwmmaperwave % (2 * CShuffleMWmmaPerWavePerShuffle) == 0); + + constexpr index_t nwmmaperwave_value = + nwmmaperwave_forward_sweep + ? nwmmaperwave_iter + : (NWmmaPerWave - nwmmaperwave_iter - CShuffleNWmmaPerWavePerShuffle); + + constexpr auto nwmmaperwave = Number{}; + + // make sure it's safe to do ds_write + block_sync_lds(); + + // VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_tuple(mwmmaperwave, nwmmaperwave, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_block_buf); + + // make sure it's safe to do ds_read + block_sync_lds(); + + // LDS to global + c_block_copy_lds_to_global.Run( + c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + c_block_buf, + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + c0_grid_buf, + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + c1_grid_buf, + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + c_grid_buf); + + // move on nwmmaperwave dimension + if constexpr(nwmmaperwave_forward_sweep && + (nwmmaperwave < NWmmaPerWave - CShuffleNWmmaPerWavePerShuffle)) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + nwmmaperwave_forward_step); + + c_block_copy_lds_to_global.MoveSrc2SliceWindow( + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + nwmmaperwave_forward_step); + +
c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + nwmmaperwave_forward_step); + } + else if constexpr((!nwmmaperwave_forward_sweep) && (nwmmaperwave > 0)) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + nwmmaperwave_backward_step); + + c_block_copy_lds_to_global.MoveSrc2SliceWindow( + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + nwmmaperwave_backward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + nwmmaperwave_backward_step); + } + }); + + // move on mwmmaperwave dimension + if constexpr(mwmmaperwave < MWmmaPerWave - CShuffleMWmmaPerWavePerShuffle) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + mwmmaperwave_forward_step); + + c_block_copy_lds_to_global.MoveSrc2SliceWindow( + c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + mwmmaperwave_forward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, + mwmmaperwave_forward_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp new file mode 100644 index 00000000000..3964510e6cf --- /dev/null +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -0,0 +1,383 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/math.hpp" +#include "ck/utility/amd_wmma.hpp" + +namespace ck { + +/* NOTE(review): every enumerator below is assigned 0, so all six names compare equal and anything keyed on a WmmaInstr value cannot tell them apart -- confirm that distinct values were intended. */ enum struct WmmaInstr +{ + wmma_f32_16x16x16_f16_w32 = 0, + wmma_f32_16x16x16_bf16_w32 = 0, + wmma_f16_16x16x16_f16_w32 = 0, + wmma_bf16_16x16x16_bf16_w32 = 0, + wmma_i32_16x16x16_iu8_w32 = 0, + wmma_i32_16x16x16_iu4_w32 = 0 +}; + +/* wmma_type: per-instruction traits. The one specialization visible here declares 16x16x16 tile sizes, wave_size 32, lane_size 16, element sizes and register counts, and its run() forwards to intrin_wmma_f32_16x16x16_f16_w32::Run. NOTE(review): the template argument lists are missing from this copy of the text, so which instruction this specialization is keyed on cannot be read here. */ template +struct wmma_type; + +template <> +struct wmma_type +{ + static constexpr index_t m_per_wave = 16; + static constexpr index_t n_per_wave = 16; + static constexpr index_t k_per_wave = 16; + static constexpr index_t wave_size = 32; + static constexpr index_t lane_size = 16; + static constexpr index_t src_data_size = 2; + static constexpr index_t acc_data_size = 4; + static constexpr index_t num_srcregs_per_wave = 8; + static constexpr index_t num_accregs_per_wave = 8; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); + } +}; + +/* WmmaSelector: each GetWmma() specialization returns one WmmaInstr enumerator; selected_wmma is a wmma_type object built from GetWmma(), and the constructor static_asserts its tile shape and register counts. NOTE(review): the k_per_wave == 16 assertion reuses the message text of the m_per_wave == n_per_wave assertion. 
*/ template +struct WmmaSelector +{ + template + static constexpr auto GetWmma(); + + template <> + static constexpr auto GetWmma() + { + return WmmaInstr::wmma_f32_16x16x16_f16_w32; + } + + template <> + static constexpr auto GetWmma() + { + return WmmaInstr::wmma_f32_16x16x16_bf16_w32; + } + + template <> + static constexpr auto GetWmma() + { + return WmmaInstr::wmma_f16_16x16x16_f16_w32; + } + + template <> + static constexpr auto GetWmma() + { + return WmmaInstr::wmma_bf16_16x16x16_bf16_w32; + } + + template <> + static constexpr auto GetWmma() + { + return WmmaInstr::wmma_i32_16x16x16_iu8_w32; + } +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + template <> + static constexpr auto GetWmma() + { + return WmmaInstr::wmma_i32_16x16x16_iu4_w32; + } +#endif + + static constexpr auto selected_wmma = wmma_type()>{}; + + __host__ __device__ constexpr WmmaSelector() + { + static_assert(selected_wmma.m_per_wave == selected_wmma.n_per_wave, + "WRONG!
WMMA_M must equal to WMMA_N"); + + static_assert(selected_wmma.m_per_wave == selected_wmma.k_per_wave, + "WRONG! WMMA_M must equal to WMMA_K"); + + static_assert(selected_wmma.k_per_wave == 16, + "WRONG! WMMA_M must equal to WMMA_N"); + + static_assert(selected_wmma.wave_size * selected_wmma.num_accregs_per_wave * selected_wmma.acc_data_size== + selected_wmma.m_per_wave * selected_wmma.n_per_wave * 4, + "WRONG! Number of Accumulator Register"); + + static_assert(selected_wmma.lane_size * selected_wmma.num_srcregs_per_wave * selected_wmma.src_data_size== + selected_wmma.m_per_wave * selected_wmma.k_per_wave * 4, + "WRONG! Number of Source Register"); + } +}; + +/* WmmaGemm: warp-level GEMM wrapper around the selected instruction traits object (wmma_instr). NOTE(review): this struct reads wmma_instr members (num_output_blks, m_per_blk, n_per_blk, num_groups_per_blk, num_input_blks, group_size, num_threads_per_blk, is_k_reduction) that the wmma_type specialization visible in this file does not declare, and near its end it obtains wmma_instr from MfmaSelector / selected_mfma rather than from WmmaSelector / selected_wmma -- confirm against the rest of the project. 
*/ template +struct WmmaGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + using CIndex = MultiIndex<2>; + using CIndex4D = MultiIndex<4>; + + __device__ static constexpr index_t GetNumBlks() { return wmma_instr.num_output_blks; } + + __device__ static constexpr index_t GetNumXdlops() + { + return MPerWmma * NPerWmma / + (wmma_instr.m_per_blk * wmma_instr.n_per_blk * wmma_instr.num_output_blks); + } + + __host__ __device__ constexpr WmmaGemm() + { + static_assert(NPerWmma == 16 && MPerWmma == 16 , + "Only support GemmNPerWmma == 16 and GemmMPerWmma == 16 for wmma"); + + static_assert(KPack % wmma_instr.k_per_wave == 0, "KPack cannot be divided by k_per_wave"); + } + + // XDL output supporting C = A * B + // M2_N2 -> M2_M3_M4_N2 + template + __host__ __device__ static constexpr auto + MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2) + { + const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0); + const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1); + const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2); + const auto N1 =
c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3); + + return transform_tensor_descriptor( + c_desc_m0_n0_m1_n1_m2_n2, + make_tuple(make_pass_through_transform(M0), + make_pass_through_transform(N0), + make_pass_through_transform(M1), + make_pass_through_transform(N1), + make_unmerge_transform(make_tuple(Number{}, + Number{}, + Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4, 5, 6>{}, + Sequence<7>{})); + } + + // transposed XDL output supporting C' = B' * A' + // M2_N2 -> M2_N2_N3_N4 + template + __host__ __device__ static constexpr auto + MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2) + { + const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0); + const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1); + const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2); + const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3); + + return transform_tensor_descriptor( + c_desc_m0_n0_m1_n1_m2_n2, + make_tuple(make_pass_through_transform(M0), + make_pass_through_transform(N0), + make_pass_through_transform(M1), + make_pass_through_transform(N1), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, + Number{}, + Number{}))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6, 7>{})); + } + + template + __host__ __device__ static constexpr auto MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + const CDesc_G_M0_N0_M1_N1_M2_N2& c_desc_g_m0_n0_m1_n1_m2_n2) + { + const auto G = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I0); + const auto M0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I1); + const auto N0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I2);
+ const auto M1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I3); + const auto N1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I4); + + return transform_tensor_descriptor( + c_desc_g_m0_n0_m1_n1_m2_n2, + make_tuple(make_pass_through_transform(G), + make_pass_through_transform(M0), + make_pass_through_transform(N0), + make_pass_through_transform(M1), + make_pass_through_transform(N1), + make_unmerge_transform(make_tuple(wmma_instr.num_groups_per_blk, + wmma_instr.num_input_blks, + wmma_instr.group_size)), + make_pass_through_transform(wmma_instr.num_threads_per_blk)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6, 7>{}, + Sequence<8>{})); + } + + __device__ static constexpr index_t GetRegSizePerXdlops() + { + return MPerWmma * NPerWmma / wmma_instr.wave_size; + } + + __device__ static constexpr index_t GetWaveSize() { return wmma_instr.wave_size; } + + /* Run: loops k over KPack / wmma_instr.k_per_wave and calls wmma_instr.run once per k; when TransposeC is set the A and B operands are passed in swapped order. NOTE(review): the static_assert message below is a string literal that continues across a source line break. 
*/ template + __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const + { + static_assert((is_same::value && is_same::value) || + (is_same::value && is_same::value) || + (is_same::value && is_same::value) || + (is_same::value && is_same::value) || + (is_same::value && is_same::value) +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + || (is_same::value && is_same::value) +#endif + , + "base type couple must be (half, float), (bhalf, float), (half, half), + (bhalf, bhalf), (int8, int32) or (int4, int32)!"); + + static_for<0, KPack / wmma_instr.k_per_wave, 1>{}([&](auto k) { + if constexpr(!TransposeC) + { + wmma_instr.template run( + p_a_wave[k], p_b_wave[k], p_c_thread); + } + else + { + wmma_instr.template run( + p_b_wave[k], p_a_wave[k], p_c_thread); + } + }); + } + + __device__ static auto GetLaneId() { return get_thread_local_1d_id() % wmma_instr.wave_size; } + + __device__ static auto GetBlkIdx() +
{ + const auto laneId = GetLaneId(); + + constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform( + make_tuple(1, wmma_instr.num_input_blks, wmma_instr.num_threads_per_blk))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto blk_idx = + threadidx_to_blk_idx_adaptor.CalculateBottomIndex(make_multi_index(laneId)); + + const auto blk_id = blk_idx[I1]; + const auto blk_td = blk_idx[I2]; + + return make_tuple(blk_id, blk_td); + } + + /* NOTE(review): the two Calculate*ThreadOriginDataIndex functions below are marked __host__ __device__ but call GetLaneId and GetBlkIdx, which are declared __device__ only. */ __host__ __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto laneId = GetLaneId(); + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + if constexpr(wmma_instr.is_k_reduction) + { + return make_tuple(blk_id, blk_td); + } + else + { + return make_tuple(0, laneId); + } + } + + __host__ __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto laneId = GetLaneId(); + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + if constexpr(wmma_instr.is_k_reduction) + { + return make_tuple(blk_id, blk_td); + } + else + { + return make_tuple(0, laneId); + } + } + + __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i) + { + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + index_t n_offset = blk_i * wmma_instr.n_per_blk + blk_td; + index_t m_offset = xdlops_i * wmma_instr.m_per_blk + blk_id * wmma_instr.group_size; + + return TransposeC ? 
CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; + } + + __device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */) + { + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + return TransposeC ?
CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td}; + } + + static constexpr auto mfma = MfmaSelector{}; + + static constexpr auto wmma_instr = mfma.selected_mfma; + + static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); + static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops(); + static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; + + __host__ __device__ static constexpr auto GetCM0M1M2NThreadBlkLengths() + { + return make_tuple( + Number{}, I1, Number{}, I1); + } +}; + +} // namespace ck From d16063db1d5feb3e35087a31eda8bfe55ed799c5 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 22 Nov 2022 16:02:27 +0000 Subject: [PATCH 007/118] tempsave --- example/01_gemm/CMakeLists.txt | 5 + .../gpu/block/blockwise_gemm_wmma.hpp | 433 ++++++++++++++++++ .../gpu/device/impl/device_gemm_wmma.hpp | 24 +- .../gpu/grid/gridwise_gemm_wmma_v1r1.hpp | 134 +++--- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 126 ++--- 5 files changed, 551 insertions(+), 171 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index c403e51ed99..9b9e100edf7 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -35,3 +35,8 @@ add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp) add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16) add_dependencies(example_gemm_xdl example_gemm_xdl_fp64) + +add_custom_target(example_gemm_wmma) +add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp) +add_dependencies(example_gemm_wmma example_gemm_wmma_fp16) + diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index e69de29bb2d..891d60f9667 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices,
Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +enum struct LoopScheduler +{ + Default, +}; + +constexpr LoopScheduler make_default_loop_scheduler() +{ + return LoopScheduler::Default; +} + +template +struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + using ThisThreadBlock = ThisThreadBlock; + + static constexpr index_t WaveSize = get_warp_size(); + + static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); + static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); + static constexpr index_t KPerBlock = BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); + static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); + static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); + static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr auto wmma_gemm = WMMAGemm{}; + + static constexpr index_t KPerThread = KPerBlock / wmma_gemm.K0PerWMMA; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + 
make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); + + return make_tuple(0, waveId_m, WMMA_a_idx[I1], KPerThread * WMMA_a_idx[I0]); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); + + return make_tuple(0, waveId_n, WMMA_b_idx[I1], KPerThread * WMMA_b_idx[I0]); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(WMMA_i, blk_i); + + constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex8D(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + 
const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk4D(WMMA_i, blk_i); + + return make_tuple(Number{}, + Number{}, + waveId_m, + waveId_n, + blk_idx[I0], + blk_idx[I1], + blk_idx[I2], + blk_idx[I3]); + } + + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1() + { + static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && + BK0NK1BlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, + "wrong!"); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = wmma_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = wmma_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); 
+ + return wmma_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_block_desc_g_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); + + return wmma_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() + { + return transform_tensor_descriptor( + AK0MK1BlockDesc{}, + make_tuple( + make_pass_through_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<0, 4>{}, Sequence<1, 2, 3>{})); + } + + __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() + { + return transform_tensor_descriptor( + BK0NK1BlockDesc{}, + make_tuple( + make_pass_through_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<0, 4>{}, Sequence<1, 2, 3>{})); + } + + 
static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); + static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + constexpr auto RepeatDiff = MRepeat - NRepeat; + constexpr auto WmmaK = wmma_gemm.k_per_wmma; + + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto iWmmaK){ + // Cut to Repeat Retangle to Square, assume MRepeat > NRepeat + static_for<0, RepeatDiff, 1>{}([&](auto iCut){ + static_for<0, NRepeat, 1>{}([&](auto iN){ + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); + wmma_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, iCut, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + }); + // Run FIFO fashion loopover in Square + static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + static_for{}([&](auto iN){ + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type = typename vector_type::type; + + constexpr index_t c_offset = + 
c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); + wmma_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, WmmaInnerloop+RepeatDiff, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + static_for{}([&](auto iM){ + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); + wmma_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, WmmaInnerloop, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + }); + }); + } + + protected: + // A[M0, M1, M2, K0 = WmmaK] + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // B[N0, N1, N2, K0 = WmmaK] + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // C[M, N, NumRegWMMA] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWMMA())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy 
b_thread_copy_{CalculateBThreadOriginDataIndex()}; +}; + +template +constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2{}; + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index f3515f407b2..4f81b30cbbf 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -36,10 +36,10 @@ template " << " NumPrefetch: " << NumPrefetch << ", " diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp index 778cc96265a..02fa7d2fa5e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp @@ -141,7 +141,7 @@ template < index_t CBlockTransferScalarPerVector_NWaveNPerWmma, index_t NumGemmKPrefetchStage = 1, PipelineVersion PipelineVer = PipelineVersion::v1> -struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 +struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -160,52 +160,35 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 using GridwiseGemmPipe = remove_cvref_t())>; - __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K10_K1PerInst() + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_K10_MPerBlock_K1PerInst() { constexpr auto inst_max_size = 16 / sizeof(FloatAB); constexpr auto k1perinst = (K1 {}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - // May have static err - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K10, k1perinst), k1perinst); - } + constexpr auto 
a_block_desc_k0_k10_m_k1perinst = [&]() { + // May have static err + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, K10, Number{}, k1perinst), k1perinst); }(); - return a_block_desc_k0_m_k1; + return a_block_desc_k0_k10_m_k1perinst; } - __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K10_K1PerInst() + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_K10_NPerBlock_K1PerInst() { constexpr auto inst_max_size = 16 / sizeof(FloatAB); constexpr auto k1perinst = (K1 {}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K10, k1perinst), k1perinst); - } + constexpr auto b_block_desc_k0_k10_n_k1perinst = [&]() { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, K10, Number{}, k1perinst), k1perinst); }(); - return b_block_desc_k0_n_k1; + return b_block_desc_k0_k10_n_k1perinst; } __host__ __device__ static constexpr auto @@ -230,18 +213,20 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + constexpr auto a_block_desc_k0_k10_m_k1perinst = GetABlockDescriptor_K0PerBlock_K10_MPerBlock_K1PerInst(); - constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + constexpr auto b_block_desc_k0_k10_n_k1perinst = GetBBlockDescriptor_K0PerBlock_K10_NPerBlock_K1PerInst(); - constexpr auto max_lds_align = K1; + constexpr auto max_lds_align = a_block_desc_k0_k10_m_k1perinst.GetLength(I3); constexpr auto a_block_space_size_aligned = - math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + math::integer_least_multiple(a_block_desc_k0_k10_m_k1perinst.GetElementSpaceSize(), max_lds_align); constexpr auto 
b_block_space_size_aligned = - math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align); - + math::integer_least_multiple(b_block_desc_k0_k10_n_k1perinst.GetElementSpaceSize(), max_lds_align); + + constexpr auto c_block_size = 0; +#ifndef DISABLE_C_SHUFFLE // LDS allocation for C shuffle in LDS constexpr auto c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma(); @@ -249,7 +234,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 constexpr auto c_block_size = c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma .GetElementSpaceSize(); - +#endif return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB), c_block_size * sizeof(FloatC)); @@ -423,42 +408,42 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); - // lds max alignment - constexpr auto max_lds_align = K1; - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0_m_k10_k11 = GetABlockDescriptor_K0PerBlock_MPerBlock_K10_K1PerInst(); + constexpr auto a_block_desc_k0_k10_m_k1perinst = GetABlockDescriptor_K0PerBlock_MPerBlock_K10_K1PerInst(); // B matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0_m_k10_k11 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K10_K1PerInst(); + constexpr auto b_block_desc_k0_k10_n_k1perinst = GetBBlockDescriptor_K0PerBlock_NPerBlock_K10_K1PerInst(); + + // lds max alignment + constexpr auto max_lds_align = a_block_desc_k0_m_k10_k11.GetLength(I3); // A matrix blockwise copy auto a_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_grid_desc_k0_m_k1), - decltype(a_block_desc_k0_m_k1), - 
ABlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - 1, - 1, - AThreadTransferSrcResetCoordinateAfterRun, - true>( + ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, +/* typename SrcElementwiseOperation, */ AElementwiseOperation, +/* typename DstElementwiseOperation, */ ck::tensor_operation::element_wise::PassThrough, +/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, +/* typename BlockSliceLengths, */ Sequence, +/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, +/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ FloatAB, +/* typename DstData, */ FloatAB, +/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), +/* typename DstDesc, */ decltype(a_block_desc_k0_k10_m_k1perinst), +/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<1, 0, 2>, +/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, +/* index_t DstScalarPerVector, */ ABlockTransferDstScalarPerVector_K1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( a_grid_desc_k0_m_k1, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, - a_block_desc_k0_m_k1, + a_block_desc_k0_k10_m_k1perinst, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); @@ -474,7 +459,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 FloatAB, FloatAB, decltype(b_grid_desc_k0_n_k1), - decltype(b_block_desc_k0_n_k1), + decltype(b_block_desc_k0_k10_n_k1perinst), BBlockTransferSrcAccessOrder, Sequence<1, 0, 2>, 
BBlockTransferSrcVectorDim, @@ -488,7 +473,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 b_grid_desc_k0_n_k1, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, - b_block_desc_k0_n_k1, + b_block_desc_k0_k10_n_k1perinst, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); @@ -504,8 +489,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 BlockwiseGemmWmmaops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1( - static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + static_cast(p_shared), a_block_desc_k0_k10_m_k1perinst.GetElementSpaceSize()); auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + a_block_space_size_aligned, - b_block_desc_k0_n_k1.GetElementSpaceSize()); + b_block_desc_k0_k10_n_k1perinst.GetElementSpaceSize()); constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); @@ -532,13 +517,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, - a_block_desc_k0_m_k1, + a_block_desc_k0_k10_m_k1perinst, a_blockwise_copy, a_grid_buf, a_block_buf, a_block_slice_copy_step, b_grid_desc_k0_n_k1, - b_block_desc_k0_n_k1, + b_block_desc_k0_k10_n_k1perinst, b_blockwise_copy, b_grid_buf, b_block_buf, @@ -546,7 +531,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 blockwise_gemm, c_thread_buf, K0BlockMainLoop); - +#ifndef DISABLE_C_SHUFFLE // shuffle C and write out { static_assert(MWmmaPerWave % CShuffleMWmmaPerWavePerShuffle == 0 && @@ -809,6 +794,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmmaops_v3r3 } }); } +#endif } }; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 3964510e6cf..31cf4b82b1c 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ 
-25,15 +25,15 @@ struct wmma_type; template <> struct wmma_type { - static constexpr index_t m_per_wave = 16; - static constexpr index_t n_per_wave = 16; - static constexpr index_t k_per_wave = 16; + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; static constexpr index_t wave_size = 32; static constexpr index_t lane_size = 16; static constexpr index_t src_data_size = 2; static constexpr index_t acc_data_size = 4; - static constexpr index_t num_srcregs_per_wave = 8; - static constexpr index_t num_accregs_per_wave = 8; + static constexpr index_t num_srcregs_per_wmma = 8; + static constexpr index_t num_accregs_per_wmma = 8; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const @@ -45,7 +45,7 @@ struct wmma_type template struct WmmaSelector { - template + template static constexpr auto GetWmma(); template <> @@ -89,21 +89,21 @@ struct WmmaSelector __host__ __device__ constexpr WmmaSelector() { - static_assert(selected_wmma.m_per_wave == selected_wmma.n_per_wave, + static_assert(selected_wmma.m_per_wmma == selected_wmma.n_per_wmma, "WRONG! WMMA_M must equal to WMMA_N"); - static_assert(selected_wmma.m_per_wave == selected_wmma.k_per_wave, + static_assert(selected_wmma.m_per_wmma == selected_wmma.k_per_wmma, "WRONG! WMMA_M must equal to WMMA_K"); - static_assert(selected_wmma.k_per_wave == 16, + static_assert(selected_wmma.k_per_wmma == 16, "WRONG! WMMA_M must equal to WMMA_N"); - static_assert(selected_wmma.wave_size * selected_wmma.num_accregs_per_wave * selected_wmma.acc_data_size== - selected_wmma.m_per_wave * selected_wmma.n_per_wave * 4, + static_assert(selected_wmma.wave_size * selected_wmma.num_accregs_per_wmma * selected_wmma.acc_data_size== + selected_wmma.m_per_wmma * selected_wmma.n_per_wmma * 4, "WRONG! 
Number of Accumulator Register"); - static_assert(selected_wmma.lane_size * selected_wmma.num_srcregs_per_wave * selected_wmma.src_data_size== - selected_wmma.m_per_wave * selected_wmma.k_per_wave * 4, + static_assert(selected_wmma.lane_size * selected_wmma.num_srcregs_per_wmma * selected_wmma.src_data_size== + selected_wmma.m_per_wmma * selected_wmma.k_per_wmma * 4, "WRONG! Number of Source Register"); } }; @@ -126,20 +126,12 @@ struct WmmaGemm using CIndex = MultiIndex<2>; using CIndex4D = MultiIndex<4>; - __device__ static constexpr index_t GetNumBlks() { return wmma_instr.num_output_blks; } - - __device__ static constexpr index_t GetNumXdlops() - { - return MPerWmma * NPerWmma / - (wmma_instr.m_per_blk * wmma_instr.n_per_blk * wmma_instr.num_output_blks); - } - __host__ __device__ constexpr WmmaGemm() { static_assert(NPerWmma == 16 && MPerWmma == 16 , "Only support GemmNPerWmma == 16 and GemmMPerWmma == 16 for wmma"); - static_assert(KPack % wmma_instr.k_per_wave == 0, "KPack cannot be divided by k_per_wave"); + static_assert(KPack == wmma_instr.k_per_wmma, "KPack should be k_per_wmma"); } // XDL output supporting C = A * B @@ -267,79 +259,43 @@ struct WmmaGemm #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 || (is_same::value && is_same::value) #endif - , - "base type couple must be (half, float), (bhalf, float), (half, half), - (bhalf, bhalf), (int8, int32) or (int4, int32)!"); - - static_for<0, KPack / wmma_instr.k_per_wave, 1>{}([&](auto k) { - if constexpr(!TransposeC) - { - wmma_instr.template run( - p_a_wave[k], p_b_wave[k], p_c_thread); - } - else - { - wmma_instr.template run( - p_b_wave[k], p_a_wave[k], p_c_thread); - } - }); + ,"base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), (int8, int32) or (int4, int32)!"); + if constexpr(!TransposeC) + { + wmma_instr.template run( + p_a_wave[0], p_b_wave[0], p_c_thread); + } + else + { + wmma_instr.template run( + p_b_wave[0], p_a_wave[0], p_c_thread); + } } __device__ static 
auto GetLaneId() { return get_thread_local_1d_id() % wmma_instr.wave_size; } - __device__ static auto GetBlkIdx() + __device__ static auto GetLaneIdHigh() { - const auto laneId = GetLaneId(); - - constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform( - make_tuple(1, wmma_instr.num_input_blks, wmma_instr.num_threads_per_blk))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto blk_idx = - threadidx_to_blk_idx_adaptor.CalculateBottomIndex(make_multi_index(laneId)); - - const auto blk_id = blk_idx[I1]; - const auto blk_td = blk_idx[I2]; + return GetLaneId() / 16; + } - return make_tuple(blk_id, blk_td); + __device__ static auto GetLaneIdLow() + { + return GetLaneId() % 16; + } + __device__ static auto GetSwizzledLaneIdLow() + { + return ((GetLaneIdLow() & 1) << 3 ) | (GetLaneIdLow() >> 1); } __host__ __device__ static auto CalculateAThreadOriginDataIndex() { - const auto laneId = GetLaneId(); - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - if constexpr(wmma_instr.is_k_reduction) - { - return make_tuple(blk_id, blk_td); - } - else - { - return make_tuple(0, laneId); - } + return make_tuple(0, GetSwizzledLaneIdLow()); } __host__ __device__ static auto CalculateBThreadOriginDataIndex() { - const auto laneId = GetLaneId(); - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - if constexpr(wmma_instr.is_k_reduction) - { - return make_tuple(blk_id, blk_td); - } - else - { - return make_tuple(0, laneId); - } + return make_tuple(0, GetLaneIdLow()); } __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i) @@ -365,12 +321,12 @@ struct WmmaGemm return TransposeC ? 
CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td}; } - static constexpr auto mfma = MfmaSelector{}; + static constexpr auto wmma = WmmaSelector{}; - static constexpr auto wmma_instr = mfma.selected_mfma; + static constexpr auto wmma_instr = wmma.selected_wmma; - static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); - static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops(); + static constexpr auto KPerXdlops = wmma.GetKPerXdlops(); + static constexpr auto K1PerXdlops = wmma.GetK1PerXdlops(); static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; __host__ __device__ static constexpr auto GetCM0M1M2NThreadBlkLengths() From b3cc22a384d337e50244e8352c3850a247e020a3 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 24 Nov 2022 18:05:16 +0000 Subject: [PATCH 008/118] tempsave --- .../gpu/block/blockwise_gemm_wmma.hpp | 141 ++-- .../gpu/device/impl/device_gemm_wmma.hpp | 71 +- .../gpu/grid/gridwise_gemm_wmma_v1r1.hpp | 634 +++++------------- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 300 ++++----- include/ck/utility/amd_wmma.hpp | 15 +- 5 files changed, 425 insertions(+), 736 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 891d60f9667..5b211055cd7 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -30,12 +30,14 @@ template -struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 +// MRepeat_MWave_MLaneHigh_NRepeat_NWave_NLane_MLanelow +struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; + static constexpr auto I3 = Number<4>{}; using ThisThreadBlock = ThisThreadBlock; @@ -85,8 +87,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 const auto waveId_m = wave_idx[I0]; const auto 
WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); - - return make_tuple(0, waveId_m, WMMA_a_idx[I1], KPerThread * WMMA_a_idx[I0]); + // |KRepeat |MRepeat|Mwave |MLane |KPack + return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); } __device__ static auto CalculateBThreadOriginDataIndex() @@ -96,20 +98,20 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 const auto waveId_n = wave_idx[I1]; const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); - - return make_tuple(0, waveId_n, WMMA_b_idx[I1], KPerThread * WMMA_b_idx[I0]); + // |KRepeat |NRepeat|Nwave |NLane |KPack + return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); } - template + template __device__ static auto - CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + CalculateCThreadOriginDataIndex(Number, Number) { const auto wave_idx = GetWaveIdx(); const auto waveId_m = wave_idx[I0]; const auto waveId_n = wave_idx[I1]; - const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(WMMA_i, blk_i); + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), @@ -129,27 +131,6 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 return make_tuple(c_thread_m, c_thread_n); } - template - __device__ static auto - CalculateCThreadOriginDataIndex8D(Number, Number, Number, Number) - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk4D(WMMA_i, blk_i); - - return make_tuple(Number{}, - Number{}, - waveId_m, - waveId_n, - blk_idx[I0], - blk_idx[I1], - blk_idx[I2], - blk_idx[I3]); - } - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1() { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && @@ -162,59 +143,31 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 static_assert(MPerBlock % (MPerWMMA * 
MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } - - __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + // Thread level, register decriptor. + __host__ __device__ static constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() { - constexpr auto c_m0_m1_m2_n_tblk_lens = wmma_gemm.GetCM0M1M2NThreadBlkLengths(); + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; - constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; - constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; - constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; return make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, M0, M1, M2, N)); + // |MRepeat |MWave |MSubGroup |NRepeat |NWave |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, I1, MSubGroup, Number{}, I1, NThreadPerSubGroup, MAccVgprs)); } - __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() { - constexpr auto c_m0_m1_m2_n_tblk_lens = wmma_gemm.GetCM0M1M2NThreadBlkLengths(); - - constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; - constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; - constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; - constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; - - return make_naive_tensor_descriptor_packed( - make_tuple(I1, Number{}, Number{}, I1, I1, M0, M1, M2, N)); - } - - __host__ __device__ static constexpr auto 
GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() - { - constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, Number{}, - Number{}, Number{}, - Number{})); - - return wmma_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2); - } - - __host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() - { - constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 = - make_naive_tensor_descriptor_packed(make_tuple(I1, - Number{}, Number{}, - Number{}, Number{}, - Number{}, Number{})); - return wmma_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( - c_block_desc_g_m0_n0_m1_n1_m2_n2); + return wmma_gemm.MakeCDesc_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); } template @@ -234,32 +187,46 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 return wmma_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); } - __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() + __host__ __device__ static constexpr auto MakeABlockDescriptor_KRepeat_M0_M1_M2_KPack() { - return transform_tensor_descriptor( + static constexpr auto a_block_desc_temp_km0m1m2 = transform_tensor_descriptor( AK0MK1BlockDesc{}, make_tuple( - make_pass_through_transform(make_tuple(Number{}, Number{})), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{}))), + make_merge_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple(Number{}, Number{}, Number{}))), make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{})); + + return transform_tensor_descriptor( + a_block_desc_temp_km0m1m2, + make_tuple( + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(make_tuple(Number{}, Number{}, Number{}))), + 
make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}), make_tuple(Sequence<0, 4>{}, Sequence<1, 2, 3>{})); } - __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() + __host__ __device__ static constexpr auto MakeBBlockDescriptor_KRepeat_N0_N1_N2_KPack() { - return transform_tensor_descriptor( + static constexpr auto b_block_desc_temp_kn0n1n2 = transform_tensor_descriptor( BK0NK1BlockDesc{}, make_tuple( - make_pass_through_transform(make_tuple(Number{}, Number{})), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{}))), + make_merge_transform(make_tuple(Number{}, Number{})), + make_unmerge_transform(make_tuple(Number{}, Number{}, Number{}))), make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{})); + + return transform_tensor_descriptor( + b_block_desc_temp_kn0n1n2, + make_tuple( + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}), make_tuple(Sequence<0, 4>{}, Sequence<1, 2, 3>{})); } - static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); - static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); + static constexpr auto a_block_desc_krepeat_m0_m1_m2_kpack = MakeABlockDescriptor_KRepeat_M0_M1_M2_KPack(); + static constexpr auto b_block_desc_krepeat_n0_n1_n2_kpack = MakeBBlockDescriptor_KRepeat_N0_N1_N2_KPack(); template __device__ void Run(const ABlockBuffer& a_block_buf, @@ -298,7 +265,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); }); - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + a_thread_copy_.Run(a_block_desc_krepeat_m0_m1_m2_kpack, make_tuple(Number{}, iCut, I0, I0, I0), a_block_buf, a_thread_desc_, @@ -328,7 +295,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 b_thread_vec.template AsType(), 
c_thread_buf.GetVectorTypeReference(Number{})); }); - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + a_thread_copy_.Run(a_block_desc_krepeat_m0_m1_m2_kpack, make_tuple(Number{}, WmmaInnerloop+RepeatDiff, I0, I0, I0), a_block_buf, a_thread_desc_, @@ -355,7 +322,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); }); - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + b_thread_copy_.Run(b_block_desc_krepeat_n0_n1_n2_kpack, make_tuple(Number{}, WmmaInnerloop, I0, I0, I0), b_block_buf, b_thread_desc_, @@ -380,7 +347,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, Sequence<0, 1, 2, 3>, @@ -390,7 +357,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2 using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, Sequence<0, 1, 2, 3>, @@ -413,11 +380,11 @@ template -constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2_Selector() +constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_Selector() { if constexpr(LoopSched == LoopScheduler::Default) { - return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1n0n1n2m2{}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; - + // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; - static constexpr auto M1Number = Number{}; static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) { @@ -87,10 +86,12 @@ struct DeviceGemmWmma : public DeviceGemm::value) { return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); } +#endif }(); if constexpr(GemmSpec == GemmSpecialization::MNPadding) @@ -154,12 +155,8 @@ struct DeviceGemmWmma : public DeviceGemm::value) { @@ -173,8 +170,6 @@ struct DeviceGemmWmma : public DeviceGemm{}, Sequence<1>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); - */ } else { return transform_tensor_descriptor( c_grid_desc_m_n, - make_tuple(make_unmerge_transform(make_tuple(M0, M1Number)), 
- make_pass_through_transform(N)), + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + make_tuple(Sequence<0>{}, Sequence<1>{})); } } + // Gridwise descriptor, mapping to whole given provblem. using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); - using CGridDesc_M0_N_M1 = decltype(MakeCGridDescriptor_M0_N_M1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); // GridwiseGemm - using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_m0nm1_wmma_v1r1< + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1< BlockSize, ADataType, // TODO: distinguish A/B datatype AccDataType, @@ -210,7 +204,7 @@ struct DeviceGemmWmma : public DeviceGemm, // CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, +#endif NumPrefetch, - LoopSched, PipelineVer>; // Argument - struct Argument : public BaseArgument + struct Argument : public BaseArgumentW { Argument(const ADataType* p_a_grid, const BDataType* p_b_grid, @@ -267,8 +262,8 @@ struct DeviceGemmWmma : public DeviceGemm, remove_reference_t, - remove_reference_t, + remove_reference_t, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, @@ -375,7 +370,7 @@ struct DeviceGemmWmma : public DeviceGemm, remove_reference_t, - remove_reference_t, + remove_reference_t, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, @@ -406,7 +401,7 @@ struct DeviceGemmWmma : public DeviceGemm struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 @@ -160,84 +134,66 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 using GridwiseGemmPipe = remove_cvref_t())>; - __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_K10_MPerBlock_K1PerInst() + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() { - constexpr 
auto inst_max_size = 16 / sizeof(FloatAB); - constexpr auto k1perinst = (K1 {}, K10, Number{}, k1perinst), k1perinst); + constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } }(); - return a_block_desc_k0_k10_m_k1perinst; + return a_block_desc_k0perblock_mperblock_k1; } - __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_K10_NPerBlock_K1PerInst() + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() { - constexpr auto inst_max_size = 16 / sizeof(FloatAB); - constexpr auto k1perinst = (K1 {}, K10, Number{}, k1perinst), k1perinst); + constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } }(); - return b_block_desc_k0_k10_n_k1perinst; - } - - __host__ __device__ static constexpr auto - GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma() - { - constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); - constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); - - constexpr auto - c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = - make_naive_tensor_descriptor_packed( - make_tuple(I1, - Number{}, - Number{}, - I1, - Number{}, - Number{})); - - return c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + return b_block_desc_k0perblock_nperblock_k1; } __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and 
B: be careful of alignment - constexpr auto a_block_desc_k0_k10_m_k1perinst = GetABlockDescriptor_K0PerBlock_K10_MPerBlock_K1PerInst(); + constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - constexpr auto b_block_desc_k0_k10_n_k1perinst = GetBBlockDescriptor_K0PerBlock_K10_NPerBlock_K1PerInst(); + constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - constexpr auto max_lds_align = a_block_desc_k0_k10_m_k1perinst.GetLength(I3); + constexpr auto max_lds_align = K1; constexpr auto a_block_space_size_aligned = - math::integer_least_multiple(a_block_desc_k0_k10_m_k1perinst.GetElementSpaceSize(), max_lds_align); + math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); constexpr auto b_block_space_size_aligned = - math::integer_least_multiple(b_block_desc_k0_k10_n_k1perinst.GetElementSpaceSize(), max_lds_align); - - constexpr auto c_block_size = 0; -#ifndef DISABLE_C_SHUFFLE - // LDS allocation for C shuffle in LDS - constexpr auto c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = - GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma(); - - constexpr auto c_block_size = - c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetElementSpaceSize(); -#endif - return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * - sizeof(FloatAB), - c_block_size * sizeof(FloatC)); + math::integer_least_multiple(b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB); } // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} @@ -293,7 +249,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 template __host__ __device__ static constexpr auto - 
MakeCGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma( + MakeCGridDescriptor_MBlock_MRepeat_Mwave_MSubGroup_NBlock_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs( const CGridDesc_M_N_& c_grid_desc_m_n) { const auto M = c_grid_desc_m_n.GetLength(I0); @@ -305,17 +261,21 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); - const auto c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = + constexpr index_t MLaneHigh = 2; + constexpr index_t MLaneLow = NWmmaPerWave / MLaneHigh; + constexpr index_t NLane = NWmmaPerWave; + + const auto c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( c_grid_desc_m_n, make_tuple(make_unmerge_transform(make_tuple( - MBlock, Number{}, Number{})), + MBlock, Number{}, Number{}, Number{}, Number{})), make_unmerge_transform(make_tuple( - NBlock, Number{}, Number{}))), + NBlock, Number{}, Number{}, Number{}))), make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + make_tuple(Sequence<0, 1, 2, 3, 8>{}, Sequence<4, 5, 6, 7>{})); - return c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma; + return c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs; } // return block_id to C matrix tile idx (m0, n0) mapping @@ -325,21 +285,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 return BlockToCTileMap_M00_N0_M01Adapt( c_grid_desc_m_n); } - using CGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma = + using CGridDescriptor_MBlock_MRepeat_Mwave_MSubGroup_NBlock_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs = remove_cvref_t; -#ifndef DISABLE_C_SHUFFLE - using C0GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma = - 
remove_cvref_t; - - using C1GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma = - remove_cvref_t; -#endif using DefaultBlock2CTileMap = remove_cvref_t; @@ -348,74 +297,45 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 Run(const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, - const FloatC* __restrict__ p_c0_grid, - const FloatC* __restrict__ p_c1_grid, void* __restrict__ p_shared, const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, - const CGridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma& - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, -#ifndef DISABLE_C_SHUFFLE - const C0GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma& - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - const C1GridDescriptor_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma& - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, -#endif + const CGridDescriptor_MBlock_MRepeat_Mwave_MSubGroup_NBlock_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs& + c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs, const AElementwiseOperation& a_element_op, const BElementwiseOperation& b_element_op, const CElementwiseOperation& c_element_op, const Block2CTileMap& block_2_ctile_map) { +// clang-format off +/*******************************************************************************/ +// Memory buffer zone. 
const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( - p_c_grid, - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetElementSpaceSize()); -#ifndef DISABLE_C_SHUFFLE - auto c0_grid_buf = make_dynamic_buffer( - p_c0_grid, - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetElementSpaceSize()); - auto c1_grid_buf = make_dynamic_buffer( - p_c1_grid, - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetElementSpaceSize()); -#endif - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - - // divide block work by [M, N] - const auto block_work_idx = - block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + p_c_grid, c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetElementSpaceSize()); +/*******************************************************************************/ +// BlockIdx.x -> [BlockId.m, BlockId.n] + const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); if(!block_2_ctile_map.ValidCTileIndex( block_work_idx, - make_tuple( - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetLength(I0), - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetLength(I3)))) - { - return; - } + make_tuple(c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I0), + c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4)))) + { return; } - // HACK: this force m/n_block_data_idx_on_grid into SGPR - const index_t m_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); - - const 
index_t n_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0_k10_m_k1perinst = GetABlockDescriptor_K0PerBlock_MPerBlock_K10_K1PerInst(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_k0_k10_n_k1perinst = GetBBlockDescriptor_K0PerBlock_NPerBlock_K10_K1PerInst(); + // Store BlockId into SGPR + const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); - // lds max alignment - constexpr auto max_lds_align = a_block_desc_k0_m_k10_k11.GetLength(I3); +/*******************************************************************************/ +// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + constexpr auto max_lds_align = K1; + constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); // A matrix blockwise copy auto a_blockwise_copy = @@ -429,7 +349,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 /* typename SrcData, */ FloatAB, /* typename DstData, */ FloatAB, /* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), -/* typename DstDesc, */ decltype(a_block_desc_k0_k10_m_k1perinst), +/* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, /* typename DstDimAccessOrder, */ Sequence<1, 0, 2>, /* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, @@ -443,7 +363,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 a_grid_desc_k0_m_k1, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, - a_block_desc_k0_k10_m_k1perinst, + a_block_desc_k0perblock_mperblock_k1, 
make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); @@ -459,7 +379,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 FloatAB, FloatAB, decltype(b_grid_desc_k0_n_k1), - decltype(b_block_desc_k0_k10_n_k1perinst), + decltype(b_block_desc_k0perblock_nperblock_k1), BBlockTransferSrcAccessOrder, Sequence<1, 0, 2>, BBlockTransferSrcVectorDim, @@ -473,43 +393,42 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 b_grid_desc_k0_n_k1, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, - b_block_desc_k0_k10_n_k1perinst, + b_block_desc_k0perblock_nperblock_k1, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); +/*******************************************************************************/ // GEMM definition - // c_mtx += transpose(a_mtx) * b_mtx - // a_mtx[K0PerBlock, MPerBlock] is in LDS - // b_mtx[K0PerBlock, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in - // register - // sanity check + // c_mtx += a_mtx * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in register - auto blockwise_gemm = - BlockwiseGemmWmmaops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + constexpr auto WmmaK = 16; + constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + auto blockwise_gemm = + BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3{}; + + // Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); +/*******************************************************************************/ + constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_space_size_aligned = - math::integer_least_multiple(a_block_desc_k0_k10_m_k1perinst.GetElementSpaceSize(), max_lds_align); - - auto 
a_block_buf = make_dynamic_buffer( - static_cast(p_shared), a_block_desc_k0_k10_m_k1perinst.GetElementSpaceSize()); - - auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + a_block_space_size_aligned, - b_block_desc_k0_k10_n_k1perinst.GetElementSpaceSize()); - + auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + + // Shift Per SUB_K constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); @@ -517,13 +436,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, - a_block_desc_k0_k10_m_k1perinst, + a_block_desc_k0perblock_mperblock_k1, a_blockwise_copy, a_grid_buf, a_block_buf, a_block_slice_copy_step, b_grid_desc_k0_n_k1, - b_block_desc_k0_k10_n_k1perinst, + b_block_desc_k0perblock_nperblock_k1, b_blockwise_copy, b_grid_buf, b_block_buf, @@ -531,270 +450,79 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 blockwise_gemm, c_thread_buf, K0BlockMainLoop); -#ifndef DISABLE_C_SHUFFLE - // shuffle C and write out + // NO C-shuffle, direct write { - static_assert(MWmmaPerWave % CShuffleMWmmaPerWavePerShuffle == 0 && - NWmmaPerWave % CShuffleNWmmaPerWavePerShuffle == 0, - "wrong!"); - - constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); - constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); - - // TODO: hacky, fix it! - constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = - blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); - - // TODO: hacky, fix it! 
- // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths - constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = - blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); - - constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); - constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); - constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); - constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); - constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); - constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); - constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); - constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); - - constexpr auto c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma = - GetCBlockDescriptor_MBlock_NWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma(); - - auto c_block_buf = make_dynamic_buffer( - static_cast(p_shared), - c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma - .GetElementSpaceSize()); - - constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( - c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - make_tuple(make_freeze_transform(I0), // freeze mblock - make_pass_through_transform( - Number{}), // M0 (MWmmaPerWave) per - // shuffle - make_unmerge_transform( - make_tuple(M1, M2, M3, M4)), // M1 = MWave, M2 * M3 * M4 = MPerWmma - make_freeze_transform(I0), // freeze nblock - make_pass_through_transform( - Number{}), // N0 (NWmmaPerWave) per - // shuffle - make_unmerge_transform( - make_tuple(N1, N2))), // M1 = MWave, M2 * M3 * M4 = MPerWmma - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<2, 4, 5, 
6>{}, - Sequence<>{}, - Sequence<1>{}, - Sequence<3, 7>{}) - - ); - - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = - blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); - - const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; - const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; - - const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), - make_tuple(Sequence<0, 1, 2, 3, 4>{}), - make_tuple(Sequence<0>{})); - - const auto m_thread_data_on_block_idx = - m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_block)); - - const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_block_idx = - n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_block)); - - // VGPR to LDS - auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1, 2, 3, 4, 5, 6, 7>, - 7, - 1, - InMemoryDataOperationEnum::Set, - 1, - true>{ - c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, - make_multi_index(0, - 0, - m_thread_data_on_block_idx[I1], - n_thread_data_on_block_idx[I1], - m_thread_data_on_block_idx[I2], - m_thread_data_on_block_idx[I3], - m_thread_data_on_block_idx[I4], - n_thread_data_on_block_idx[I2]), - ck::tensor_operation::element_wise::PassThrough{}}; - - auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r3< - ThisThreadBlock, // ThreadGroup - CElementwiseOperation, // ElementwiseOperation, - CGlobalMemoryDataOperation, // DstInMemOp, - Sequence<1, - CShuffleMWmmaPerWavePerShuffle, - MWave * MPerWmma, - 1, - 
CShuffleNWmmaPerWavePerShuffle, - NWave * NPerWmma>, // BlockSliceLengths, - CBlockTransferClusterLengths_MBlock_MWmmaPerWave_MWaveMPerWmma_NBlock_NWmmaPerWave_NWaveNPerWmma, - Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder, - FloatC, // typename Src0Data, - FloatC, // typename Src1Data, - FloatC, // typename Src2Data, - FloatC, // typename DstData, - decltype( - c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), - decltype( - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), - decltype( - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), - decltype( - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma), - Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder, - 5, // index_t VectorDim, - CBlockTransferScalarPerVector_NWaveNPerWmma, // index_t ScalarPerVector, - true, // bool ThreadTransferSrc0ResetCoordinateAfterRun, - false, // bool ThreadTransferSrc1ResetCoordinateAfterRun, - false, // bool ThreadTransferSrc2ResetCoordinateAfterRun, - false> // bool ThreadTransferDstResetCoordinateAfterRun> - {c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - make_multi_index(0, 0, 0, 0, 0, 0), - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), - c_element_op}; - - constexpr auto mwmmaperwave_forward_step = - make_multi_index(0, CShuffleMWmmaPerWavePerShuffle, 0, 0, 0, 0); - constexpr auto nwmmaperwave_forward_step = - make_multi_index(0, 0, 0, 0, CShuffleNWmmaPerWavePerShuffle, 0); - 
constexpr auto nwmmaperwave_backward_step = - make_multi_index(0, 0, 0, 0, -CShuffleNWmmaPerWavePerShuffle, 0); - - static_for<0, MWmmaPerWave, CShuffleMWmmaPerWavePerShuffle>{}([&](auto mwmmaperwave_iter) { - constexpr auto mwmmaperwave = mwmmaperwave_iter; - - static_for<0, - NWmmaPerWave, - CShuffleNWmmaPerWavePerShuffle>{}([&](auto nwmmaperwave_iter) { - constexpr bool nwmmaperwave_forward_sweep = - (mwmmaperwave % (2 * CShuffleMWmmaPerWavePerShuffle) == 0); - - constexpr index_t nwmmaperwave_value = - nwmmaperwave_forward_sweep - ? nwmmaperwave_iter - : (NWmmaPerWave - nwmmaperwave_iter - CShuffleNWmmaPerWavePerShuffle); - - constexpr auto nwmmaperwave = Number{}; - - // make sure it's safe to do ds_write - block_sync_lds(); - - // VGPR to LDS - c_thread_copy_vgpr_to_lds.Run( - c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, - make_tuple(mwmmaperwave, nwmmaperwave, I0, I0, I0, I0, I0, I0), - c_thread_buf, - c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, - c_block_buf); - - // make sure it's safe to do ds_read - block_sync_lds(); - - // LDS to global - c_block_copy_lds_to_global.Run( - c_block_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - c_block_buf, - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - c0_grid_buf, - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - c1_grid_buf, - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - c_grid_buf); - - // move on nwmmaperwave dimension - if constexpr(nwmmaperwave_forward_sweep && - (nwmmaperwave < NWmmaPerWave - CShuffleNWmmaPerWavePerShuffle)) - { - c_block_copy_lds_to_global.MoveSrc1SliceWindow( - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - nwmmaperwave_forward_step); - - c_block_copy_lds_to_global.MoveSrc2SliceWindow( - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - nwmmaperwave_forward_step); - - 
c_block_copy_lds_to_global.MoveDstSliceWindow( - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - nwmmaperwave_forward_step); - } - else if constexpr((!nwmmaperwave_forward_sweep) && (nwmmaperwave > 0)) - { - c_block_copy_lds_to_global.MoveSrc1SliceWindow( - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - nwmmaperwave_backward_step); - - c_block_copy_lds_to_global.MoveSrc2SliceWindow( - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - nwmmaperwave_backward_step); - - c_block_copy_lds_to_global.MoveDstSliceWindow( - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - nwmmaperwave_backward_step); - } - }); - - // move on mwmmaperwave dimension - if constexpr(mwmmaperwave < MWmmaPerWave - CShuffleMWmmaPerWavePerShuffle) - { - c_block_copy_lds_to_global.MoveSrc1SliceWindow( - c0_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - mwmmaperwave_forward_step); - - c_block_copy_lds_to_global.MoveSrc2SliceWindow( - c1_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - mwmmaperwave_forward_step); - - c_block_copy_lds_to_global.MoveDstSliceWindow( - c_grid_desc_mblock_mwmmaperwave_mwavemperwmma_nblock_nwmmaperwave_nwavenperwmma, - mwmmaperwave_forward_step); - } - }); + constexpr c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MLaneHigh_NRepeat_NWave_NLane_MLaneLow(); + constexpr c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.MakeCDesc_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MRepeat = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I0); + constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I1); + 
constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I2); + constexpr auto NRepeat = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I3); + constexpr auto Nwave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4); + constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I5); + constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I6); + + // Mapping + const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); + const index_t m_thread_data_on_grid = m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_grid = n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(NRepeat, Nwave, NThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_grid_idx = m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor( + make_multi_index(m_thread_data_on_grid)); + + const auto n_thread_data_on_grid_idx = n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup( + make_multi_index(n_thread_data_on_grid)); + + + auto c_thread_copy = + ThreadwiseTensorSliceTransfer_v1r3< + /* typename SrcData */ FloatAcc, + /* typename DstData */ FloatC, + /* typename SrcDesc */ 
decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs), + /* typename DstDesc */ decltype(c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs), + /* typename ElementwiseOperation */ CElementwiseOperation, + /* typename SliceLengths */ Sequence, + /* typename DimAccessOrder */ CThreadTransferSrcDstAccessOrder, + /* index_t DstVectorDim */ CThreadTransferSrcDstVectorDim, + /* index_t DstScalarPerVector */ CThreadTransferDstScalarPerVector, + /* InMemoryDataOperationEnum DstInMemOp */ CGlobalMemoryDataOperation, + /* index_t DstScalarStrideInVector */ 1, + /* bool DstResetCoordinateAfterRun */ true> + { + /* dst_desc */ c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* dst_slice_origin_idx */ make_multi_index(m_thread_data_on_grid_idx[I0], + m_thread_data_on_grid_idx[I1], + m_thread_data_on_grid_idx[I2], + n_thread_data_on_grid_idx[I0], + n_thread_data_on_grid_idx[I1], + n_thread_data_on_grid_idx[I2], + m_thread_data_on_grid_idx[I3]), + /* element_op */ c_element_op + }; + + c_thread_copy.Run( + /* c_thread_desc */ c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* c_start point */ make_tuple(I0, I0, I0, I0, I0, I0, I0), + /* c_buffer */ c_thread_buf, + /* c_grid_desc */ c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* c_grid_buf */ c_grid_buf); } -#endif + // clang-format on } }; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 31cf4b82b1c..2254521b1f3 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -11,34 +11,106 @@ namespace ck { enum struct WmmaInstr { - wmma_f32_16x16x16_f16_w32 = 0, - wmma_f32_16x16x16_bf16_w32 = 0, - wmma_f16_16x16x16_f16_w32 = 0, - wmma_bf16_16x16x16_bf16_w32 = 0, - 
wmma_i32_16x16x16_iu8_w32 = 0, - wmma_i32_16x16x16_iu4_w32 = 0 + wmma_f32_16x16x16_f16 = 0, + wmma_f32_16x16x16_bf16 = 0, + wmma_f16_16x16x16_f16 = 0, + wmma_bf16_16x16x16_bf16 = 0, + wmma_i32_16x16x16_iu8 = 0, + wmma_i32_16x16x16_iu4 = 0 }; -template +/* + * WMMA Wave Tile Always MxNxK = 16x16x16 + * WAVE32 + ----------------------------------- + |RC0| | | | | | | | | | | | | | | | SubGroup 0 + |RC1| | | | | | | | | | | | | | | | + |RC2| | | | | | | | | | | | | | | | + |RC3|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| + |RC4|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1| + |RC5|1|2|3|4|5|6|7|8|9|0|1|2|3|4|5| + |RC6| | | | | | | | | | | | | | | | + |RC7| | | | | | | | | | | | | | | | + ----------------------------------- + | | | | | | | | | | | | | | | | | SubGroup 1 + | | | | | | | | | | | | | | | | | + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| + | 1 |1|1|1|2|2|2|2|2|2|2|2|2|2|3|3| + | 6 |7|8|9|0|1|2|3|4|5|6|7|8|9|0|1| + | | | | | | | | | | | | | | | | | + | | | | | | | | | | | | | | | | | + | | | | | | | | | | | | | | | | | + ----------------------------------- + + + * WAVE64 + ----------------------------------- + |RC0|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 0 + |RC1|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1| + |RC2|1|2|3|4|5|6|7|8|9|0|1|2|3|4|5| + |RC3|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| + ----------------------------------- + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 1 + | 1 |1|1|1|2|2|2|2|2|2|2|2|2|2|3|3| + | 6 |7|8|9|0|1|2|3|4|5|6|7|8|9|0|1| + | | | | | | | | | | | | | | | | | + ----------------------------------- + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 2 + | 3 |3|3|3|3|3|3|3|4|4|4|4|4|4|4|4| + | 2 |3|4|5|6|7|8|9|0|1|2|3|4|5|6|7| + | | | | | | | | | | | | | | | | | + ----------------------------------- + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 3 + | 4 |4|5|5|5|5|5|5|5|5|5|5|6|6|6|6| + | 8 |9|0|1|2|3|4|5|6|7|8|9|0|1|2|3| + | | | | | | | | | | | | | | | | | + ----------------------------------- + +* RC = Register for storing accumalted result +* T = Thread ID +*/ + +template :: = false> struct 
wmma_type; -template <> -struct wmma_type +// A-swizzled +template +struct wmma_type { - static constexpr index_t m_per_wmma = 16; - static constexpr index_t n_per_wmma = 16; - static constexpr index_t k_per_wmma = 16; - static constexpr index_t wave_size = 32; - static constexpr index_t lane_size = 16; - static constexpr index_t src_data_size = 2; - static constexpr index_t acc_data_size = 4; - static constexpr index_t num_srcregs_per_wmma = 8; - static constexpr index_t num_accregs_per_wmma = 8; +// Absolute fixing property + // * Data Pixel + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t src_a_data_size = 2; + static constexpr index_t src_b_data_size = 2; + static constexpr index_t acc_data_size = 4; + // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + +// Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + // * Fixed in Navi3x, Will be wave mode dependent on Navi4x + static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; + static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; + // * num_acc_vgprs_per_wave alone M direction + // * num_subgroups alone M direction + static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const { - intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); + if constexpr(wave_size == 32) + { + intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); + } + else if constexpr(wave_size == 64) + { + intrin_wmma_f32_16x16x16_f16_w64::Run(a, b, reg_c); + } } }; @@ -51,54 +123,54 @@ struct WmmaSelector template <> static constexpr auto GetWmma() { - 
return WmmaInstr::wmma_f32_16x16x16_f16_w32; + return WmmaInstr::wmma_f32_16x16x16_f16; } template <> static constexpr auto GetWmma() { - return WmmaInstr::wmma_f32_16x16x16_bf16_w32; + return WmmaInstr::wmma_f32_16x16x16_bf16; } template <> static constexpr auto GetWmma() { - return WmmaInstr::wmma_f16_16x16x16_f16_w32; + return WmmaInstr::wmma_f16_16x16x16_f16; } template <> static constexpr auto GetWmma() { - return WmmaInstr::wmma_bf16_16x16x16_bf16_w32; + return WmmaInstr::wmma_bf16_16x16x16_bf16; } template <> static constexpr auto GetWmma() { - return WmmaInstr::wmma_i32_16x16x16_iu8_w32; + return WmmaInstr::wmma_i32_16x16x16_iu8; } #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 template <> static constexpr auto GetWmma() { - return WmmaInstr::wmma_i32_16x16x16_iu4_w32; + return WmmaInstr::wmma_i32_16x16x16_iu4; } #endif - static constexpr auto selected_wmma = wmma_type()>{}; + static constexpr auto selected_wmma = wmma_type(), get_warp_size()>{}; __host__ __device__ constexpr WmmaSelector() { - static_assert(selected_wmma.m_per_wmma == selected_wmma.n_per_wmma, - "WRONG! WMMA_M must equal to WMMA_N"); + static_assert(selected_wmma.m_per_wmma == 16, + "WRONG! WMMA_M must equal to 16"); - static_assert(selected_wmma.m_per_wmma == selected_wmma.k_per_wmma, - "WRONG! WMMA_M must equal to WMMA_K"); + static_assert(selected_wmma.m_per_wmma == 16, + "WRONG! WMMA_M must equal to 16"); static_assert(selected_wmma.k_per_wmma == 16, - "WRONG! WMMA_M must equal to WMMA_N"); + "WRONG! WMMA_M must equal to 16"); - static_assert(selected_wmma.wave_size * selected_wmma.num_accregs_per_wmma * selected_wmma.acc_data_size== + static_assert(selected_wmma.wave_size * selected_wmma.num_acc_vgprs_per_wave * selected_wmma.acc_data_size== selected_wmma.m_per_wmma * selected_wmma.n_per_wmma * 4, "WRONG! 
Number of Accumulator Register"); @@ -135,26 +207,26 @@ struct WmmaGemm } // XDL output supporting C = A * B - // M2_N2 -> M2_M3_M4_N2 - template + // MPerWMMA_NPerWMMA -> MSubGroup_..._NPerWMMA_MAccVgprPerWave + template __host__ __device__ static constexpr auto - MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2) + MakeCDesc_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs + (const CDesc_MRepeat_Mwave_MPerWMMA_NRepeat_NWave_NPerWMMA& c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma) { - const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0); - const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1); - const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2); - const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3); + const auto MRepeat = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I0); + const auto NRepeat = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I3); + const auto MWave = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I1); + const auto NWave = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I4); return transform_tensor_descriptor( - c_desc_m0_n0_m1_n1_m2_n2, - make_tuple(make_pass_through_transform(M0), - make_pass_through_transform(N0), - make_pass_through_transform(M1), - make_pass_through_transform(N1), - make_unmerge_transform(make_tuple(Number{}, - Number{}, - Number{})), - make_pass_through_transform(Number{})), + c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma, + make_tuple(make_pass_through_transform(MRepeat), + make_pass_through_transform(Mwave), + make_unmerge_transform(make_tuple(Number{}, + Number{})), + make_pass_through_transform(NRepeat), + make_pass_through_transform(NWave), + make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -163,91 +235,22 @@ struct WmmaGemm Sequence<5>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - 
Sequence<4, 5, 6>{}, - Sequence<7>{})); - } - - // transposed XDL output supporting C' = B' * A' - // M2_N2 -> M2_N2_N3_N4 - template - __host__ __device__ static constexpr auto - MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2) - { - const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0); - const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1); - const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2); - const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3); - - return transform_tensor_descriptor( - c_desc_m0_n0_m1_n1_m2_n2, - make_tuple(make_pass_through_transform(M0), - make_pass_through_transform(N0), - make_pass_through_transform(M1), - make_pass_through_transform(N1), - make_pass_through_transform(Number{}), - make_unmerge_transform(make_tuple(Number{}, - Number{}, - Number{}))), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, + Sequence<2, 6>{}, Sequence<3>{}, Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5, 6, 7>{})); + Sequence<5>{})); } - template - __host__ __device__ static constexpr auto MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( - const CDesc_G_M0_N0_M1_N1_M2_N2& c_desc_g_m0_n0_m1_n1_m2_n2) + __device__ static constexpr index_t GetRegSizePerWmma() { - const auto G = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I0); - const auto M0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I1); - const auto N0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I2); - const auto M1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I3); - const auto N1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I4); - - return transform_tensor_descriptor( - c_desc_g_m0_n0_m1_n1_m2_n2, - make_tuple(make_pass_through_transform(G), - make_pass_through_transform(M0), - make_pass_through_transform(N0), - make_pass_through_transform(M1), - make_pass_through_transform(N1), - make_unmerge_transform(make_tuple(wmma_instr.num_groups_per_blk, - wmma_instr.num_input_blks, - 
wmma_instr.group_size)), - make_pass_through_transform(wmma_instr.num_threads_per_blk)), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5>{}, - Sequence<6>{}), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5, 6, 7>{}, - Sequence<8>{})); + return wmma_instr.num_acc_vgprs_per_wave; } - __device__ static constexpr index_t GetRegSizePerXdlops() - { - return MPerWmma * NPerWmma / wmma_instr.wave_size; + __device__ static constexpr index_t GetWaveSize() + { + return wmma_instr.wave_size; } - __device__ static constexpr index_t GetWaveSize() { return wmma_instr.wave_size; } - template __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { @@ -272,67 +275,50 @@ struct WmmaGemm } } - __device__ static auto GetLaneId() { return get_thread_local_1d_id() % wmma_instr.wave_size; } + __device__ static auto GetLaneId() + { + return get_thread_local_1d_id() % wmma_instr.wave_size; + } - __device__ static auto GetLaneIdHigh() + __device__ static auto GetSubGroupId() { - return GetLaneId() / 16; + return (GetLaneId() / wmma_instr.num_thread_per_subgroups) % wmma_instr.num_subgroups; } - __device__ static auto GetLaneIdLow() + __device__ static auto GetLaneIdUnderSubGroup() { - return GetLaneId() % 16; + return GetLaneId() % wmma_instr.num_thread_per_subgroups; } __device__ static auto GetSwizzledLaneIdLow() { - return ((GetLaneIdLow() & 1) << 3 ) | (GetLaneIdLow() >> 1); + return ((GetLaneIdUnderSubGroup() & 1) << 3 ) | (GetLaneIdUnderSubGroup() >> 1); } __host__ __device__ static auto CalculateAThreadOriginDataIndex() { - return make_tuple(0, GetSwizzledLaneIdLow()); + return GetSwizzledLaneIdLow(); } __host__ __device__ static auto CalculateBThreadOriginDataIndex() { - return make_tuple(0, GetLaneIdLow()); + return GetLaneIdUnderSubGroup(); } - __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i) 
+ __device__ static CIndex GetBeginOfThreadBlk() { - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - index_t n_offset = blk_i * wmma_instr.n_per_blk + blk_td; - index_t m_offset = xdlops_i * wmma_instr.m_per_blk + blk_id * wmma_instr.group_size; + index_t n_offset = GetLaneIdUnderSubGroup(); + index_t m_offset = GetSubGroupId() * wmma_instr.num_acc_vgprs_per_wave; return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; } - __device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */) - { - const auto blk_idx = GetBlkIdx(); - - const auto blk_id = blk_idx[I0]; - const auto blk_td = blk_idx[I1]; - - return TransposeC ? CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td}; - } - static constexpr auto wmma = WmmaSelector{}; - static constexpr auto wmma_instr = wmma.selected_wmma; - static constexpr auto KPerXdlops = wmma.GetKPerXdlops(); - static constexpr auto K1PerXdlops = wmma.GetK1PerXdlops(); - static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; - - __host__ __device__ static constexpr auto GetCM0M1M2NThreadBlkLengths() + __host__ __device__ static constexpr auto GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths() { return make_tuple( - Number{}, I1, Number{}, I1); + Number{}); } }; diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index ee3759d7e48..2da5537c2e7 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -8,7 +8,6 @@ // TODO: Add arch limitation namespace ck { -// wave32 only // src: fp16, dst: fp32 template struct intrin_wmma_f32_16x16x16_f16_w32; @@ -24,6 +23,20 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> } }; +template +struct intrin_wmma_f32_16x16x16_f16_w64; + +template <> +struct intrin_wmma_f32_16x16x16_f16_w64<16, 16> +{ + template + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) + { + 
reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + // src: bf16, dst: fp32 template struct intrin_wmma_f32_16x16x16_bf16_w32; From 9adf2e60dba1abe03c8c74e5d1668e2f69aacff4 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 30 Nov 2022 08:11:16 +0000 Subject: [PATCH 009/118] runtime bug, cannot find symbol --- example/01_gemm/gemm_wmma_fp16.cpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 162 +++++++----------- .../gpu/device/impl/device_gemm_wmma.hpp | 49 +++--- ...m_wmma_v1r1.hpp => gridwise_gemm_wmma.hpp} | 159 +++++++++-------- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 44 +++-- 5 files changed, 201 insertions(+), 215 deletions(-) rename include/ck/tensor_operation/gpu/grid/{gridwise_gemm_wmma_v1r1.hpp => gridwise_gemm_wmma.hpp} (78%) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index d76ff09a4d9..774207c3e54 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma // ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| // ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>; + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, 6, 1>; // clang-format on diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 5b211055cd7..c5b574b75c5 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -10,16 +10,6 @@ namespace ck { -enum struct LoopScheduler -{ - Default, -}; - -constexpr LoopScheduler make_default_loop_scheduler() -{ - return LoopScheduler::Default; -} - template -// MRepeat_MWave_MLaneHigh_NRepeat_NWave_NLane_MLanelow +/* A: K0PerBlock x MPerBlock x K1 + * B: K0PerBlock x NPerBlock x K1 + * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs + */ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; - static constexpr auto I3 = Number<4>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto WmmaK = Number<16>{}; using ThisThreadBlock = ThisThreadBlock; - static constexpr index_t WaveSize = get_warp_size(); + static constexpr index_t WaveSize = 32; static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); @@ -52,7 +46,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); - static constexpr auto wmma_gemm = WMMAGemm{}; + static constexpr auto wmma_gemm 
= WmmaGemm{}; static constexpr index_t KPerThread = KPerBlock / wmma_gemm.K0PerWMMA; @@ -62,7 +56,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 StaticBufferTupleOfVector c_thread_buf_; @@ -87,7 +81,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 const auto waveId_m = wave_idx[I0]; const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); - // |KRepeat |MRepeat|Mwave |MLane |KPack + // |KRepeat |MRepeat|MWave |MLane |KPack return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); } @@ -131,7 +125,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 return make_tuple(c_thread_m, c_thread_n); } - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1() + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3() { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && BK0NK1BlockDesc::IsKnownAtCompileTime(), @@ -157,76 +151,49 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 make_tuple(Number{}, I1, MSubGroup, Number{}, I1, NThreadPerSubGroup, MAccVgprs)); } - __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number{})); - - return wmma_gemm.MakeCDesc_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); - } - template __host__ __device__ static constexpr auto - MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(const CGridDesc_M_N& c_grid_desc_m_n) { const auto M = c_grid_desc_m_n.GetLength(I0); const auto N = c_grid_desc_m_n.GetLength(I1); - const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + const auto 
c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = transform_tensor_descriptor( c_grid_desc_m_n, make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - return wmma_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); + return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); } - __host__ __device__ static constexpr auto MakeABlockDescriptor_KRepeat_M0_M1_M2_KPack() + __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() { - static constexpr auto a_block_desc_temp_km0m1m2 = transform_tensor_descriptor( - AK0MK1BlockDesc{}, - make_tuple( - make_merge_transform(make_tuple(Number{}, Number{})), - make_unmerge_transform(make_tuple(Number{}, Number{}, Number{}))), - make_tuple(Sequence<0, 2>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{})); - return transform_tensor_descriptor( - a_block_desc_temp_km0m1m2, + AK0MK1BlockDesc{}, make_tuple( - make_unmerge_transform(make_tuple(Number{}, Number{})), - make_pass_through_transform(make_tuple(Number{}, Number{}, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}), - make_tuple(Sequence<0, 4>{}, Sequence<1, 2, 3>{})); + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } - __host__ __device__ static constexpr auto MakeBBlockDescriptor_KRepeat_N0_N1_N2_KPack() + __host__ __device__ static constexpr auto 
MakeBBlockDescriptor_K0_N0_N1_N2_K1() { - static constexpr auto b_block_desc_temp_kn0n1n2 = transform_tensor_descriptor( - BK0NK1BlockDesc{}, - make_tuple( - make_merge_transform(make_tuple(Number{}, Number{})), - make_unmerge_transform(make_tuple(Number{}, Number{}, Number{}))), - make_tuple(Sequence<0, 2>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{})); - return transform_tensor_descriptor( - b_block_desc_temp_kn0n1n2, + BK0NK1BlockDesc{}, make_tuple( - make_unmerge_transform(make_tuple(Number{}, Number{})), - make_pass_through_transform(make_tuple(Number{}, Number{}, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}), - make_tuple(Sequence<0, 4>{}, Sequence<1, 2, 3>{})); + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } - static constexpr auto a_block_desc_krepeat_m0_m1_m2_kpack = MakeABlockDescriptor_KRepeat_M0_M1_M2_KPack(); - static constexpr auto b_block_desc_krepeat_n0_n1_n2_kpack = MakeBBlockDescriptor_KRepeat_N0_N1_N2_KPack(); + static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); + static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); template __device__ void Run(const ABlockBuffer& a_block_buf, @@ -239,9 +206,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 b_thread_desc_.GetElementSpaceSize()); constexpr auto RepeatDiff = MRepeat - NRepeat; - constexpr auto WmmaK = wmma_gemm.k_per_wmma; - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto iWmmaK){ + static_for<0, KPerBlock, WmmaK>{}([&](auto iWmmaK){ // Cut to Repeat Retangle to Square, assume MRepeat > NRepeat static_for<0, RepeatDiff, 1>{}([&](auto iCut){ static_for<0, NRepeat, 1>{}([&](auto iN){ @@ -251,25 +217,25 @@ struct 
BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK/A_K1, iCut, 0, 0, iK%A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK/B_K1, iN, 0, 0, iK%B_K1))>{}]; }); using wmma_input_type = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); wmma_gemm.template Run( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - a_thread_copy_.Run(a_block_desc_krepeat_m0_m1_m2_kpack, - make_tuple(Number{}, iCut, I0, I0, I0), + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, I0, I0, Number{}), a_block_buf, a_thread_desc_, - make_tuple(I0, I0, I0, I0), + make_tuple(I0, Number{}, I0, I0, I0), a_thread_buf); }); // Run FIFO fashion loopover in Square @@ -281,25 +247,25 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK/A_K1, WmmaInnerloop+RepeatDiff, 0, 0, iK%A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK/B_K1, iN, 0, 0, iK%B_K1))>{}]; }); using wmma_input_type = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); wmma_gemm.template Run( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - a_thread_copy_.Run(a_block_desc_krepeat_m0_m1_m2_kpack, - make_tuple(Number{}, WmmaInnerloop+RepeatDiff, I0, I0, I0), + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, 
I0, I0, Number{}), a_block_buf, a_thread_desc_, - make_tuple(I0, I0, I0, I0), + make_tuple(I0, Number{}, I0, I0, I0), a_thread_buf); static_for{}([&](auto iM){ vector_type a_thread_vec; @@ -308,25 +274,25 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK/A_K1, iM, 0, 0, iK%A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK/B_K1, WmmaInnerloop, 0, 0, iK%B_K1))>{}]; }); using wmma_input_type = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); wmma_gemm.template Run( - a_thread_vec.template AsType(), - b_thread_vec.template AsType(), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - b_thread_copy_.Run(b_block_desc_krepeat_n0_n1_n2_kpack, - make_tuple(Number{}, WmmaInnerloop, I0, I0, I0), + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, Number{}), b_block_buf, b_thread_desc_, - make_tuple(I0, I0, I0, I0), + make_tuple(I0, Number{}, I0, I0, I0), b_thread_buf); }); }); @@ -335,33 +301,33 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 protected: // A[M0, M1, M2, K0 = WmmaK] static constexpr auto a_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{}, I1, I1, Number{})); // B[N0, N1, N2, K0 = WmmaK] static constexpr auto b_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{}, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWMMA())); + make_tuple(Number{}, Number{}, 
wmma_gemm.GetRegSizePerWmma())); using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, + Sequence, + Sequence<3, 0, 1, 2, 4>, + 4, A_K1, A_K1>; using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3>, - 3, + Sequence, + Sequence<3, 0, 1, 2, 4>, + 4, B_K1, B_K1>; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index c5a9bf5ff60..849f024740d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -12,7 +12,7 @@ #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_gemm.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -38,8 +38,8 @@ template , // CThreadTransferSrcDstAccessOrder, + Sequence<0, 1, 2, 3, 4, 5, 6>, // CThreadTransferSrcDstAccessOrder, CThreadTransferSrcDstVectorDim, CThreadTransferDstScalarPerVector, -#endif NumPrefetch, + LoopSched, PipelineVer>; // Argument - struct Argument : public BaseArgumentW + struct Argument : public BaseArgument { Argument(const ADataType* p_a_grid, const BDataType* p_b_grid, @@ -263,7 +262,7 @@ struct DeviceGemmWmma : public DeviceGemm, remove_reference_t, - remove_reference_t, + remove_reference_t, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, remove_reference_t, true>; // Last Option is W/O - + + std::cout<<"Host kernel type is "<< type_name()<, remove_reference_t, - remove_reference_t, + remove_reference_t, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, @@ -401,7 +402,7 @@ struct DeviceGemmWmma : public DeviceGemm" << " NumPrefetch: " << 
NumPrefetch << ", " diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp similarity index 78% rename from include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp rename to include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 7abacc2de35..765b0643f48 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_v1r1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -22,7 +22,7 @@ template -struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 +struct GridwiseGemm_k0mk1_k0nk1_mn_wmma { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -132,7 +133,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = remove_cvref_t())>; + GridwiseGemmPipeline_Selector())>; __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() { @@ -207,8 +208,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 static_assert(is_known_at_compile_time>::value, "wrong! 
K1 need to be known at compile-time"); - static_assert((MPerBlock % (MPerWmma * MWmmaPerWave) == 0) && - (NPerBlock % (NWmmaPerWave * NPerWmma)) == 0, + static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerWmma)) == 0, "Invalid tuning param!"); const auto M = a_grid_desc_k0_m_k1.GetLength(I1); @@ -247,35 +248,57 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } - template __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlock_MRepeat_Mwave_MSubGroup_NBlock_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - const CGridDesc_M_N_& c_grid_desc_m_n) + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const CGridDesc_M_N& c_grid_desc_m_n) { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto MBlock = M / MPerBlock; - const auto NBlock = N / NPerBlock; - - constexpr index_t MWave = MPerBlock / (MWmmaPerWave * MPerWmma); - constexpr index_t NWave = NPerBlock / (NWmmaPerWave * NPerWmma); - - constexpr index_t MLaneHigh = 2; - constexpr index_t MLaneLow = NWmmaPerWave / MLaneHigh; - constexpr index_t NLane = NWmmaPerWave; - - const auto c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple(make_unmerge_transform(make_tuple( - MBlock, Number{}, Number{}, Number{}, Number{})), - make_unmerge_transform(make_tuple( - NBlock, Number{}, Number{}, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2, 3, 8>{}, Sequence<4, 5, 6, 7>{})); - - return c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs; + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return 
make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + constexpr auto WmmaK = 16; + constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + + using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3; + + return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_m_n); } // return block_id to C matrix tile idx (m0, n0) mapping @@ -285,9 +308,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 return BlockToCTileMap_M00_N0_M01Adapt( c_grid_desc_m_n); } - using CGridDescriptor_MBlock_MRepeat_Mwave_MSubGroup_NBlock_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs = + using CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs = remove_cvref_t; using DefaultBlock2CTileMap = remove_cvref_t; @@ -300,8 +323,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 void* __restrict__ p_shared, const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, - const CGridDescriptor_MBlock_MRepeat_Mwave_MSubGroup_NBlock_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs& - c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs& + c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs, const 
AElementwiseOperation& a_element_op, const BElementwiseOperation& b_element_op, const CElementwiseOperation& c_element_op, @@ -315,15 +338,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetElementSpaceSize()); + p_c_grid, c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs.GetElementSpaceSize()); /*******************************************************************************/ // BlockIdx.x -> [BlockId.m, BlockId.n] const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); if(!block_2_ctile_map.ValidCTileIndex( block_work_idx, - make_tuple(c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I0), - c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4)))) + make_tuple(c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I0), + c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4)))) { return; } // Store BlockId into SGPR @@ -415,8 +438,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 decltype(b_block_desc_k0perblock_nperblock_k1), MPerWmma, NPerWmma, - MWmmaPerWave, - NWmmaPerWave, + MRepeat, + NRepeat, KPack>{}; // Prepare Register for C matrix @@ -450,20 +473,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 blockwise_gemm, c_thread_buf, K0BlockMainLoop); - // NO C-shuffle, direct write +/*******************************************************************************/ + // write out C matrix, c shuffle not implemented { - constexpr c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - 
blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MLaneHigh_NRepeat_NWave_NLane_MLaneLow(); - constexpr c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - blockwise_gemm.MakeCDesc_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - - constexpr auto MRepeat = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I0); - constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I1); - constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I2); - constexpr auto NRepeat = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I3); - constexpr auto Nwave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4); - constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I5); - constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I6); + constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MWave = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I1); + constexpr auto MSubGroup = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I2); + constexpr auto Nwave = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4); + constexpr auto NThreadPerSubGroup = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I5); + constexpr auto MAccVgprs = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I6); // Mapping 
const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); @@ -476,16 +496,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 make_tuple(Sequence<0, 1, 2, 3>{}), make_tuple(Sequence<0>{})); - const auto n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup = + const auto n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor = make_single_stage_tensor_adaptor( make_tuple(make_merge_transform(make_tuple(NRepeat, Nwave, NThreadPerSubGroup))), make_tuple(Sequence<0, 1, 2>{}), make_tuple(Sequence<0>{})); - const auto m_thread_data_on_grid_idx = m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor( + const auto m_thread_data_on_grid_idx = m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( make_multi_index(m_thread_data_on_grid)); - const auto n_thread_data_on_grid_idx = n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup( + const auto n_thread_data_on_grid_idx = n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( make_multi_index(n_thread_data_on_grid)); @@ -494,8 +514,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 /* typename SrcData */ FloatAcc, /* typename DstData */ FloatC, /* typename SrcDesc */ decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs), - /* typename DstDesc */ decltype(c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs), + /* typename DstDesc */ decltype(c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs), /* typename ElementwiseOperation */ CElementwiseOperation, + // Thread register Mapping /* typename SliceLengths */ Sequence, /* typename DimAccessOrder */ CThreadTransferSrcDstAccessOrder, /* index_t DstVectorDim */ CThreadTransferSrcDstVectorDim, @@ -504,7 +525,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 /* index_t DstScalarStrideInVector */ 1, /* bool DstResetCoordinateAfterRun */ true> 
{ - /* dst_desc */ c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* dst_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs, /* dst_slice_origin_idx */ make_multi_index(m_thread_data_on_grid_idx[I0], m_thread_data_on_grid_idx[I1], m_thread_data_on_grid_idx[I2], @@ -517,9 +538,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma_v1 c_thread_copy.Run( /* c_thread_desc */ c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, - /* c_start point */ make_tuple(I0, I0, I0, I0, I0, I0, I0), - /* c_buffer */ c_thread_buf, - /* c_grid_desc */ c_grid_desc_mblock_mrepeat_mwave_msubgroup_n_block_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* c_register_beginning*/ make_tuple(I0, I0, I0, I0, I0, I0, I0), + /* c_local(register) */ c_thread_buf, + /* c_grid_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs, /* c_grid_buf */ c_grid_buf); } // clang-format on diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 2254521b1f3..f3d2787c03d 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -72,12 +72,14 @@ enum struct WmmaInstr template :: = false> -struct wmma_type; + typename = void> +struct wmma_type{}; // A-swizzled template -struct wmma_type +struct wmma_type> { // Absolute fixing property // * Data Pixel @@ -172,11 +174,7 @@ struct WmmaSelector static_assert(selected_wmma.wave_size * selected_wmma.num_acc_vgprs_per_wave * selected_wmma.acc_data_size== selected_wmma.m_per_wmma * selected_wmma.n_per_wmma * 4, - "WRONG! Number of Accumulator Register"); - - static_assert(selected_wmma.lane_size * selected_wmma.num_srcregs_per_wmma * selected_wmma.src_data_size== - selected_wmma.m_per_wmma * selected_wmma.k_per_wmma * 4, - "WRONG! Number of Source Register"); + "WRONG! 
Invalid Number of Accumulator Register"); } }; @@ -206,25 +204,25 @@ struct WmmaGemm static_assert(KPack == wmma_instr.k_per_wmma, "KPack should be k_per_wmma"); } - // XDL output supporting C = A * B + // WMMA output supporting C = A * B // MPerWMMA_NPerWMMA -> MSubGroup_..._NPerWMMA_MAccVgprPerWave - template + template __host__ __device__ static constexpr auto - MakeCDesc_MRepeat_Mwave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs - (const CDesc_MRepeat_Mwave_MPerWMMA_NRepeat_NWave_NPerWMMA& c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma) + MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs + (const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) { - const auto MRepeat = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I0); - const auto NRepeat = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I3); - const auto MWave = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I1); - const auto NWave = c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma.GetLength(I4); + const auto MBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); + const auto NBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); + const auto MWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); + const auto NWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); return transform_tensor_descriptor( - c_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma, - make_tuple(make_pass_through_transform(MRepeat), - make_pass_through_transform(Mwave), + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma, + make_tuple(make_pass_through_transform(MBlockxRepeat), + make_pass_through_transform(MWave), make_unmerge_transform(make_tuple(Number{}, Number{})), - 
make_pass_through_transform(NRepeat), + make_pass_through_transform(NBlockxRepeat), make_pass_through_transform(NWave), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, @@ -266,12 +264,12 @@ struct WmmaGemm if constexpr(!TransposeC) { wmma_instr.template run( - p_a_wave[0], p_b_wave[0], p_c_thread); + p_a_wave, p_b_wave, p_c_thread); } else { wmma_instr.template run( - p_b_wave[0], p_a_wave[0], p_c_thread); + p_b_wave, p_a_wave, p_c_thread); } } @@ -318,7 +316,7 @@ struct WmmaGemm __host__ __device__ static constexpr auto GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths() { return make_tuple( - Number{}); + I1, I1, Number{}); } }; From 0cd587d9e593d66e48c6917af78570dcf58c9d5c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 1 Dec 2022 02:47:23 +0000 Subject: [PATCH 010/118] workaround for incorrect HIP warpSize return value --- .../gpu/grid/gridwise_gemm_wmma.hpp | 2 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 4 ++-- include/ck/utility/common_header.hpp | 23 +++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 765b0643f48..4511a1d9779 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -315,7 +315,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma using DefaultBlock2CTileMap = remove_cvref_t; - template + template __device__ static void Run(const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index f3d2787c03d..08ecc71dd11 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -158,8 +158,8 @@ struct WmmaSelector return WmmaInstr::wmma_i32_16x16x16_iu4; } #endif - - static constexpr auto 
selected_wmma = wmma_type(), get_warp_size()>{}; + // get_warp_size do not return the correct wavesize, hardcode to 32 as workaround + static constexpr auto selected_wmma = wmma_type(), Number<32>{}>{}; __host__ __device__ constexpr WmmaSelector() { diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 1378bbe448e..1911a3cbe80 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -49,3 +49,26 @@ #ifdef CK_USE_AMD_MFMA #include "ck/utility/amd_xdlops.hpp" #endif + +#include + +template +constexpr auto type_name() { + std::string_view name, prefix, suffix; +#ifdef __clang__ + name = __PRETTY_FUNCTION__; + prefix = "auto type_name() [T = "; + suffix = "]"; +#elif defined(__GNUC__) + name = __PRETTY_FUNCTION__; + prefix = "constexpr auto type_name() [with T = "; + suffix = "]"; +#elif defined(_MSC_VER) + name = __FUNCSIG__; + prefix = "auto __cdecl type_name<"; + suffix = ">(void)"; +#endif + name.remove_prefix(prefix.size()); + name.remove_suffix(suffix.size()); + return name; +} From 43a209976ace2c28ed26d306a7ab7bdfbf0db9fc Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 2 Dec 2022 02:20:18 +0000 Subject: [PATCH 011/118] debugging --- example/01_gemm/run_gemm_example.inc | 7 ++----- .../ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp | 1 + .../tensor_operation/gpu/device/impl/device_gemm_wmma.hpp | 2 -- .../ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 2 +- .../gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp | 1 + include/ck/utility/common_header.hpp | 8 ++++++++ 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 10b9917376a..3927ef494fc 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -32,10 +32,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) { case 0: break; case 1: - 
ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k.begin(), - a_m_k.end()); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n.begin(), - b_k_n.end()); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k.begin(), a_m_k.end()); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n.begin(), b_k_n.end()); break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k.begin(), a_m_k.end()); @@ -102,7 +100,6 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) return true; } - float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); std::size_t flop = 2_uz * M * N * K; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index c5b574b75c5..b8fcc9c27d5 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -226,6 +226,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); + // debug_hexprinter(0x3c003c00, a_thread_vec.template AsType()(Number<0>{})); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 849f024740d..f032423c87a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -359,8 +359,6 @@ struct DeviceGemmWmma : public DeviceGemm, true>; // Last Option is W/O - std::cout<<"Host kernel type is "<< type_name()<()).c_str()); constexpr auto max_lds_align = K1; constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); constexpr auto 
b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); @@ -457,7 +458,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // gridwise GEMM pipeline const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); - GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, a_block_desc_k0perblock_mperblock_k1, a_blockwise_copy, diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index bb28c194f4b..f7399d343af 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -208,6 +208,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; // apply SrcElementwiseOperation on src_vector_container + debug_hexprinter(0xffffffff, src_coord_.GetOffset()); static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { SrcData src_v; diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 1911a3cbe80..81bb19f569a 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -72,3 +72,11 @@ constexpr auto type_name() { name.remove_suffix(suffix.size()); return name; } + +template +__device__ +void debug_hexprinter(const uint32_t v_target, T v_val){ + const uint32_t v_dbg = *(reinterpret_cast(&v_val)); + if(v_dbg != v_target) + printf("@Thread: %d, Val: %08x != Target: %08x\n", ck::get_thread_local_1d_id(), v_dbg, v_target); +} From 73959956a78c856316a93daff09a2e7651efcd2c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 5 Dec 2022 02:23:45 +0000 Subject: [PATCH 012/118] tempsave --- .../ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 7 +++++-- .../gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 3c3c5ee1194..5b1b3b00679 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -356,11 +356,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /*******************************************************************************/ // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - printf("A_GRID_DESC: %s \n", std::string(type_name()).c_str()); + // printf("K0 = %d, M = %d, K1 = %d\n", K0, a_grid_desc_k0_m_k1.GetLength(I1), (a_grid_desc_k0_m_k1.GetLength(I2))()); constexpr auto max_lds_align = K1; constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - + printf("blockdesc: K0 = %d, M = %d, K1 = %d\n", (a_block_desc_k0perblock_mperblock_k1.GetLength(I0))(), + (a_block_desc_k0perblock_mperblock_k1.GetLength(I1))(), (a_block_desc_k0perblock_mperblock_k1.GetLength(I2))()); // A matrix blockwise copy auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, @@ -390,6 +391,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma a_block_desc_k0perblock_mperblock_k1, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); + printf("BlockSliceLengths K0 = %d, M = %d, K1 = %d\n", K0PerBlock, MPerBlock, K1()); + // printf("a_block_wise_copy: %s\n", std::string(type_name()).c_str()); // B matrix blockwise copy auto b_blockwise_copy = diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index f7399d343af..d47d4d0e569 100644 --- 
a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -96,6 +96,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_element_op_(src_element_op), dst_element_op_(dst_element_op) { + printf("global desc: %s\n", __PRETTY_FUNCTION__); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -127,11 +128,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1 detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - + printf("src_access_lengths: %d, %d, %d\n", (src_access_lengths[Number<0>{}])(), src_access_lengths[Number<1>{}](), src_access_lengths[Number<2>{}]()); constexpr auto src_dim_access_order = SrcDimAccessOrder{}; constexpr auto ordered_src_access_lengths = container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + printf("ordered_src_access_lengths: %d, %d, %d\n", (ordered_src_access_lengths[Number<0>{}])(), ordered_src_access_lengths[Number<1>{}](), ordered_src_access_lengths[Number<2>{}]()); // make forward steps const auto src_forward_steps = generate_tuple( @@ -145,6 +147,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1 return make_tensor_coordinate_step(src_desc, forward_step_idx); }, Number{}); + printf("src_forward_steps: %d, %d, %d\n", (src_forward_steps.GetIndexDiff()[Number<0>{}])(), + (src_forward_steps.GetIndexDiff()[Number<1>{}])(), + (src_forward_steps.GetIndexDiff()[Number<2>{}])() ); // make backward steps const auto src_backward_steps = generate_tuple( From 9bd44685e46d9857d254d2f79f3489d1d3e8dc81 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 9 Dec 2022 08:33:08 +0000 Subject: [PATCH 013/118] Correctness OK, waiting for optimization --- example/01_gemm/gemm_wmma_fp16.cpp | 11 +- example/01_gemm/run_gemm_example.inc | 7 +- .../gpu/block/blockwise_gemm_wmma.hpp | 136 ++++++- 
.../gpu/device/impl/device_gemm_wmma.hpp | 80 ++-- .../gpu/grid/gridwise_gemm_wmma.hpp | 372 ++++++++++++++++-- .../threadwise_tensor_slice_transfer.hpp | 30 +- .../threadwise_tensor_slice_transfer_v3r1.hpp | 11 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 35 ++ include/ck/utility/common_header.hpp | 26 +- .../include/ck/library/utility/check_err.hpp | 12 +- library/include/ck/library/utility/fill.hpp | 18 + 11 files changed, 637 insertions(+), 101 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 774207c3e54..7d8ae1e9bbc 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -22,14 +22,21 @@ using CElementOp = PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // clang-format off -using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma +// using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmWmma // ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MWMMA|NMMMA| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| // ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| // ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| // ######| | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | - < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, 6, 1>; + // < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, 6, 1>; // clang-format on +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle +// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MWmma|NWmma| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave| _MBlock_MWaveMPerWmma| ScalarPerVector| +// ######| | | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, 
AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, 1, 1, S<1, 32, 1, 8>, 8>; + using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 3927ef494fc..c3d6f605c8e 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -32,8 +32,11 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) { case 0: break; case 1: - ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k.begin(), a_m_k.end()); - ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n.begin(), b_k_n.end()); + // CONFIRMED + // ck::utils::FillMNID{}(a_m_k.begin(), a_m_k.end()); + // ck::utils::FillMNID{}(b_k_n.begin(), b_k_n.end()); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k.begin(), a_m_k.end()); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n.begin(), b_k_n.end()); break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k.begin(), a_m_k.end()); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index b8fcc9c27d5..001de16d902 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -137,7 +137,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } - // Thread level, register decriptor. + // Thread level, register decriptor. 
Vector-write __host__ __device__ static constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() { constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); @@ -168,6 +168,51 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); } + // Thread level, register decriptor. Per-pixel write + __host__ __device__ static constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave |NThreadPerSubGroup + make_tuple(Number{}, I1, MSubGroup, MAccVgprs, Number{}, I1, NThreadPerSubGroup)); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, 
Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // Provide dimension size + __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() { return transform_tensor_descriptor( @@ -205,8 +250,28 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); - constexpr auto RepeatDiff = MRepeat - NRepeat; - + // constexpr auto RepeatDiff = MRepeat - NRepeat; + + // debug_hexprinter(0xffffffff, a_thread_buf[Number{}], "Avalue "); + /* First local prefetch, move out of blockwise operation. 
+ static_for<0, NRepeat, 1>{}([&](auto iN){ + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(I0, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); + }); + static_for<0, MRepeat, 1>{}([&](auto iN){ + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(I0, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); + }); + */ + /* static_for<0, KPerBlock, WmmaK>{}([&](auto iWmmaK){ // Cut to Repeat Retangle to Square, assume MRepeat > NRepeat static_for<0, RepeatDiff, 1>{}([&](auto iCut){ @@ -297,16 +362,77 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 b_thread_buf); }); }); + */ + + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0, I0), + b_thread_buf); + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = a_thread_buf + [Number{}]; + b_thread_vec.template AsType()(i) = b_thread_buf + [Number{}]; + }); + + using wmma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + + // static_for<0, 16, 1>{}([&](auto i){ + // char info[4]; + // info[0] = 'A'; + // info[1] = i/10 + '0'; + // info[2] = i%10 + 
'0'; + // info[3] = '\0'; + // debug_hexprinter(0xffffffff, a_thread_buf[Number{}], info); + // }); + + // static_for<0, 16, 1>{}([&](auto i){ + // char info[4]; + // info[0] = 'B'; + // info[1] = i/10 + '0'; + // info[2] = i%10 + '0'; + // info[3] = '\0'; + // debug_hexprinter(0xffffffff, b_thread_buf[Number{}], info); + // }); } protected: // A[M0, M1, M2, K0 = WmmaK] static constexpr auto a_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{}, I1, I1, Number{})); + make_naive_tensor_descriptor_packed(make_tuple(Number{}, I1, I1, I1, Number{})); // B[N0, N1, N2, K0 = WmmaK] static constexpr auto b_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{}, I1, I1, Number{})); + make_naive_tensor_descriptor_packed(make_tuple(Number{}, I1, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index f032423c87a..9e572cf1dc7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -20,13 +20,14 @@ namespace ck { namespace tensor_operation { namespace device { -template -struct DeviceGemmWmma : public DeviceGemm +struct DeviceGemmWmma_CShuffle : public DeviceGemm { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -200,6 +203,7 @@ struct DeviceGemmWmma : public DeviceGemm, // CThreadTransferSrcDstAccessOrder, - CThreadTransferSrcDstVectorDim, - CThreadTransferDstScalarPerVector, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer>; @@ -262,7 +267,7 @@ struct DeviceGemmWmma : public DeviceGemm, - remove_reference_t, - 
remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, @@ -369,7 +374,7 @@ struct DeviceGemmWmma : public DeviceGemm, - remove_reference_t, - remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, @@ -400,7 +405,7 @@ struct DeviceGemmWmma : public DeviceGemm || is_same_v || - is_same_v)) + if constexpr(!(is_same_v || is_same_v)) { return false; } @@ -530,7 +534,7 @@ struct DeviceGemmWmma : public DeviceGemm @@ -179,6 +182,23 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return b_block_desc_k0perblock_nperblock_k1; } + __host__ __device__ static constexpr auto + // *Caution Here repeat is shuffle repeat + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + { + constexpr index_t MWave = MPerBlock / (MRepeat * MPerWmma); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerWmma); + + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; + } + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment @@ -248,6 +268,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } + // Vector write __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( const CGridDesc_M_N& c_grid_desc_m_n) @@ -301,6 +322,79 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_m_n); } + // Per pixel + __host__ __device__ static constexpr auto + 
MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + const CGridDesc_M_N& c_grid_desc_m_n) + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + constexpr auto WmmaK = 16; + constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + + using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3; + + return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(c_grid_desc_m_n); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + // 
return block_id to C matrix tile idx (m0, n0) mapping __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) @@ -308,10 +402,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return BlockToCTileMap_M00_N0_M01Adapt( c_grid_desc_m_n); } - using CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs = - remove_cvref_t; + // using CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup = remove_cvref_t; + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = + remove_cvref_t; using DefaultBlock2CTileMap = remove_cvref_t; @@ -323,8 +418,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma void* __restrict__ p_shared, const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, - const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs& - c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + // const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup& + // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, const AElementwiseOperation& a_element_op, const BElementwiseOperation& b_element_op, const CElementwiseOperation& c_element_op, @@ -338,15 +435,15 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs.GetElementSpaceSize()); + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); 
/*******************************************************************************/ // BlockIdx.x -> [BlockId.m, BlockId.n] const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); if(!block_2_ctile_map.ValidCTileIndex( block_work_idx, - make_tuple(c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I0), - c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4)))) + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) { return; } // Store BlockId into SGPR @@ -360,8 +457,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto max_lds_align = K1; constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - printf("blockdesc: K0 = %d, M = %d, K1 = %d\n", (a_block_desc_k0perblock_mperblock_k1.GetLength(I0))(), - (a_block_desc_k0perblock_mperblock_k1.GetLength(I1))(), (a_block_desc_k0perblock_mperblock_k1.GetLength(I2))()); + // printf("blockdesc: K0 = %d, M = %d, K1 = %d\n", (a_block_desc_k0perblock_mperblock_k1.GetLength(I0))(), + // (a_block_desc_k0perblock_mperblock_k1.GetLength(I1))(), (a_block_desc_k0perblock_mperblock_k1.GetLength(I2))()); // A matrix blockwise copy auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, @@ -391,7 +488,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma a_block_desc_k0perblock_mperblock_k1, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); - printf("BlockSliceLengths K0 = %d, M = %d, K1 = %d\n", K0PerBlock, MPerBlock, K1()); + // printf("BlockSliceLengths K0 = %d, M = %d, K1 = %d\n", K0PerBlock, MPerBlock, K1()); // printf("a_block_wise_copy: %s\n", std::string(type_name()).c_str()); // B matrix blockwise copy @@ 
-477,21 +574,38 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma c_thread_buf, K0BlockMainLoop); /*******************************************************************************/ +#ifdef CK_EXPERIMENTAL_ARBITRARY_WRITEOUT // write out C matrix, c shuffle not implemented { + static_for<0, 16, 1>{}([&](auto i){ + char info[4]; + info[0] = 'C'; + info[1] = i/10 + '0'; + info[2] = i%10 + '0'; + info[3] = '\0'; + debug_hexprinter(0xffffffff, c_thread_buf[Number{}], info); + }); + constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - constexpr auto MWave = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I1); - constexpr auto MSubGroup = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I2); - constexpr auto Nwave = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4); - constexpr auto NThreadPerSubGroup = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I5); - constexpr auto MAccVgprs = c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I6); - + // This API Provide All dimension (size) you need + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I1); + constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I2); + constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4); + constexpr auto NThreadPerSubGroup = 
c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I5); + constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I6); + // printf("MWave = %d, MSubGroup = %d, NWave = %d, NThreadPerSubGroup = %d, MAccVgprs = %d\n", MWave, MSubGroup, NWave, NThreadPerSubGroup, MAccVgprs); // Mapping const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); const index_t m_thread_data_on_grid = m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; const index_t n_thread_data_on_grid = n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + // Checked + // debug_hexprinter(0xffffffff, m_thread_data_on_grid, "c_m"); + // debug_hexprinter(0xffffffff, n_thread_data_on_grid, "c_n"); const auto m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = make_single_stage_tensor_adaptor( @@ -501,25 +615,31 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma const auto n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(NRepeat, Nwave, NThreadPerSubGroup))), + make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), make_tuple(Sequence<0, 1, 2>{}), make_tuple(Sequence<0>{})); const auto m_thread_data_on_grid_idx = m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( make_multi_index(m_thread_data_on_grid)); + debug_hexprinter(0x4, MRepeat, "mblockxrepeat"); + debug_hexprinter(0x2, MWave, "mwave"); + debug_hexprinter(0x2, MSubGroup, "msubgroup"); + debug_hexprinter(0x8, MAccVgprs, "maccvgprs"); + debug_hexprinter(0x4, NWave, "nwave"); const auto n_thread_data_on_grid_idx = n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( make_multi_index(n_thread_data_on_grid)); + // printf("write out dimension access order = (%d, %d, %d, %d, %d, %d, %d)\n", 
CThreadTransferSrcDstAccessOrder{}[Number<0>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<1>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<2>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<3>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<4>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<5>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<6>{}].value); auto c_thread_copy = ThreadwiseTensorSliceTransfer_v1r3< /* typename SrcData */ FloatAcc, /* typename DstData */ FloatC, /* typename SrcDesc */ decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs), - /* typename DstDesc */ decltype(c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs), + /* typename DstDesc */ decltype(c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup), /* typename ElementwiseOperation */ CElementwiseOperation, - // Thread register Mapping + // Thread register Mapping 0 1 2 4 5 6 3 /* typename SliceLengths */ Sequence, /* typename DimAccessOrder */ CThreadTransferSrcDstAccessOrder, /* index_t DstVectorDim */ CThreadTransferSrcDstVectorDim, @@ -528,14 +648,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /* index_t DstScalarStrideInVector */ 1, /* bool DstResetCoordinateAfterRun */ true> { - /* dst_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* dst_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, /* dst_slice_origin_idx */ make_multi_index(m_thread_data_on_grid_idx[I0], m_thread_data_on_grid_idx[I1], m_thread_data_on_grid_idx[I2], + m_thread_data_on_grid_idx[I3], n_thread_data_on_grid_idx[I0], n_thread_data_on_grid_idx[I1], - n_thread_data_on_grid_idx[I2], - m_thread_data_on_grid_idx[I3]), + n_thread_data_on_grid_idx[I2]), /* element_op */ c_element_op }; @@ -543,9 +663,193 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /* c_thread_desc */ 
c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, /* c_register_beginning*/ make_tuple(I0, I0, I0, I0, I0, I0, I0), /* c_local(register) */ c_thread_buf, - /* c_grid_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_nblockxrepeat_nwave_nthreadpersubgroup_maccvgprs, + /* c_grid_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, /* c_grid_buf */ c_grid_buf); } +#endif + { + // write out to C, implement shuffle + constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + // This API Provide All dimension (size) you need + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = + blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); + constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); + constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I4); + constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I5); + constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I6); + + // LDS descriptor, shuffle and write out in MRepeat x NRepeat times + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + 
c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); + + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // MRepeat per shuffle repeat + MWave, // MWave + MSubGroup, // MSubGroup * MAccVgprs = MPerWmma + MAccVgprs)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // NRepeat per shuffle repeat + NWave, // NWave + NThreadPerSubGroup))), // NThreadPerSubGroup = NPerWmma + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0, 1, 2, 6>{}, Sequence<>{}, Sequence<3, 4, 5>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_idx = 
n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + 1, // vector write pixel + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + make_multi_index(0, + m_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 0, + n_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for local reg 
& global memory + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + // CONFIRMED + // printf("c_global_step = (%d, %d, %d, %d)\n", + // c_global_step[Number<0>{}], + // c_global_step[Number<1>{}], + // c_global_step[Number<2>{}], + // c_global_step[Number<3>{}]); + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } // clang-format on } }; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 
b0f453b025f..84800da0c93 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -119,7 +119,29 @@ struct ThreadwiseTensorSliceTransfer_v1r3 using SpaceFillingCurve = SpaceFillingCurve>; - + // printf("SpaceFillingCurve access_lengths = (%d, %d, %d, %d, %d, %d, %d)\n", SpaceFillingCurve::access_lengths[Number<0>{}].value, + // SpaceFillingCurve::access_lengths[Number<1>{}].value, + // SpaceFillingCurve::access_lengths[Number<2>{}].value, + // SpaceFillingCurve::access_lengths[Number<3>{}].value, + // SpaceFillingCurve::access_lengths[Number<4>{}].value, + // SpaceFillingCurve::access_lengths[Number<5>{}].value, + // SpaceFillingCurve::access_lengths[Number<6>{}].value); +// + // // printf("SpaceFillingCurve dim_access_order = (%d, %d, %d, %d, %d, %d, %d)\n", SpaceFillingCurve::dim_access_order[Number<0>{}].value, + // SpaceFillingCurve::dim_access_order[Number<1>{}].value, + // SpaceFillingCurve::dim_access_order[Number<2>{}].value, + // SpaceFillingCurve::dim_access_order[Number<3>{}].value, + // SpaceFillingCurve::dim_access_order[Number<4>{}].value, + // SpaceFillingCurve::dim_access_order[Number<5>{}].value, + // SpaceFillingCurve::dim_access_order[Number<6>{}].value); +// + // // // printf("SpaceFillingCurve ordered_access_lengths = (%d, %d, %d, %d, %d, %d, %d)\n", SpaceFillingCurve::ordered_access_lengths[Number<0>{}].value, + // SpaceFillingCurve::ordered_access_lengths[Number<1>{}].value, + // SpaceFillingCurve::ordered_access_lengths[Number<2>{}].value, + // SpaceFillingCurve::ordered_access_lengths[Number<3>{}].value, + // SpaceFillingCurve::ordered_access_lengths[Number<4>{}].value, + // SpaceFillingCurve::ordered_access_lengths[Number<5>{}].value, + // SpaceFillingCurve::ordered_access_lengths[Number<6>{}].value); // TODO: Use SpaceFillingCurve::ScalarsPerAccess instread of DstScalarPerVector? 
static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); @@ -136,7 +158,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 static_for<0, DstScalarPerVector, 1>{}([&](auto i) { constexpr index_t src_offset = src_desc.CalculateOffset( src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); - + // debug_hexprinter(0xffffffff, src_offset, "src_coord_iteration"); SrcData v; // apply element-wise operation @@ -154,11 +176,11 @@ struct ThreadwiseTensorSliceTransfer_v1r3 dst_coord_.GetOffset(), is_dst_valid, dst_vector.template AsType()[Number<0>{}]); - + // debug_hexprinter(0xffffffff, dst_coord_.GetOffset(), "dst_coord_iteration"); if constexpr(idx_1d.value != num_access - 1) { constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - + // printf("move forward = (%d, %d, %d, %d, %d, %d, %d)\n", forward_step[Number<0>{}], forward_step[Number<1>{}], forward_step[Number<2>{}], forward_step[Number<3>{}], forward_step[Number<4>{}], forward_step[Number<5>{}], forward_step[Number<6>{}]); move_tensor_coordinate( dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index d47d4d0e569..1cfaaf09378 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -96,7 +96,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_element_op_(src_element_op), dst_element_op_(dst_element_op) { - printf("global desc: %s\n", __PRETTY_FUNCTION__); + // printf("global desc: %s\n", __PRETTY_FUNCTION__); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -128,12 +128,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1 
detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - printf("src_access_lengths: %d, %d, %d\n", (src_access_lengths[Number<0>{}])(), src_access_lengths[Number<1>{}](), src_access_lengths[Number<2>{}]()); + // printf("src_access_lengths: %d, %d, %d\n", (src_access_lengths[Number<0>{}])(), src_access_lengths[Number<1>{}](), src_access_lengths[Number<2>{}]()); constexpr auto src_dim_access_order = SrcDimAccessOrder{}; constexpr auto ordered_src_access_lengths = container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - printf("ordered_src_access_lengths: %d, %d, %d\n", (ordered_src_access_lengths[Number<0>{}])(), ordered_src_access_lengths[Number<1>{}](), ordered_src_access_lengths[Number<2>{}]()); + // printf("ordered_src_access_lengths: %d, %d, %d\n", (ordered_src_access_lengths[Number<0>{}])(), ordered_src_access_lengths[Number<1>{}](), ordered_src_access_lengths[Number<2>{}]()); // make forward steps const auto src_forward_steps = generate_tuple( @@ -147,9 +147,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1 return make_tensor_coordinate_step(src_desc, forward_step_idx); }, Number{}); - printf("src_forward_steps: %d, %d, %d\n", (src_forward_steps.GetIndexDiff()[Number<0>{}])(), - (src_forward_steps.GetIndexDiff()[Number<1>{}])(), - (src_forward_steps.GetIndexDiff()[Number<2>{}])() ); // make backward steps const auto src_backward_steps = generate_tuple( @@ -213,7 +210,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; // apply SrcElementwiseOperation on src_vector_container - debug_hexprinter(0xffffffff, src_coord_.GetOffset()); + // debug_hexprinter(0xffffffff, src_coord_.GetOffset()); static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { SrcData src_v; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 08ecc71dd11..3667c5f7370 100644 --- 
a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -205,6 +205,7 @@ struct WmmaGemm } // WMMA output supporting C = A * B + // Vector Write // MPerWMMA_NPerWMMA -> MSubGroup_..._NPerWMMA_MAccVgprPerWave template __host__ __device__ static constexpr auto @@ -239,6 +240,40 @@ struct WmmaGemm Sequence<5>{})); } + // Per-Pixel write + template + __host__ __device__ static constexpr auto + MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup + (const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) + { + const auto MBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); + const auto NBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); + const auto MWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); + const auto NWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); + + return transform_tensor_descriptor( + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma, + make_tuple(make_pass_through_transform(MBlockxRepeat), + make_pass_through_transform(MWave), + make_unmerge_transform(make_tuple(Number{}, + Number{})), + make_pass_through_transform(NBlockxRepeat), + make_pass_through_transform(NWave), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2, 3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{})); + } + __device__ static constexpr index_t GetRegSizePerWmma() { return wmma_instr.num_acc_vgprs_per_wave; diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 81bb19f569a..f85ab7e76c6 100644 --- a/include/ck/utility/common_header.hpp 
+++ b/include/ck/utility/common_header.hpp @@ -73,10 +73,26 @@ constexpr auto type_name() { return name; } +// Accepet int, float, and Number<> as input template -__device__ -void debug_hexprinter(const uint32_t v_target, T v_val){ - const uint32_t v_dbg = *(reinterpret_cast(&v_val)); - if(v_dbg != v_target) - printf("@Thread: %d, Val: %08x != Target: %08x\n", ck::get_thread_local_1d_id(), v_dbg, v_target); +__host__ __device__ +void debug_hexprinter(const uint32_t v_target, const T v_val, const char* info){ + if constexpr(std::is_same_v || std::is_same_v ) + { + const uint32_t v_dbg = *(reinterpret_cast(&v_val)); + if(v_dbg != v_target) + printf("%s@Thread: %d, Val: %08x != Target: %08x\n", info, ck::get_thread_local_1d_id(), v_dbg, v_target); + } + else if constexpr(std::is_same_v) + { + const uint16_t v_dbg = *(reinterpret_cast(&v_val)); + if(v_dbg != v_target) + printf("%s@Thread: %d, Val: %04x != Target: %08x\n", info, ck::get_thread_local_1d_id(), v_dbg, v_target); + } + else + { + const uint32_t v_dbg = *(reinterpret_cast(&(v_val.value))); + if(v_dbg != v_target) + printf("%s@Thread: %d, Val: %08x != Target: %08x\n", info, ck::get_thread_local_1d_id(), v_dbg, v_target); + } } diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index 3a5cd1da760..cbb53bd644d 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -49,7 +49,7 @@ check_err(const std::vector& out, { max_err = err > max_err ? 
err : max_err; err_count++; - if(err_count < 5) + if(err_count < 16384) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << out[i] << " != " << ref[i] << std::endl; @@ -59,6 +59,7 @@ check_err(const std::vector& out, } if(!res) { + std::cerr << "err count: " << err_count << std::endl; std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; @@ -93,7 +94,7 @@ check_err(const std::vector& out, { max_err = err > max_err ? err : max_err; err_count++; - if(err_count < 5) + if(err_count < 16384) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; @@ -103,6 +104,7 @@ check_err(const std::vector& out, } if(!res) { + std::cerr << "err count: " << err_count << std::endl; std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; @@ -136,7 +138,7 @@ check_err(span out, { max_err = err > max_err ? err : max_err; err_count++; - if(err_count < 5) + if(err_count < 16384) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; @@ -146,6 +148,7 @@ check_err(span out, } if(!res) { + std::cerr << "err count: " << err_count << std::endl; std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; @@ -196,7 +199,7 @@ check_err(const std::vector& out, { max_err = err > max_err ? 
err : max_err; err_count++; - if(err_count < 5) + if(err_count < 16384) { std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; @@ -206,6 +209,7 @@ check_err(const std::vector& out, } if(!res) { + std::cerr << "err count: " << err_count << std::endl; std::cerr << "max err: " << max_err << std::endl; } return res; diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp index d717738dc45..b01c3d1b4df 100644 --- a/library/include/ck/library/utility/fill.hpp +++ b/library/include/ck/library/utility/fill.hpp @@ -103,5 +103,23 @@ struct FillConstant } }; +template +struct FillMNID +{ + T step_{0.1}; + int k_num_{32}; + int mn_num_{128}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::generate(first, last, [=, iter = 0]() mutable { + auto tmp = ((iter/k_num_) % mn_num_ ) * step_; + iter ++; + return tmp; + }); + } +}; + } // namespace utils } // namespace ck From 0a8087248b09e88cb3799b88cce10fd4c5c9a7da Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 9 Dec 2022 09:49:43 +0000 Subject: [PATCH 014/118] Tidy up + format --- .../gpu/block/blockwise_gemm_wmma.hpp | 251 +++++----- .../gpu/grid/gridwise_gemm_wmma.hpp | 214 ++++----- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 431 ++++++++++++------ include/ck/utility/amd_wmma.hpp | 106 ++++- 4 files changed, 634 insertions(+), 368 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 001de16d902..5d452d744be 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -23,23 +23,26 @@ template {}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; + static constexpr auto I0 = Number<0>{}; + static 
constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; static constexpr auto WmmaK = Number<16>{}; using ThisThreadBlock = ThisThreadBlock; + // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. static constexpr index_t WaveSize = 32; static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); - static constexpr index_t KPerBlock = BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); + static constexpr index_t KPerBlock = + BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); @@ -48,8 +51,6 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static constexpr auto wmma_gemm = WmmaGemm{}; - static constexpr index_t KPerThread = KPerBlock / wmma_gemm.K0PerWMMA; - static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -81,8 +82,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 const auto waveId_m = wave_idx[I0]; const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); - // |KRepeat |MRepeat|MWave |MLane |KPack - return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); + // |KRepeat |MRepeat|MWave |MLane |KPack + return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); } __device__ static auto CalculateBThreadOriginDataIndex() @@ -92,13 +93,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 const auto waveId_n = wave_idx[I1]; const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); - // |KRepeat |NRepeat|Nwave |NLane |KPack - return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); + // |KRepeat |NRepeat|Nwave |NLane |KPack + return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); } template - 
__device__ static auto - CalculateCThreadOriginDataIndex(Number, Number) + __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) { const auto wave_idx = GetWaveIdx(); @@ -125,7 +125,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 return make_tuple(c_thread_m, c_thread_n); } - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3() + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle() { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && BK0NK1BlockDesc::IsKnownAtCompileTime(), @@ -134,73 +134,103 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); - static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && + NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } // Thread level, register decriptor. 
Vector-write - __host__ __device__ static constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, I1, MSubGroup, Number{}, I1, NThreadPerSubGroup, MAccVgprs)); + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + MSubGroup, + Number{}, + I1, + NThreadPerSubGroup, + MAccVgprs)); } template __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(const CGridDesc_M_N& c_grid_desc_m_n) + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const CGridDesc_M_N& c_grid_desc_m_n) { const auto M = c_grid_desc_m_n.GetLength(I0); const auto N = c_grid_desc_m_n.GetLength(I1); - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = 
transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); } // Thread level, register decriptor. 
Per-pixel write - __host__ __device__ static constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave |NThreadPerSubGroup - make_tuple(Number{}, I1, MSubGroup, MAccVgprs, Number{}, I1, NThreadPerSubGroup)); + // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave + // |NThreadPerSubGroup + make_tuple(Number{}, + I1, + MSubGroup, + MAccVgprs, + Number{}, + I1, + NThreadPerSubGroup)); } template __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(const CGridDesc_M_N& c_grid_desc_m_n) + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + const CGridDesc_M_N& c_grid_desc_m_n) { const auto M = c_grid_desc_m_n.GetLength(I0); const auto N = c_grid_desc_m_n.GetLength(I1); - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = 
transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); } // Provide dimension size - __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() { constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = make_naive_tensor_descriptor_packed(make_tuple(Number{}, @@ -210,17 +240,19 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 Number{}, Number{})); - return wmma_gemm.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); } __host__ __device__ 
static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() { return transform_tensor_descriptor( AK0MK1BlockDesc{}, - make_tuple( - make_pass_through_transform(Number{}), - make_unmerge_transform(make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } @@ -229,14 +261,15 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 { return transform_tensor_descriptor( BK0NK1BlockDesc{}, - make_tuple( - make_pass_through_transform(Number{}), - make_unmerge_transform(make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } + // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); @@ -252,7 +285,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 // constexpr auto RepeatDiff = MRepeat - NRepeat; - // debug_hexprinter(0xffffffff, a_thread_buf[Number{}], "Avalue "); + // debug_hexprinter(0xffffffff, a_thread_buf[Number{}], "Avalue "); /* First local prefetch, move out of blockwise operation. 
static_for<0, NRepeat, 1>{}([&](auto iN){ b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, @@ -291,18 +325,16 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - // debug_hexprinter(0x3c003c00, a_thread_vec.template AsType()(Number<0>{})); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), + // debug_hexprinter(0x3c003c00, a_thread_vec.template + AsType()(Number<0>{})); wmma_gemm.template Run( a_thread_vec.template + AsType()(Number<0>{}), b_thread_vec.template + AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, Number{}), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); + make_tuple(Number{}, Number{}, I0, I0, + Number{}), a_block_buf, a_thread_desc_, make_tuple(I0, Number{}, I0, I0, + I0), a_thread_buf); }); // Run FIFO fashion loopover in Square static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ @@ -328,8 +360,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 c_thread_buf.GetVectorTypeReference(Number{})); }); a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, Number{}), - a_block_buf, + make_tuple(Number{}, + Number{}, I0, I0, Number{}), a_block_buf, a_thread_desc_, make_tuple(I0, Number{}, I0, I0, I0), a_thread_buf); @@ -355,11 +387,9 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 c_thread_buf.GetVectorTypeReference(Number{})); }); b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, Number{}), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); + make_tuple(Number{}, Number{}, I0, + I0, Number{}), b_block_buf, b_thread_desc_, make_tuple(I0, + Number{}, I0, I0, I0), b_thread_buf); }); }); */ @@ -368,7 +398,7 @@ struct 
BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_for<0, MRepeat, 1>{}([&](auto m0) { // read A a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0), + make_tuple(Number{}, m0, I0, I0, I0), a_block_buf, a_thread_desc_, make_tuple(I0, I0, I0, I0, I0), @@ -377,7 +407,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 static_for<0, NRepeat, 1>{}([&](auto n0) { // read B b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0), + make_tuple(Number{}, n0, I0, I0, I0), b_block_buf, b_thread_desc_, make_tuple(I0, I0, I0, I0, I0), @@ -386,14 +416,15 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = a_thread_buf - [Number{}]; - b_thread_vec.template AsType()(i) = b_thread_buf - [Number{}]; + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; }); - using wmma_input_type = - typename vector_type::type; + using wmma_input_type = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -405,34 +436,16 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 }); }); }); - - // static_for<0, 16, 1>{}([&](auto i){ - // char info[4]; - // info[0] = 'A'; - // info[1] = i/10 + '0'; - // info[2] = i%10 + '0'; - // info[3] = '\0'; - // debug_hexprinter(0xffffffff, a_thread_buf[Number{}], info); - // }); - - // static_for<0, 16, 1>{}([&](auto i){ - // char info[4]; - // info[0] = 'B'; - // info[1] = i/10 + '0'; - // info[2] = i%10 + '0'; - // info[3] = '\0'; - // debug_hexprinter(0xffffffff, b_thread_buf[Number{}], info); - // }); } protected: // A[M0, M1, M2, K0 = WmmaK] - static constexpr auto a_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, I1, I1, I1, Number{})); + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + 
make_tuple(Number{}, I1, I1, I1, Number{})); // B[N0, N1, N2, K0 = WmmaK] - static constexpr auto b_thread_desc_ = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, I1, I1, I1, Number{})); + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( @@ -442,7 +455,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 FloatAB, decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), - Sequence, + Sequence, Sequence<3, 0, 1, 2, 4>, 4, A_K1, @@ -452,7 +465,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3 FloatAB, decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), - Sequence, + Sequence, Sequence<3, 0, 1, 2, 4>, 4, B_K1, @@ -473,20 +486,20 @@ template -constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_Selector() +constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_Selector() { if constexpr(LoopSched == LoopScheduler::Default) { - return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3{}; + return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle{}; } }; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index ac4648ca382..9b3bf5e272a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -38,8 +38,10 @@ __global__ void FloatC* __restrict__ p_c_grid, const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, - // const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + // const + // 
CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, @@ -49,18 +51,17 @@ __global__ void #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - GridwiseGemm::template Run( - p_a_grid, - p_b_grid, - p_c_grid, - p_shared, - a_grid_desc_k0_m_k1, - b_grid_desc_k0_n_k1, - c_grid_desc_mblock_mperblock_nblock_nperblock, - a_element_op, - b_element_op, - c_element_op, - block_2_ctile_map); + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); #else ignore = p_a_grid; ignore = p_b_grid; @@ -75,50 +76,49 @@ __global__ void #endif // end of if (defined(__gfx1100__)) } -template < - index_t BlockSize, - typename FloatAB, - typename FloatAcc, - typename FloatCShuffle, - typename FloatC, - InMemoryDataOperationEnum CGlobalMemoryDataOperation, - typename AGridDesc_K0_M_K1, - typename BGridDesc_K0_N_K1, - typename CGridDesc_M_N, - typename AElementwiseOperation, - typename BElementwiseOperation, - typename CElementwiseOperation, - index_t MPerBlock, - index_t NPerBlock, - index_t K0PerBlock, - index_t MPerWmma, - index_t NPerWmma, - index_t K1Value, - index_t MRepeat, - index_t NRepeat, - typename ABlockTransferThreadClusterLengths_K0_M_K1, - typename ABlockTransferThreadClusterArrangeOrder, - typename ABlockTransferSrcAccessOrder, - index_t ABlockTransferSrcVectorDim, - index_t ABlockTransferSrcScalarPerVector, - index_t ABlockTransferDstScalarPerVector_K1, - bool AThreadTransferSrcResetCoordinateAfterRun, - bool ABlockLdsExtraM, - typename BBlockTransferThreadClusterLengths_K0_N_K1, - typename 
BBlockTransferThreadClusterArrangeOrder, - typename BBlockTransferSrcAccessOrder, - index_t BBlockTransferSrcVectorDim, - index_t BBlockTransferSrcScalarPerVector, - index_t BBlockTransferDstScalarPerVector_K1, - bool BThreadTransferSrcResetCoordinateAfterRun, - bool BBlockLdsExtraN, - index_t CShuffleMRepeatPerShuffle, - index_t CShuffleNRepeatPerShuffle, - typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, - index_t CShuffleBlockTransferScalarPerVector_NPerBlock, - index_t NumGemmKPrefetchStage = 1, - LoopScheduler LoopSched = make_default_loop_scheduler(), - PipelineVersion PipelineVer = PipelineVersion::v1> +template struct GridwiseGemm_k0mk1_k0nk1_mn_wmma { static constexpr auto I0 = Number<0>{}; @@ -202,17 +202,19 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + constexpr auto a_block_desc_k0perblock_mperblock_k1 = + GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + constexpr auto b_block_desc_k0perblock_nperblock_k1 = + GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); constexpr auto max_lds_align = K1; - constexpr auto a_block_space_size_aligned = - math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); - constexpr auto b_block_space_size_aligned = - math::integer_least_multiple(b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); 
return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB); } @@ -308,18 +310,21 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); - using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3; - - return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs(c_grid_desc_m_n); + using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle< + BlockSize, + FloatAB, + FloatAcc, + decltype(a_block_desc_k0perblock_mperblock_k1), + decltype(b_block_desc_k0perblock_nperblock_k1), + MPerWmma, + NPerWmma, + MRepeat, + NRepeat, + KPack>; + + return BlockwiseGemm:: + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_grid_desc_m_n); } // Per pixel @@ -362,18 +367,21 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); - using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3; - - return BlockwiseGemm::MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup(c_grid_desc_m_n); + using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle< + BlockSize, + FloatAB, + FloatAcc, + decltype(a_block_desc_k0perblock_mperblock_k1), + decltype(b_block_desc_k0perblock_nperblock_k1), + MPerWmma, + NPerWmma, + MRepeat, + NRepeat, + KPack>; + + return BlockwiseGemm:: + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + c_grid_desc_m_n); } __host__ __device__ static constexpr auto @@ -402,11 +410,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return BlockToCTileMap_M00_N0_M01Adapt( c_grid_desc_m_n); } - // using CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup = remove_cvref_t; - using 
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = - remove_cvref_t; + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; using DefaultBlock2CTileMap = remove_cvref_t; @@ -419,15 +429,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& - c_grid_desc_mblock_mperblock_nblock_nperblock, - // const CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup& - // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, + c_grid_desc_mblock_mperblock_nblock_nperblock, + // const + // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup& + // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, const AElementwiseOperation& a_element_op, const BElementwiseOperation& b_element_op, const CElementwiseOperation& c_element_op, const Block2CTileMap& block_2_ctile_map) { -// clang-format off + // clang-format off /*******************************************************************************/ // Memory buffer zone. 
const auto a_grid_buf = make_dynamic_buffer( @@ -453,12 +464,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /*******************************************************************************/ // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - // printf("K0 = %d, M = %d, K1 = %d\n", K0, a_grid_desc_k0_m_k1.GetLength(I1), (a_grid_desc_k0_m_k1.GetLength(I2))()); constexpr auto max_lds_align = K1; constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - // printf("blockdesc: K0 = %d, M = %d, K1 = %d\n", (a_block_desc_k0perblock_mperblock_k1.GetLength(I0))(), - // (a_block_desc_k0perblock_mperblock_k1.GetLength(I1))(), (a_block_desc_k0perblock_mperblock_k1.GetLength(I2))()); // A matrix blockwise copy auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, @@ -532,7 +540,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3{}], - // c_global_step[Number<1>{}], - // c_global_step[Number<2>{}], - // c_global_step[Number<3>{}]); // move on C c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); } }); } - // clang-format on + // clang-format on } }; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 3667c5f7370..7b8887b3957 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -12,106 +12,273 @@ namespace ck { enum struct WmmaInstr { wmma_f32_16x16x16_f16 = 0, - wmma_f32_16x16x16_bf16 = 0, - wmma_f16_16x16x16_f16 = 0, - wmma_bf16_16x16x16_bf16 = 0, - wmma_i32_16x16x16_iu8 = 0, - wmma_i32_16x16x16_iu4 = 0 + 
wmma_f32_16x16x16_bf16, + wmma_f16_16x16x16_f16, + wmma_bf16_16x16x16_bf16, + wmma_i32_16x16x16_iu8, + wmma_i32_16x16x16_iu4 }; /* * WMMA Wave Tile Always MxNxK = 16x16x16 * WAVE32 - ----------------------------------- - |RC0| | | | | | | | | | | | | | | | SubGroup 0 - |RC1| | | | | | | | | | | | | | | | - |RC2| | | | | | | | | | | | | | | | - |RC3|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| - |RC4|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1| - |RC5|1|2|3|4|5|6|7|8|9|0|1|2|3|4|5| - |RC6| | | | | | | | | | | | | | | | - |RC7| | | | | | | | | | | | | | | | - ----------------------------------- - | | | | | | | | | | | | | | | | | SubGroup 1 - | | | | | | | | | | | | | | | | | - | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| - | 1 |1|1|1|2|2|2|2|2|2|2|2|2|2|3|3| - | 6 |7|8|9|0|1|2|3|4|5|6|7|8|9|0|1| - | | | | | | | | | | | | | | | | | - | | | | | | | | | | | | | | | | | - | | | | | | | | | | | | | | | | | - ----------------------------------- + ----------------------------------- + |RC0| | | | | | | | | | | | | | | | SubGroup 0 + |RC1| | | | | | | | | | | | | | | | + |RC2| | | | | | | | | | | | | | | | + |RC3|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| + |RC4|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1| + |RC5|1|2|3|4|5|6|7|8|9|0|1|2|3|4|5| + |RC6| | | | | | | | | | | | | | | | + |RC7| | | | | | | | | | | | | | | | + ----------------------------------- + | | | | | | | | | | | | | | | | | SubGroup 1 + | | | | | | | | | | | | | | | | | + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| + | 1 |1|1|1|2|2|2|2|2|2|2|2|2|2|3|3| + | 6 |7|8|9|0|1|2|3|4|5|6|7|8|9|0|1| + | | | | | | | | | | | | | | | | | + | | | | | | | | | | | | | | | | | + | | | | | | | | | | | | | | | | | + ----------------------------------- * WAVE64 - ----------------------------------- - |RC0|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 0 - |RC1|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1| - |RC2|1|2|3|4|5|6|7|8|9|0|1|2|3|4|5| - |RC3|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| - ----------------------------------- - | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 1 - | 1 |1|1|1|2|2|2|2|2|2|2|2|2|2|3|3| - | 6 
|7|8|9|0|1|2|3|4|5|6|7|8|9|0|1| - | | | | | | | | | | | | | | | | | - ----------------------------------- - | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 2 - | 3 |3|3|3|3|3|3|3|4|4|4|4|4|4|4|4| - | 2 |3|4|5|6|7|8|9|0|1|2|3|4|5|6|7| - | | | | | | | | | | | | | | | | | - ----------------------------------- - | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 3 - | 4 |4|5|5|5|5|5|5|5|5|5|5|6|6|6|6| - | 8 |9|0|1|2|3|4|5|6|7|8|9|0|1|2|3| - | | | | | | | | | | | | | | | | | - ----------------------------------- + ----------------------------------- + |RC0|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 0 + |RC1|0|0|0|0|0|0|0|0|0|1|1|1|1|1|1| + |RC2|1|2|3|4|5|6|7|8|9|0|1|2|3|4|5| + |RC3|T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| + ----------------------------------- + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 1 + | 1 |1|1|1|2|2|2|2|2|2|2|2|2|2|3|3| + | 6 |7|8|9|0|1|2|3|4|5|6|7|8|9|0|1| + | | | | | | | | | | | | | | | | | + ----------------------------------- + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 2 + | 3 |3|3|3|3|3|3|3|4|4|4|4|4|4|4|4| + | 2 |3|4|5|6|7|8|9|0|1|2|3|4|5|6|7| + | | | | | | | | | | | | | | | | | + ----------------------------------- + | T |T|T|T|T|T|T|T|T|T|T|T|T|T|T|T| SubGroup 3 + | 4 |4|5|5|5|5|5|5|5|5|5|5|6|6|6|6| + | 8 |9|0|1|2|3|4|5|6|7|8|9|0|1|2|3| + | | | | | | | | | | | | | | | | | + ----------------------------------- * RC = Register for storing accumalted result * T = Thread ID */ -template -struct wmma_type{}; +template +struct wmma_type +{ +}; // A-swizzled template -struct wmma_type> +struct wmma_type> { -// Absolute fixing property - // * Data Pixel + // Absolute fixing property + // * Data Pixel + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t src_a_data_size = 2; + static constexpr index_t src_b_data_size = 2; + static constexpr index_t acc_data_size = 4; + // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction + static 
constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + // * Fixed in Navi3x, Will be wave mode dependent on Navi4x + static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; + static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; + // * num_acc_vgprs_per_wave alone M direction + // * num_subgroups alone M direction + static constexpr index_t num_acc_vgprs_per_wave = + m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + if constexpr(wave_size == 32) + { + intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); + } + else if constexpr(wave_size == 64) + { + intrin_wmma_f32_16x16x16_f16_w64::Run(a, b, reg_c); + } + } +}; + +template +struct wmma_type> +{ + // Absolute fixing property static constexpr index_t m_per_wmma = 16; static constexpr index_t n_per_wmma = 16; static constexpr index_t k_per_wmma = 16; static constexpr index_t src_a_data_size = 2; static constexpr index_t src_b_data_size = 2; static constexpr index_t acc_data_size = 4; - // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction static constexpr index_t num_thread_per_subgroups = n_per_wmma; - -// Wave mode dependent propety + + // Wave mode dependent propety static constexpr index_t wave_size = Number{}; - // * Fixed in Navi3x, Will be wave mode dependent on Navi4x static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; - // * num_acc_vgprs_per_wave alone M direction - // * num_subgroups alone M direction - static constexpr index_t num_acc_vgprs_per_wave = m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; - static constexpr 
index_t num_subgroups = wave_size / num_thread_per_subgroups; + static constexpr index_t num_acc_vgprs_per_wave = + m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const { if constexpr(wave_size == 32) { - intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); + intrin_wmma_f32_16x16x16_bf16_w32::Run(a, b, reg_c); } else if constexpr(wave_size == 64) { - intrin_wmma_f32_16x16x16_f16_w64::Run(a, b, reg_c); + intrin_wmma_f32_16x16x16_bf16_w64::Run(a, b, reg_c); + } + } +}; + +#ifdef CK_UNPACKED_ACC_DESC_LOGIC +template +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t src_a_data_size = 2; + static constexpr index_t src_b_data_size = 2; + static constexpr index_t acc_data_size = 2; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; + static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; + static constexpr index_t num_acc_vgprs_per_wave = + m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + if constexpr(wave_size == 32) + { + intrin_wmma_f16_16x16x16_f16_w32::Run(a, b, reg_c); + } + else if constexpr(wave_size == 64) + { + intrin_wmma_f16_16x16x16_f16_w64::Run(a, b, reg_c); + } + } +}; + +template +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr 
index_t k_per_wmma = 16; + static constexpr index_t src_a_data_size = 2; + static constexpr index_t src_b_data_size = 2; + static constexpr index_t acc_data_size = 2; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; + static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; + static constexpr index_t num_acc_vgprs_per_wave = + m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + if constexpr(wave_size == 32) + { + intrin_wmma_bf16_16x16x16_bf16_w32::Run(a, b, reg_c); + } + else if constexpr(wave_size == 64) + { + intrin_wmma_bf16_16x16x16_bf16_w64::Run(a, b, reg_c); + } + } +}; + +#endif + +template +struct wmma_type> +{ + // Absolute fixing property + static constexpr index_t m_per_wmma = 16; + static constexpr index_t n_per_wmma = 16; + static constexpr index_t k_per_wmma = 16; + static constexpr index_t src_a_data_size = 2; + static constexpr index_t src_b_data_size = 2; + static constexpr index_t acc_data_size = 4; + static constexpr index_t num_thread_per_subgroups = n_per_wmma; + + // Wave mode dependent propety + static constexpr index_t wave_size = Number{}; + static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; + static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; + static constexpr index_t num_acc_vgprs_per_wave = + m_per_wmma * n_per_wmma * acc_data_size / wave_size / 4; + static constexpr index_t num_subgroups = wave_size / num_thread_per_subgroups; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + if constexpr(wave_size == 32) + { + 
intrin_wmma_i32_16x16x16_iu8_w32::Run( + a, b, reg_c); + } + else if constexpr(wave_size == 64) + { + intrin_wmma_i32_16x16x16_iu8_w64::Run( + a, b, reg_c); } } }; @@ -159,21 +326,20 @@ struct WmmaSelector } #endif // get_warp_size do not return the correct wavesize, hardcode to 32 as workaround - static constexpr auto selected_wmma = wmma_type(), Number<32>{}>{}; + static constexpr auto selected_wmma = + wmma_type(), Number<32>{}>{}; __host__ __device__ constexpr WmmaSelector() { - static_assert(selected_wmma.m_per_wmma == 16, - "WRONG! WMMA_M must equal to 16"); - - static_assert(selected_wmma.m_per_wmma == 16, - "WRONG! WMMA_M must equal to 16"); - - static_assert(selected_wmma.k_per_wmma == 16, - "WRONG! WMMA_M must equal to 16"); - - static_assert(selected_wmma.wave_size * selected_wmma.num_acc_vgprs_per_wave * selected_wmma.acc_data_size== - selected_wmma.m_per_wmma * selected_wmma.n_per_wmma * 4, + static_assert(selected_wmma.m_per_wmma == 16, "WRONG! WMMA_M must equal to 16"); + + static_assert(selected_wmma.m_per_wmma == 16, "WRONG! WMMA_M must equal to 16"); + + static_assert(selected_wmma.k_per_wmma == 16, "WRONG! WMMA_M must equal to 16"); + + static_assert(selected_wmma.wave_size * selected_wmma.num_acc_vgprs_per_wave * + selected_wmma.acc_data_size == + selected_wmma.m_per_wmma * selected_wmma.n_per_wmma * 4, "WRONG! 
Invalid Number of Accumulator Register"); } }; @@ -198,7 +364,7 @@ struct WmmaGemm __host__ __device__ constexpr WmmaGemm() { - static_assert(NPerWmma == 16 && MPerWmma == 16 , + static_assert(NPerWmma == 16 && MPerWmma == 16, "Only support GemmNPerWmma == 16 and GemmMPerWmma == 16 for wmma"); static_assert(KPack == wmma_instr.k_per_wmma, "KPack should be k_per_wmma"); @@ -209,23 +375,29 @@ struct WmmaGemm // MPerWMMA_NPerWMMA -> MSubGroup_..._NPerWMMA_MAccVgprPerWave template __host__ __device__ static constexpr auto - MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs - (const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) + MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) { - const auto MBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); - const auto NBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); - const auto MWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); - const auto NWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); + const auto MBlockxRepeat = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); + const auto NBlockxRepeat = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); + const auto MWave = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); + const auto NWave = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); return transform_tensor_descriptor( c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma, - make_tuple(make_pass_through_transform(MBlockxRepeat), - 
make_pass_through_transform(MWave), - make_unmerge_transform(make_tuple(Number{}, - Number{})), - make_pass_through_transform(NBlockxRepeat), - make_pass_through_transform(NWave), - make_pass_through_transform(Number{})), + make_tuple( + make_pass_through_transform(MBlockxRepeat), + make_pass_through_transform(MWave), + make_unmerge_transform(make_tuple(Number{}, + Number{})), + make_pass_through_transform(NBlockxRepeat), + make_pass_through_transform(NWave), + make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -243,23 +415,29 @@ struct WmmaGemm // Per-Pixel write template __host__ __device__ static constexpr auto - MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup - (const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) + MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) { - const auto MBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); - const auto NBlockxRepeat = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); - const auto MWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); - const auto NWave = c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); + const auto MBlockxRepeat = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); + const auto NBlockxRepeat = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); + const auto MWave = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); + const auto NWave = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); return 
transform_tensor_descriptor( c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma, - make_tuple(make_pass_through_transform(MBlockxRepeat), - make_pass_through_transform(MWave), - make_unmerge_transform(make_tuple(Number{}, - Number{})), - make_pass_through_transform(NBlockxRepeat), - make_pass_through_transform(NWave), - make_pass_through_transform(Number{})), + make_tuple( + make_pass_through_transform(MBlockxRepeat), + make_pass_through_transform(MWave), + make_unmerge_transform(make_tuple(Number{}, + Number{})), + make_pass_through_transform(NBlockxRepeat), + make_pass_through_transform(NWave), + make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, @@ -279,39 +457,34 @@ struct WmmaGemm return wmma_instr.num_acc_vgprs_per_wave; } - __device__ static constexpr index_t GetWaveSize() - { - return wmma_instr.wave_size; - } + __device__ static constexpr index_t GetWaveSize() { return wmma_instr.wave_size; } template __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { - static_assert((is_same::value && is_same::value) || - (is_same::value && is_same::value) || - (is_same::value && is_same::value) || - (is_same::value && is_same::value) || - (is_same::value && is_same::value) + static_assert( + (is_same::value && is_same::value) || + (is_same::value && is_same::value) || + (is_same::value && is_same::value) || + (is_same::value && is_same::value) || + (is_same::value && is_same::value) #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 - || (is_same::value && is_same::value) + || (is_same::value && is_same::value) #endif - ,"base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), (int8, int32) or (int4, int32)!"); + , + "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), " + "(int8, int32) or (int4, int32)!"); if constexpr(!TransposeC) { - wmma_instr.template run( - p_a_wave, p_b_wave, p_c_thread); + 
wmma_instr.template run(p_a_wave, p_b_wave, p_c_thread); } else { - wmma_instr.template run( - p_b_wave, p_a_wave, p_c_thread); + wmma_instr.template run(p_b_wave, p_a_wave, p_c_thread); } } - __device__ static auto GetLaneId() - { - return get_thread_local_1d_id() % wmma_instr.wave_size; - } + __device__ static auto GetLaneId() { return get_thread_local_1d_id() % wmma_instr.wave_size; } __device__ static auto GetSubGroupId() { @@ -322,9 +495,9 @@ struct WmmaGemm { return GetLaneId() % wmma_instr.num_thread_per_subgroups; } - __device__ static auto GetSwizzledLaneIdLow() - { - return ((GetLaneIdUnderSubGroup() & 1) << 3 ) | (GetLaneIdUnderSubGroup() >> 1); + __device__ static auto GetSwizzledLaneIdLow() + { + return ((GetLaneIdUnderSubGroup() & 1) << 3) | (GetLaneIdUnderSubGroup() >> 1); } __host__ __device__ static auto CalculateAThreadOriginDataIndex() @@ -345,13 +518,13 @@ struct WmmaGemm return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; } - static constexpr auto wmma = WmmaSelector{}; + static constexpr auto wmma = WmmaSelector{}; static constexpr auto wmma_instr = wmma.selected_wmma; - __host__ __device__ static constexpr auto GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths() + __host__ __device__ static constexpr auto + GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths() { - return make_tuple( - I1, I1, Number{}); + return make_tuple(I1, I1, Number{}); } }; diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index 6f08a59bd99..fda6bbb21bf 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -8,6 +8,8 @@ // TODO: Add arch limitation namespace ck { +/********************************WAVE32 MODE***********************************************/ + // src: fp16, dst: fp32 template struct intrin_wmma_f32_16x16x16_f16_w32; @@ -23,20 +25,6 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> } }; -template -struct intrin_wmma_f32_16x16x16_f16_w64; - -template <> -struct 
intrin_wmma_f32_16x16x16_f16_w64<16, 16> -{ - template - __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) - { - reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); - } -}; - // src: bf16, dst: fp32 template struct intrin_wmma_f32_16x16x16_bf16_w32; @@ -111,5 +99,95 @@ struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16, neg_a, neg_b, clamp> } }; +/********************************WAVE64 MODE***********************************************/ + +template +struct intrin_wmma_f32_16x16x16_f16_w64; + +template <> +struct intrin_wmma_f32_16x16x16_f16_w64<16, 16> +{ + template + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + +// src: bf16, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_bf16_w64; + +template <> +struct intrin_wmma_f32_16x16x16_bf16_w64<16, 16> +{ + template + __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + +// src: fp16, dst: fp16 +template +struct intrin_wmma_f16_16x16x16_f16_w64; + +template +struct intrin_wmma_f16_16x16x16_f16_w64<16, 16, Opsel> +{ + template + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) + { + // opsel usage + // false: D0.[0:15] = result + // true : D0.[16:31]= result + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); + } +}; + +// src: bf16, dst: bf16 +template +struct intrin_wmma_bf16_16x16x16_bf16_w64; + +template +struct intrin_wmma_bf16_16x16x16_bf16_w64<16, 16, Opsel> +{ + template + 
__device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c) + { + // opsel usage + // false: D0.[0:15] = result + // true : D0.[16:31]= result + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); + } +}; + +// src: iu8, dst: i32 +template +struct intrin_wmma_i32_16x16x16_iu8_w64; + +template +struct intrin_wmma_i32_16x16x16_iu8_w64<16, 16, neg_a, neg_b, clamp> +{ + template + __device__ static void Run(const int8x16_t& reg_a, const int8x16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64( + neg_a, + bit_cast(reg_a), + neg_b, + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + clamp); + } +}; + } // namespace ck #endif From 9739ede0723aec5de436acbf33badb47946814b1 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 12 Dec 2022 10:44:57 +0000 Subject: [PATCH 015/118] temp save --- example/01_gemm/gemm_wmma_fp16.cpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 934 ++++++++++++++++-- .../gpu/grid/gridwise_gemm_wmma.hpp | 4 +- include/ck/utility/amd_inline_asm.hpp | 12 + 4 files changed, 854 insertions(+), 98 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 7d8ae1e9bbc..43348d6e5df 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -35,7 +35,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave| _MBlock_MWaveMPerWmma| ScalarPerVector| // ######| | | | | | | | | Operation| Operation| 
Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, 1, 1, S<1, 32, 1, 8>, 8>; + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>; using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 5d452d744be..88bf6a9892e 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -8,6 +8,8 @@ #include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp" #include "ck/tensor_description/tensor_adaptor.hpp" +#define CK_MNK_LOOP + namespace ck { template {}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto WmmaK = Number<16>{}; + + using ThisThreadBlock = ThisThreadBlock; + + // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. 
+ static constexpr index_t WaveSize = 32; + + static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); + static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); + static constexpr index_t KPerBlock = + BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); + static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); + static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); + static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr auto wmma_gemm = WmmaGemm{}; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); + // |KRepeat |MRepeat|MWave |MLane |KPack + return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); + // |KRepeat |NRepeat|Nwave |NLane |KPack + return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); + } + + template + __device__ static auto 
CalculateCThreadOriginDataIndex(Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); + + constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle() + { + static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && + BK0NK1BlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && + NPerBlock % (NPerWMMA * NRepeat) == 0, + "wrong!"); + } + // Thread level, register decriptor. 
Vector-write + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + MSubGroup, + Number{}, + I1, + NThreadPerSubGroup, + MAccVgprs)); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // Thread level, register decriptor. 
Per-pixel write + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave + // |NThreadPerSubGroup + make_tuple(Number{}, + I1, + MSubGroup, + MAccVgprs, + Number{}, + I1, + NThreadPerSubGroup)); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // Provide dimension size + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + 
make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + + __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() + { + return transform_tensor_descriptor( + AK0MK1BlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() + { + return transform_tensor_descriptor( + BK0NK1BlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma + static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); + static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + // auto a_thread_buf = make_static_buffer( + // a_thread_desc_.GetElementSpaceSize()); + // auto b_thread_buf = make_static_buffer( + // b_thread_desc_.GetElementSpaceSize()); + + StaticBufferTupleOfVector + a_thread_buf; + + StaticBufferTupleOfVector + b_thread_buf; + + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
+ static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0, I0), + a_thread_buf.GetVectorTypeReference(Number{}).template AsType()); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0, I0), + b_thread_buf.GetVectorTypeReference(Number{}).template AsType()); + // vector_type a_thread_vec; + // vector_type b_thread_vec; + + // static_for<0, WmmaK, 1>{}([&](auto i) { + // a_thread_vec.template AsType()(i) = + // a_thread_buf[Number{}]; + // b_thread_vec.template AsType()(i) = + // b_thread_buf[Number{}]; + // }); + + // using wmma_input_type = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_buf.GetVectorTypeReference(Number{}), + b_thread_buf.GetVectorTypeReference(Number{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + + protected: + // A[M0, M1, M2, K0 = WmmaK] + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, I1, Number{})); + + // B[N0, N1, N2, K0 = WmmaK] + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, I1, Number{})); + + // C[M, N, NumRegWMMA] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4>, + 4, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4>, + 4, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy 
b_thread_copy_{CalculateBThreadOriginDataIndex()}; +}; + +template +/* A: K0PerBlock x MPerBlock x K1 + * B: K0PerBlock x NPerBlock x K1 + * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs + * KPACK == WMMA_K = 16 + */ +struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto WmmaK = Number<16>{}; + + using ThisThreadBlock = ThisThreadBlock; + + // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. + static constexpr index_t WaveSize = 32; + + static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); + static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); + static constexpr index_t KPerBlock = + BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); + static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); + static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); + static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr auto wmma_gemm = WmmaGemm{}; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return 
threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); + // |KRepeat |MRepeat|MWave |MLane |KPack + return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); + // |KRepeat |NRepeat|Nwave |NLane |KPack + return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); + } + + template + __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); + + constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop() + { + static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && + BK0NK1BlockDesc::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && + NPerBlock % (NPerWMMA * NRepeat) == 0, + "wrong!"); + } + // Thread level, register decriptor. Vector-write + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + MSubGroup, + Number{}, + I1, + NThreadPerSubGroup, + MAccVgprs)); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + 
c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // Thread level, register decriptor. Per-pixel write + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave + // |NThreadPerSubGroup + make_tuple(Number{}, + I1, + MSubGroup, + MAccVgprs, + Number{}, + I1, + NThreadPerSubGroup)); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // Provide dimension size + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + 
constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + + __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() + { + return transform_tensor_descriptor( + AK0MK1BlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() + { + return transform_tensor_descriptor( + BK0NK1BlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma + static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); + static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(I0, m0, I0, I0, I0), + a_block_buf, + 
a_thread_desc_, + make_tuple(I0, I0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(I0, n0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0, I0), + b_thread_buf); + + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + using wmma_input_type = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + + protected: + // A[M0, M1, M2, K0 = WmmaK] + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, I1, Number{})); + + // B[N0, N1, N2, K0 = WmmaK] + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, I1, Number{})); + + // C[M, N, NumRegWMMA] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<3, 0, 1, 2, 4>, + 4, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<3, 0, 1, 2, 4>, + 4, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; +}; + +template +/* A: K0PerBlock x MPerBlock x K1 + * B: K0PerBlock x NPerBlock x K1 + * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs + * KPACK == 
WMMA_K = 16 + */ +struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -125,7 +850,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle return make_tuple(c_thread_m, c_thread_n); } - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle() + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO() { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && BK0NK1BlockDesc::IsKnownAtCompileTime(), @@ -283,33 +1008,26 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); - // constexpr auto RepeatDiff = MRepeat - NRepeat; - - // debug_hexprinter(0xffffffff, a_thread_buf[Number{}], "Avalue "); - /* First local prefetch, move out of blockwise operation. - static_for<0, NRepeat, 1>{}([&](auto iN){ - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(I0, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); - }); - static_for<0, MRepeat, 1>{}([&](auto iN){ - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(I0, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); - }); - */ - /* + constexpr auto RepeatDiff = MRepeat - NRepeat; + static_for<0, KPerBlock, WmmaK>{}([&](auto iWmmaK){ - // Cut to Repeat Retangle to Square, assume MRepeat > NRepeat + + // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat static_for<0, RepeatDiff, 1>{}([&](auto iCut){ + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); static_for<0, NRepeat, 1>{}([&](auto iN){ + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + b_block_buf, + 
b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); + vector_type a_thread_vec; vector_type b_thread_vec; @@ -323,22 +1041,31 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle }); using wmma_input_type = typename vector_type::type; - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - // debug_hexprinter(0x3c003c00, a_thread_vec.template - AsType()(Number<0>{})); wmma_gemm.template Run( a_thread_vec.template - AsType()(Number<0>{}), b_thread_vec.template - AsType()(Number<0>{}), + constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, - Number{}), a_block_buf, a_thread_desc_, make_tuple(I0, Number{}, I0, I0, - I0), a_thread_buf); }); - // Run FIFO fashion loopover in Square + + // Stage 2: Run FIFO fashion loopover in Square static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + // Row Repeatation static_for{}([&](auto iN){ + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); vector_type a_thread_vec; vector_type b_thread_vec; @@ -352,20 +1079,29 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle }); using wmma_input_type = typename vector_type::type; - constexpr index_t c_offset = + constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template 
AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, - Number{}, I0, I0, Number{}), a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - static_for{}([&](auto iM){ + + // WmmaInnerloop++ + // Col Repeatation + static_for{}([&](auto iM){ + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); vector_type a_thread_vec; vector_type b_thread_vec; @@ -386,54 +1122,6 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, - I0, Number{}), b_block_buf, b_thread_desc_, make_tuple(I0, - Number{}, I0, I0, I0), b_thread_buf); - }); - }); - */ - - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
- static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), - a_thread_buf); - - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), - b_thread_buf); - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(i) = - b_thread_buf[Number{}]; - }); - - using wmma_input_type = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - }); }); }); } @@ -441,11 +1129,11 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle protected: // A[M0, M1, M2, K0 = WmmaK] static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, I1, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // B[N0, N1, N2, K0 = WmmaK] static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, I1, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( @@ -503,4 +1191,60 @@ constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_Selector() } }; +template +constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop{}; + } +}; + +template 
+constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; + } +}; + } // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 9b3bf5e272a..0f11801e115 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -481,7 +481,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), /* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, -/* typename DstDimAccessOrder, */ Sequence<1, 0, 2>, +/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, /* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, /* index_t DstVectorDim, */ 2, /* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, @@ -513,7 +513,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma decltype(b_grid_desc_k0_n_k1), decltype(b_block_desc_k0perblock_nperblock_k1), BBlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, + Sequence<0, 1, 2>, BBlockTransferSrcVectorDim, 2, BBlockTransferSrcScalarPerVector, diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp index 82bf2a5eb57..db27e6644b2 100644 --- a/include/ck/utility/amd_inline_asm.hpp +++ b/include/ck/utility/amd_inline_asm.hpp @@ -355,5 +355,17 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, c3); } +// Ranged input operand +__device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, + half16_t b, + float8_t& c) +{ + asm volatile("\n \ + v_wmma_f32_16x16x16_f16_w32 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +} + } // namespace ck #endif From e43df26a9414e2a14c4411480d8179ee88fd4230 Mon Sep 17 00:00:00 2001 From: 
aska-0096 Date: Tue, 13 Dec 2022 10:07:25 +0000 Subject: [PATCH 016/118] temp save, reproduce the v_bfi_b32 issue --- .../gpu/block/blockwise_gemm_wmma.hpp | 178 ++++++++++-------- .../gpu/grid/gridwise_gemm_wmma.hpp | 6 +- include/ck/utility/amd_inline_asm.hpp | 4 +- include/ck/utility/amd_wmma.hpp | 7 +- test/wmma_op/wmma_op_util.hpp | 43 ++++- 5 files changed, 146 insertions(+), 92 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 88bf6a9892e..15908c2ca4a 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -280,24 +280,24 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle const BBlockBuffer& b_block_buf, CThreadBuffer& c_thread_buf) const { - // auto a_thread_buf = make_static_buffer( - // a_thread_desc_.GetElementSpaceSize()); - // auto b_thread_buf = make_static_buffer( - // b_thread_desc_.GetElementSpaceSize()); + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); - StaticBufferTupleOfVector - a_thread_buf; - - StaticBufferTupleOfVector - b_thread_buf; + // StaticBufferTupleOfVector + // a_thread_buf; + + // StaticBufferTupleOfVector + // b_thread_buf; static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
static_for<0, MRepeat, 1>{}([&](auto m0) { @@ -306,8 +306,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle make_tuple(Number{}, m0, I0, I0, I0), a_block_buf, a_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), - a_thread_buf.GetVectorTypeReference(Number{}).template AsType()); + make_tuple(I0, m0, I0, I0, I0), + a_thread_buf); static_for<0, NRepeat, 1>{}([&](auto n0) { // read B @@ -315,28 +315,28 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle make_tuple(Number{}, n0, I0, I0, I0), b_block_buf, b_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), - b_thread_buf.GetVectorTypeReference(Number{}).template AsType()); - // vector_type a_thread_vec; - // vector_type b_thread_vec; - - // static_for<0, WmmaK, 1>{}([&](auto i) { - // a_thread_vec.template AsType()(i) = - // a_thread_buf[Number{}]; - // b_thread_vec.template AsType()(i) = - // b_thread_buf[Number{}]; - // }); - - // using wmma_input_type = typename vector_type::type; + make_tuple(I0, n0, I0, I0, I0), + b_thread_buf); + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + using wmma_input_type = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); wmma_gemm.template Run( - a_thread_buf.GetVectorTypeReference(Number{}), - b_thread_buf.GetVectorTypeReference(Number{}), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); }); @@ -346,11 +346,11 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle protected: // A[M0, M1, M2, K0 = WmmaK] static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, I1, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // B[N0, N1, N2, K0 = WmmaK] static 
constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, I1, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( @@ -659,7 +659,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop make_tuple(I0, m0, I0, I0, I0), a_block_buf, a_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), + make_tuple(I0, Number{}, I0, I0, I0), a_thread_buf); static_for<0, NRepeat, 1>{}([&](auto n0) { @@ -668,7 +668,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop make_tuple(I0, n0, I0, I0, I0), b_block_buf, b_thread_desc_, - make_tuple(I0, I0, I0, I0, I0), + make_tuple(I0, Number{}, I0, I0, I0), b_thread_buf); static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... @@ -678,10 +678,10 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop static_for<0, WmmaK, 1>{}([&](auto i) { a_thread_vec.template AsType()(i) = a_thread_buf[Number{}]; + make_tuple((k*WmmaK + i) / A_K1, m0, 0, 0, (k*WmmaK + i) % A_K1))>{}]; b_thread_vec.template AsType()(i) = b_thread_buf[Number{}]; + make_tuple((k*WmmaK + i) / B_K1, n0, 0, 0, (k*WmmaK + i) % B_K1))>{}]; }); using wmma_input_type = typename vector_type::type; @@ -701,11 +701,11 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop protected: // A[M0, M1, M2, K0 = WmmaK] static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, I1, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // B[N0, N1, N2, K0 = WmmaK] static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, I1, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( @@ -716,7 +716,7 @@ struct 
BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), Sequence, - Sequence<3, 0, 1, 2, 4>, + Sequence<0, 1, 2, 3, 4>, 4, A_K1, A_K1>; @@ -726,7 +726,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), Sequence, - Sequence<3, 0, 1, 2, 4>, + Sequence<0, 1, 2, 3, 4>, 4, B_K1, B_K1>; @@ -1009,9 +1009,17 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO b_thread_desc_.GetElementSpaceSize()); constexpr auto RepeatDiff = MRepeat - NRepeat; - + static_for<0, KPerBlock, WmmaK>{}([&](auto iWmmaK){ + static_for<0, NRepeat, 1>{}([&](auto iN){ + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); + }); // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat static_for<0, RepeatDiff, 1>{}([&](auto iCut){ a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, @@ -1021,12 +1029,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO make_tuple(I0, Number{}, I0, I0, I0), a_thread_buf); static_for<0, NRepeat, 1>{}([&](auto iN){ - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); + // b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + // make_tuple(Number{}, Number{}, I0, I0, I0), + // b_block_buf, + // b_thread_desc_, + // make_tuple(I0, Number{}, I0, I0, I0), + // b_thread_buf); vector_type a_thread_vec; vector_type b_thread_vec; @@ -1042,30 +1050,34 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO using wmma_input_type = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - + s_nop(); wmma_gemm.template Run( a_thread_vec.template 
AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); }); }); - + static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); + }); // Stage 2: Run FIFO fashion loopover in Square static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + // Row Repeatation static_for{}([&](auto iN){ - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); + + // b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + // make_tuple(Number{}, Number{}, I0, I0, I0), + // b_block_buf, + // b_thread_desc_, + // make_tuple(I0, Number{}, I0, I0, I0), + // b_thread_buf); vector_type a_thread_vec; vector_type b_thread_vec; @@ -1081,27 +1093,29 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); + s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); }); // WmmaInnerloop++ // Col Repeatation static_for{}([&](auto iM){ - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); + // 
a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + // make_tuple(Number{}, Number{}, I0, I0, I0), + // a_block_buf, + // a_thread_desc_, + // make_tuple(I0, Number{}, I0, I0, I0), + // a_thread_buf); + // b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + // make_tuple(Number{}, Number{}, I0, I0, I0), + // b_block_buf, + // b_thread_desc_, + // make_tuple(I0, Number{}, I0, I0, I0), + // b_thread_buf); vector_type a_thread_vec; vector_type b_thread_vec; @@ -1117,10 +1131,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); + s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); }); }); }); @@ -1144,7 +1160,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), Sequence, - Sequence<3, 0, 1, 2, 4>, + Sequence<0, 1, 2, 3, 4>, 4, A_K1, A_K1>; @@ -1154,7 +1170,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), Sequence, - Sequence<3, 0, 1, 2, 4>, + Sequence<0, 1, 2, 3, 4>, 4, B_K1, B_K1>; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 0f11801e115..a73d1b93773 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -310,7 +310,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); - using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle< + using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO< BlockSize, FloatAB, FloatAcc, @@ -367,7 +367,7 @@ struct 
GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); - using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle< + using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO< BlockSize, FloatAB, FloatAcc, @@ -540,7 +540,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle template __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) { - reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + // * Inline assembly need to elimate the duplicated data load, compiler won't help you delete them. + amd_assembly_wmma_f32_16x16x16_f16_w32(reg_a, reg_b, reg_c.template AsType()(Number<0>{})); + // reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( + // reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); } }; diff --git a/test/wmma_op/wmma_op_util.hpp b/test/wmma_op/wmma_op_util.hpp index ef3f831abde..9961ff885cf 100644 --- a/test/wmma_op/wmma_op_util.hpp +++ b/test/wmma_op/wmma_op_util.hpp @@ -97,6 +97,7 @@ builtin_wmma_naive_selector __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) { + __shared__ src_t p_shared[16*16*2]; const int lIdx = threadIdx.x; // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and // b a_frag will store one column of the 16x16 matrix tile b_frag will store one row of the @@ -104,6 +105,9 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) using src_vec = typename vector_type::type; src_vec a_frag = {}; src_vec b_frag = {}; + + src_vec a_temp = {}; + src_vec b_temp = {}; // initialize c fragment to 0 using acc_vec = StaticBufferTupleOfVector; acc_vec c_thread_buf_; @@ -112,19 +116,52 @@ 
__global__ void matmul(const src_t* a, const src_t* b, dst_t* c) // see https://atlvsp3.amd.com/sp3_gfx11_5_instructions.pdf page 482 // TODO: remove this dependency in gfx12 https://ontrack-internal.amd.com/browse/DEGFXSP3-101 const int lane = lIdx % 16; + const int lane_lo = lIdx / 2; + const int lane_hi = lIdx % 2; + for(int ele = 0; ele < 8; ++ele) + { + a_temp[ele] = a[8 * lane_hi + 16 * lane_lo + ele]; + } + + for(int ele = 0; ele < 8; ++ele) + { + b_temp[ele] = b[8 * lane_hi + 16 * lane_lo + ele]; + } + + __syncthreads(); + + for(int ele = 0; ele < 8; ++ele) + { + p_shared[8*16*lane_hi + 8 * lane_lo + ele] = a_temp[ele]; + } + + for(int ele = 0; ele < 8; ++ele) + { + p_shared[8*16*lane_hi + 8 * lane_lo + ele + 16*16] = b_temp[ele]; + } + + asm volatile("\ + s_waitcnt lgkmcnt(0) \n \ + s_barrier \ + " ::); for(int ele = 0; ele < 16; ++ele) { - b_frag[ele] = b[16 * lane + ele]; + b_frag[ele] = p_shared[(ele/8) * 16*8 + 8 * lane + ele%8 + 16*16]; } // follow origin design for(int ele = 0; ele < 16; ++ele) { - a_frag[ele] = a[16 * lane + ele]; + a_frag[ele] = p_shared[(ele/8) * 16*8 + 8 * lane + ele%8]; } + asm volatile("\ + s_waitcnt lgkmcnt(0) \n \ + s_barrier \ + " ::); + // sync threads, similar to mma_sync - __syncthreads(); + // __syncthreads(); builtin_wmma_naive_selector(a_frag, b_frag, c_thread_buf_); __syncthreads(); // wait for results, similar to mma_sync From 13af8cc43ef5674707f1a009f836602f47db4b33 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 13 Dec 2022 10:18:12 +0000 Subject: [PATCH 017/118] add inline asm for wmmaop test --- test/wmma_op/wmma_op_util.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/test/wmma_op/wmma_op_util.hpp b/test/wmma_op/wmma_op_util.hpp index 9961ff885cf..c70e6a407de 100644 --- a/test/wmma_op/wmma_op_util.hpp +++ b/test/wmma_op/wmma_op_util.hpp @@ -97,7 +97,7 @@ builtin_wmma_naive_selector __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) { - __shared__ src_t 
p_shared[16*16*2]; + __shared__ src_t p_shared[16 * 16 * 2]; const int lIdx = threadIdx.x; // a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and // b a_frag will store one column of the 16x16 matrix tile b_frag will store one row of the @@ -115,7 +115,7 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) // lane is (0-31) mod 16 instead of 0-31 due to matrix replication in gfx11 // see https://atlvsp3.amd.com/sp3_gfx11_5_instructions.pdf page 482 // TODO: remove this dependency in gfx12 https://ontrack-internal.amd.com/browse/DEGFXSP3-101 - const int lane = lIdx % 16; + const int lane = lIdx % 16; const int lane_lo = lIdx / 2; const int lane_hi = lIdx % 2; for(int ele = 0; ele < 8; ++ele) @@ -129,15 +129,15 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) } __syncthreads(); - + for(int ele = 0; ele < 8; ++ele) { - p_shared[8*16*lane_hi + 8 * lane_lo + ele] = a_temp[ele]; + p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele] = a_temp[ele]; } for(int ele = 0; ele < 8; ++ele) { - p_shared[8*16*lane_hi + 8 * lane_lo + ele + 16*16] = b_temp[ele]; + p_shared[8 * 16 * lane_hi + 8 * lane_lo + ele + 16 * 16] = b_temp[ele]; } asm volatile("\ @@ -147,12 +147,12 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) for(int ele = 0; ele < 16; ++ele) { - b_frag[ele] = p_shared[(ele/8) * 16*8 + 8 * lane + ele%8 + 16*16]; + b_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8 + 16 * 16]; } // follow origin design for(int ele = 0; ele < 16; ++ele) { - a_frag[ele] = p_shared[(ele/8) * 16*8 + 8 * lane + ele%8]; + a_frag[ele] = p_shared[(ele / 8) * 16 * 8 + 8 * lane + ele % 8]; } asm volatile("\ @@ -163,6 +163,9 @@ __global__ void matmul(const src_t* a, const src_t* b, dst_t* c) // sync threads, similar to mma_sync // __syncthreads(); builtin_wmma_naive_selector(a_frag, b_frag, c_thread_buf_); + // since only fp16_fp32 asm wmma implemented for experiment purpose, restrict test case to fp16 
+ // when enable this ck::amd_assembly_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, + // c_thread_buf_.GetVectorTypeReference(Number<0>{}).template AsType()(Number<0>{})); __syncthreads(); // wait for results, similar to mma_sync static_for<0, 8, 1>{}([&](auto ele) { From 63f8766206b72ad5a25ce8274343938d4fe35ff9 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 15 Dec 2022 06:49:03 +0000 Subject: [PATCH 018/118] tidy up --- example/01_gemm/gemm_wmma_fp16.cpp | 19 +- .../gpu/block/blockwise_gemm_wmma.hpp | 944 +++++------------- .../gpu/device/impl/device_gemm_wmma.hpp | 9 +- .../gpu/grid/gridwise_gemm_wmma.hpp | 258 +---- .../threadwise_tensor_slice_transfer_v3r1.hpp | 3 - .../tensor_operation/gpu/warp/wmma_gemm.hpp | 75 +- 6 files changed, 274 insertions(+), 1034 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 43348d6e5df..e36ff630c42 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -22,20 +22,13 @@ using CElementOp = PassThrough; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; // clang-format off -// using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmWmma -// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MWMMA|NMMMA| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| -// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| -// ######| | | | 
| | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| -// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - // < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, 6, 1>; -// clang-format on - using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle -// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MWmma|NWmma| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | WMMA| WMMA| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave| _MBlock_MWaveMPerWmma| ScalarPerVector| -// ######| | | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| -// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, 
CElementOp, GemmDefault, 256, 128, 128, 4, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>; +// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MRepeat|NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave| _MBlock_MWaveMPerWmma| ScalarPerVector| +// ######| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 15908c2ca4a..84c639391b9 100644 --- 
a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -13,7 +13,8 @@ namespace ck { template {}; + static constexpr auto wmma_gemm = WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -140,464 +141,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } - // Thread level, register decriptor. Vector-write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave - // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, - I1, - MSubGroup, - Number{}, - I1, - NThreadPerSubGroup, - MAccVgprs)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - 
make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - - // Thread level, register decriptor. Per-pixel write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave - // |NThreadPerSubGroup - make_tuple(Number{}, - I1, - MSubGroup, - MAccVgprs, - Number{}, - I1, - NThreadPerSubGroup)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - - // 
Provide dimension size - __host__ __device__ static constexpr auto - GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); - } - - __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() - { - return transform_tensor_descriptor( - AK0MK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() - { - return transform_tensor_descriptor( - BK0NK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma - static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); - static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); - - template - __device__ void Run(const ABlockBuffer& a_block_buf, - const BBlockBuffer& b_block_buf, - CThreadBuffer& c_thread_buf) const - { - auto a_thread_buf = make_static_buffer( - a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( - b_thread_desc_.GetElementSpaceSize()); - - 
// StaticBufferTupleOfVector - // a_thread_buf; - - // StaticBufferTupleOfVector - // b_thread_buf; - - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... - static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0), - a_thread_buf); - - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0), - b_thread_buf); - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(i) = - b_thread_buf[Number{}]; - }); - - using wmma_input_type = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - }); - }); - }); - } - - protected: - // A[M0, M1, M2, K0 = WmmaK] - static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); - - // B[N0, N1, N2, K0 = WmmaK] - static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); - - // C[M, N, NumRegWMMA] - static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); - - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - A_K1, - A_K1>; - - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - B_K1, - B_K1>; - - AThreadCopy 
a_thread_copy_{CalculateAThreadOriginDataIndex()}; - BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; -}; - -template -/* A: K0PerBlock x MPerBlock x K1 - * B: K0PerBlock x NPerBlock x K1 - * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs - * KPACK == WMMA_K = 16 - */ -struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto WmmaK = Number<16>{}; - - using ThisThreadBlock = ThisThreadBlock; - - // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. - static constexpr index_t WaveSize = 32; - - static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); - static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); - static constexpr index_t KPerBlock = - BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); - - static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); - static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); - static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); - static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); - - static constexpr auto wmma_gemm = WmmaGemm{}; - - static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); - static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); - - StaticBufferTupleOfVector - c_thread_buf_; - - __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } - - __device__ static auto GetWaveIdx() - { - const index_t thread_id = ThisThreadBlock::GetThreadId(); - - constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), - make_tuple(Sequence<0, 1, 2>{}), - 
make_tuple(Sequence<0>{})); - - return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); - } - - __device__ static auto CalculateAThreadOriginDataIndex() - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - - const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); - // |KRepeat |MRepeat|MWave |MLane |KPack - return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); - } - - __device__ static auto CalculateBThreadOriginDataIndex() - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_n = wave_idx[I1]; - - const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); - // |KRepeat |NRepeat|Nwave |NLane |KPack - return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); - } - - template - __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); - - constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( - make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; - const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( - make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; - - return make_tuple(c_thread_m, c_thread_n); - } - - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop() - { - static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && - BK0NK1BlockDesc::IsKnownAtCompileTime(), - 
"wrong! Desc should be known at compile-time"); - - static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, - "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); - - static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && - NPerBlock % (NPerWMMA * NRepeat) == 0, - "wrong!"); - } - // Thread level, register decriptor. Vector-write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave - // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, - I1, - MSubGroup, - Number{}, - I1, - NThreadPerSubGroup, - MAccVgprs)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - 
c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - - // Thread level, register decriptor. Per-pixel write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave - // |NThreadPerSubGroup - make_tuple(Number{}, - I1, - MSubGroup, - MAccVgprs, - Number{}, - I1, - NThreadPerSubGroup)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - + // Provide dimension size __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() @@ -648,50 +192,50 
@@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop const BBlockBuffer& b_block_buf, CThreadBuffer& c_thread_buf) const { - auto a_thread_buf = make_static_buffer( + auto a_thread_buf = make_static_buffer( a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( + auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); - static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(I0, m0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0), + a_thread_buf); - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(I0, n0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); - - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
- vector_type a_thread_vec; - vector_type b_thread_vec; + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0), + b_thread_buf); + vector_type a_thread_vec; + vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = + a_thread_vec.template AsType()(i) = a_thread_buf[Number{}]; - b_thread_vec.template AsType()(i) = + make_tuple(i / A_K1, m0, 0, 0, i % A_K1))>{}]; + b_thread_vec.template AsType()(i) = b_thread_buf[Number{}]; + make_tuple(i / B_K1, n0, 0, 0, i % B_K1))>{}]; }); - using wmma_input_type = typename vector_type::type; + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); }); @@ -699,33 +243,33 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop } protected: - // A[M0, M1, M2, K0 = WmmaK] + // A[K0, M0, M1, M2, K1] static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); - // B[N0, N1, N2, K0 = WmmaK] + // B[K0, N0, N1, N2, K1] static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); + make_tuple(Number{}, Number{}, I1, I1, Number{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); - using AThreadCopy = 
ThreadwiseTensorSliceTransfer_v4, + Sequence, Sequence<0, 1, 2, 3, 4>, 4, A_K1, A_K1>; - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence, Sequence<0, 1, 2, 3, 4>, 4, B_K1, @@ -735,8 +279,11 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; }; + +// block wise level pipe designed for inline asm template {}; + static constexpr auto wmma_gemm = WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -908,51 +455,6 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); } - // Thread level, register decriptor. Per-pixel write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_MAccVgprs_NRepeat_NWave_NThreadPerSubGroup() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |MAccVgprs |NRepeat |NWave - // |NThreadPerSubGroup - make_tuple(Number{}, - I1, - MSubGroup, - MAccVgprs, - Number{}, - I1, - NThreadPerSubGroup)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - 
transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - // Provide dimension size __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() @@ -1003,141 +505,227 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO const BBlockBuffer& b_block_buf, CThreadBuffer& c_thread_buf) const { - auto a_thread_buf = make_static_buffer( + auto a_thread_buf = make_static_buffer( a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( + auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); constexpr auto RepeatDiff = MRepeat - NRepeat; - - static_for<0, KPerBlock, WmmaK>{}([&](auto iWmmaK){ - + // Read all Mrepeat, Nrepeat + static_for<0, NRepeat, 1>{}([&](auto iN){ + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(I0, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); + }); + + static_for<0, MRepeat, 1>{}([&](auto iM){ + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(I0, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); + }); + + // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat + static_for<0, RepeatDiff, 1>{}([&](auto iCut){ static_for<0, NRepeat, 1>{}([&](auto iN){ - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, 
Number{}, I0, I0, I0), - b_thread_buf); + + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); + s_nop(); + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); }); - // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat - static_for<0, RepeatDiff, 1>{}([&](auto iCut){ + if constexpr( KPerBlock > WmmaK ){ + // Read Consumed Next inner loop A a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), + make_tuple(Number{}, Number{}, I0, I0, I0), a_block_buf, a_thread_desc_, make_tuple(I0, Number{}, I0, I0, I0), a_thread_buf); - static_for<0, NRepeat, 1>{}([&](auto iN){ - // b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - // make_tuple(Number{}, Number{}, I0, I0, I0), - // b_block_buf, - // b_thread_desc_, - // make_tuple(I0, Number{}, I0, I0, I0), - // b_thread_buf); - - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type = typename vector_type::type; + } + }); - constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); - }); - }); - static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ - 
a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - }); + static_for{}([&](auto iWmmaK){ // Stage 2: Run FIFO fashion loopover in Square static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ - // Row Repeatation static_for{}([&](auto iN){ - - // b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - // make_tuple(Number{}, Number{}, I0, I0, I0), - // b_block_buf, - // b_thread_desc_, - // make_tuple(I0, Number{}, I0, I0, I0), - // b_thread_buf); - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = + a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = + b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; }); - using wmma_input_type = typename vector_type::type; + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); - // WmmaInnerloop++ + // Read Consumed Next inner loop A + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); + // Col Repeatation static_for{}([&](auto iM){ - // a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - // make_tuple(Number{}, Number{}, I0, I0, I0), - // a_block_buf, - // a_thread_desc_, - // make_tuple(I0, Number{}, I0, I0, I0), - // a_thread_buf); - // 
b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - // make_tuple(Number{}, Number{}, I0, I0, I0), - // b_block_buf, - // b_thread_desc_, - // make_tuple(I0, Number{}, I0, I0, I0), - // b_thread_buf); - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = + a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = + b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; }); - using wmma_input_type = typename vector_type::type; + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); + }); + // Read Consumed Next inner loop B + b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); + }); + + // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat + static_for<0, RepeatDiff, 1>{}([&](auto iCut){ + static_for<0, NRepeat, 1>{}([&](auto iN){ + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); + s_nop(); + wmma_gemm.template Run( + 
a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); + if constexpr( KPerBlock > WmmaK ){ + a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number<(iWmmaK+WmmaK)/A_K1>{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); + } + }); + }); + + // Stage 2: Run FIFO fashion loopover in Square + static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + // Row Repeatation + static_for{}([&](auto iN){ + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); + s_nop(); + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); + }); + + // Col Repeatation + static_for{}([&](auto iM){ + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto iK) { + a_thread_vec.template AsType()(iK) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(iK) = + b_thread_buf[Number{}]; + }); + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); + s_nop(); + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + s_nop(); }); }); } @@ -1155,8 +743,8 @@ struct 
BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, @@ -1165,8 +753,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO A_K1, A_K1>; - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, @@ -1179,88 +767,4 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; }; -template -constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_Selector() -{ - if constexpr(LoopSched == LoopScheduler::Default) - { - return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle{}; - } -}; - -template -constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop_Selector() -{ - if constexpr(LoopSched == LoopScheduler::Default) - { - return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_MNKloop{}; - } -}; - -template -constexpr auto BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO_Selector() -{ - if constexpr(LoopSched == LoopScheduler::Default) - { - return BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; - } -}; - } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 9e572cf1dc7..e5773144ac0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -201,7 +201,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, remove_reference_t, @@ -384,7 +386,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, remove_reference_t, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index a73d1b93773..7b930bd7986 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -18,7 +18,8 @@ namespace ck { template {}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - constexpr auto WmmaK = 16; - constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); - - using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO< - BlockSize, - FloatAB, - FloatAcc, - decltype(a_block_desc_k0perblock_mperblock_k1), - decltype(b_block_desc_k0perblock_nperblock_k1), - MPerWmma, - NPerWmma, - MRepeat, - NRepeat, - KPack>; - - return BlockwiseGemm:: - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_grid_desc_m_n); - } - - // Per pixel - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - const CGridDesc_M_N& c_grid_desc_m_n) - { - constexpr auto max_lds_align = K1; - - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto 
b_block_desc_k0perblock_nperblock_k1 = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - constexpr auto WmmaK = 16; - constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); - - using BlockwiseGemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO< - BlockSize, - FloatAB, - FloatAcc, - decltype(a_block_desc_k0perblock_mperblock_k1), - decltype(b_block_desc_k0perblock_nperblock_k1), - MPerWmma, - NPerWmma, - MRepeat, - NRepeat, - KPack>; - - return BlockwiseGemm:: - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - c_grid_desc_m_n); - } - __host__ __device__ static constexpr auto MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) { @@ -410,11 +298,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return BlockToCTileMap_M00_N0_M01Adapt( c_grid_desc_m_n); } - // using - // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup - // = remove_cvref_t; + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; using DefaultBlock2CTileMap = @@ -422,17 +306,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma template __device__ static void - Run(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, + Run(const FloatA* __restrict__ p_a_grid, + const FloatB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, void* __restrict__ p_shared, const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& c_grid_desc_mblock_mperblock_nblock_nperblock, - // const - // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup& - // 
c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, const AElementwiseOperation& a_element_op, const BElementwiseOperation& b_element_op, const CElementwiseOperation& c_element_op, @@ -476,8 +357,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatAB, -/* typename DstData, */ FloatAB, +/* typename SrcData, */ FloatA, +/* typename DstData, */ FloatA, /* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), /* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, @@ -496,8 +377,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma a_block_desc_k0perblock_mperblock_k1, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); - // printf("BlockSliceLengths K0 = %d, M = %d, K1 = %d\n", K0PerBlock, MPerBlock, K1()); - // printf("a_block_wise_copy: %s\n", std::string(type_name()).c_str()); // B matrix blockwise copy auto b_blockwise_copy = @@ -508,8 +387,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma Sequence, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, + FloatB, + FloatB, decltype(b_grid_desc_k0_n_k1), decltype(b_block_desc_k0perblock_nperblock_k1), BBlockTransferSrcAccessOrder, @@ -530,18 +409,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma ck::tensor_operation::element_wise::PassThrough{}); /*******************************************************************************/ - // GEMM definition - // c_mtx += a_mtx * b_mtx - // a_mtx[K0PerBlock, MPerBlock] is in LDS - // b_mtx[K0PerBlock, NPerBlock] is in LDS - // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in register - + // GEMM constexpr auto WmmaK = 16; constexpr auto KPack = 
math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); // Shift Per SUB_K constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); @@ -582,101 +457,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma c_thread_buf, K0BlockMainLoop); /*******************************************************************************/ -#ifdef CK_EXPERIMENTAL_ARBITRARY_WRITEOUT - // write out C matrix, c shuffle not implemented - { - static_for<0, 16, 1>{}([&](auto i){ - char info[4]; - info[0] = 'C'; - info[1] = i/10 + '0'; - info[2] = i%10 + '0'; - info[3] = '\0'; - debug_hexprinter(0xffffffff, c_thread_buf[Number{}], info); - }); - - constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - - // This API Provide All dimension (size) you need - constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - - constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I1); - constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I2); - constexpr auto NWave = 
c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I4); - constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I5); - constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs.GetLength(I6); - // printf("MWave = %d, MSubGroup = %d, NWave = %d, NThreadPerSubGroup = %d, MAccVgprs = %d\n", MWave, MSubGroup, NWave, NThreadPerSubGroup, MAccVgprs); - // Mapping - const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); - const index_t m_thread_data_on_grid = m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; - const index_t n_thread_data_on_grid = n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; - // Checked - // debug_hexprinter(0xffffffff, m_thread_data_on_grid, "c_m"); - // debug_hexprinter(0xffffffff, n_thread_data_on_grid, "c_n"); - - const auto m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), - make_tuple(Sequence<0, 1, 2, 3>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto m_thread_data_on_grid_idx = m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_grid)); - debug_hexprinter(0x4, MRepeat, "mblockxrepeat"); - debug_hexprinter(0x2, MWave, "mwave"); - debug_hexprinter(0x2, MSubGroup, "msubgroup"); - debug_hexprinter(0x8, MAccVgprs, "maccvgprs"); - debug_hexprinter(0x4, NWave, "nwave"); - - const auto n_thread_data_on_grid_idx = 
n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_grid)); - - - // printf("write out dimension access order = (%d, %d, %d, %d, %d, %d, %d)\n", CThreadTransferSrcDstAccessOrder{}[Number<0>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<1>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<2>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<3>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<4>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<5>{}].value, CThreadTransferSrcDstAccessOrder{}[Number<6>{}].value); - auto c_thread_copy = - ThreadwiseTensorSliceTransfer_v1r3< - /* typename SrcData */ FloatAcc, - /* typename DstData */ FloatC, - /* typename SrcDesc */ decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs), - /* typename DstDesc */ decltype(c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup), - /* typename ElementwiseOperation */ CElementwiseOperation, - // Thread register Mapping 0 1 2 4 5 6 3 - /* typename SliceLengths */ Sequence, - /* typename DimAccessOrder */ CThreadTransferSrcDstAccessOrder, - /* index_t DstVectorDim */ CThreadTransferSrcDstVectorDim, - /* index_t DstScalarPerVector */ CThreadTransferDstScalarPerVector, - /* InMemoryDataOperationEnum DstInMemOp */ CGlobalMemoryDataOperation, - /* index_t DstScalarStrideInVector */ 1, - /* bool DstResetCoordinateAfterRun */ true> - { - /* dst_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, - /* dst_slice_origin_idx */ make_multi_index(m_thread_data_on_grid_idx[I0], - m_thread_data_on_grid_idx[I1], - m_thread_data_on_grid_idx[I2], - m_thread_data_on_grid_idx[I3], - n_thread_data_on_grid_idx[I0], - n_thread_data_on_grid_idx[I1], - n_thread_data_on_grid_idx[I2]), - /* element_op */ c_element_op - }; - - c_thread_copy.Run( - /* c_thread_desc */ 
c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, - /* c_register_beginning*/ make_tuple(I0, I0, I0, I0, I0, I0, I0), - /* c_local(register) */ c_thread_buf, - /* c_grid_desc */ c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, - /* c_grid_buf */ c_grid_buf); - } -#endif + // write out to C, implement shuffle { - // write out to C, implement shuffle constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 1cfaaf09378..cb289d339fe 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -128,12 +128,10 @@ struct ThreadwiseTensorSliceTransfer_v3r1 detail::lambda_scalar_per_access{}, Number{}); constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; - // printf("src_access_lengths: %d, %d, %d\n", (src_access_lengths[Number<0>{}])(), src_access_lengths[Number<1>{}](), src_access_lengths[Number<2>{}]()); constexpr auto src_dim_access_order = SrcDimAccessOrder{}; constexpr auto ordered_src_access_lengths = container_reorder_given_new2old(src_access_lengths, src_dim_access_order); - // printf("ordered_src_access_lengths: %d, %d, %d\n", (ordered_src_access_lengths[Number<0>{}])(), ordered_src_access_lengths[Number<1>{}](), ordered_src_access_lengths[Number<2>{}]()); // make forward steps const auto src_forward_steps = generate_tuple( @@ -210,7 +208,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; // apply SrcElementwiseOperation on src_vector_container - // 
debug_hexprinter(0xffffffff, src_coord_.GetOffset()); static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { SrcData src_v; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 7b8887b3957..a2685e659bc 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -283,51 +283,51 @@ struct wmma_type +template struct WmmaSelector { - template + template static constexpr auto GetWmma(); template <> - static constexpr auto GetWmma() + static constexpr auto GetWmma() { return WmmaInstr::wmma_f32_16x16x16_f16; } template <> - static constexpr auto GetWmma() + static constexpr auto GetWmma() { return WmmaInstr::wmma_f32_16x16x16_bf16; } template <> - static constexpr auto GetWmma() + static constexpr auto GetWmma() { return WmmaInstr::wmma_f16_16x16x16_f16; } template <> - static constexpr auto GetWmma() + static constexpr auto GetWmma() { return WmmaInstr::wmma_bf16_16x16x16_bf16; } template <> - static constexpr auto GetWmma() + static constexpr auto GetWmma() { return WmmaInstr::wmma_i32_16x16x16_iu8; } #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 template <> - static constexpr auto GetWmma() + static constexpr auto GetWmma() { return WmmaInstr::wmma_i32_16x16x16_iu4; } #endif // get_warp_size do not return the correct wavesize, hardcode to 32 as workaround static constexpr auto selected_wmma = - wmma_type(), Number<32>{}>{}; + wmma_type(), Number<32>{}>{}; __host__ __device__ constexpr WmmaSelector() { @@ -344,7 +344,8 @@ struct WmmaSelector } }; -template {})); } - // Per-Pixel write - template - __host__ __device__ static constexpr auto - MakeCDesc_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup( - const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& - c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) - { - const auto MBlockxRepeat = - 
c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); - const auto NBlockxRepeat = - c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); - const auto MWave = - c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); - const auto NWave = - c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); - - return transform_tensor_descriptor( - c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma, - make_tuple( - make_pass_through_transform(MBlockxRepeat), - make_pass_through_transform(MWave), - make_unmerge_transform(make_tuple(Number{}, - Number{})), - make_pass_through_transform(NBlockxRepeat), - make_pass_through_transform(NWave), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<0>{}, - Sequence<1>{}, - Sequence<2, 3>{}, - Sequence<4>{}, - Sequence<5>{}, - Sequence<6>{})); - } - __device__ static constexpr index_t GetRegSizePerWmma() { return wmma_instr.num_acc_vgprs_per_wave; @@ -463,13 +424,13 @@ struct WmmaGemm __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { static_assert( - (is_same::value && is_same::value) || - (is_same::value && is_same::value) || - (is_same::value && is_same::value) || - (is_same::value && is_same::value) || - (is_same::value && is_same::value) + (is_same::value && is_same::value && is_same::value) || + (is_same::value && is_same::value && is_same::value) || + (is_same::value && is_same::value && is_same::value) || + (is_same::value && is_same::value && is_same::value) || + (is_same::value && is_same::value && is_same::value) #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 - || (is_same::value && is_same::value) + || (is_same::value && is_same::value && is_same::value) #endif , "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), 
" @@ -518,7 +479,7 @@ struct WmmaGemm return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; } - static constexpr auto wmma = WmmaSelector{}; + static constexpr auto wmma = WmmaSelector{}; static constexpr auto wmma_instr = wmma.selected_wmma; __host__ __device__ static constexpr auto From 2a0e5439e176fd5063c1c39fc1e14bd68e0f6796 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 15 Dec 2022 06:57:20 +0000 Subject: [PATCH 019/118] clean some debug purpose code --- example/01_gemm/run_gemm_example.inc | 2 +- .../threadwise_tensor_slice_transfer.hpp | 29 ++---------- .../threadwise_tensor_slice_transfer_v3r1.hpp | 2 +- include/ck/utility/common_header.hpp | 47 ------------------- include/ck/utility/data_type.hpp | 5 -- .../include/ck/library/utility/check_err.hpp | 8 ++-- library/include/ck/library/utility/fill.hpp | 18 ------- 7 files changed, 9 insertions(+), 102 deletions(-) diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 4d735ebbf22..91027f72d03 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -101,7 +101,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) return true; } float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); - + std::size_t flop = 2_uz * M * N * K; std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 84800da0c93..be4c63ab0e6 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -119,29 +119,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 using SpaceFillingCurve = SpaceFillingCurve>; - // printf("SpaceFillingCurve access_lengths = (%d, 
%d, %d, %d, %d, %d, %d)\n", SpaceFillingCurve::access_lengths[Number<0>{}].value, - // SpaceFillingCurve::access_lengths[Number<1>{}].value, - // SpaceFillingCurve::access_lengths[Number<2>{}].value, - // SpaceFillingCurve::access_lengths[Number<3>{}].value, - // SpaceFillingCurve::access_lengths[Number<4>{}].value, - // SpaceFillingCurve::access_lengths[Number<5>{}].value, - // SpaceFillingCurve::access_lengths[Number<6>{}].value); -// - // // printf("SpaceFillingCurve dim_access_order = (%d, %d, %d, %d, %d, %d, %d)\n", SpaceFillingCurve::dim_access_order[Number<0>{}].value, - // SpaceFillingCurve::dim_access_order[Number<1>{}].value, - // SpaceFillingCurve::dim_access_order[Number<2>{}].value, - // SpaceFillingCurve::dim_access_order[Number<3>{}].value, - // SpaceFillingCurve::dim_access_order[Number<4>{}].value, - // SpaceFillingCurve::dim_access_order[Number<5>{}].value, - // SpaceFillingCurve::dim_access_order[Number<6>{}].value); -// - // // // printf("SpaceFillingCurve ordered_access_lengths = (%d, %d, %d, %d, %d, %d, %d)\n", SpaceFillingCurve::ordered_access_lengths[Number<0>{}].value, - // SpaceFillingCurve::ordered_access_lengths[Number<1>{}].value, - // SpaceFillingCurve::ordered_access_lengths[Number<2>{}].value, - // SpaceFillingCurve::ordered_access_lengths[Number<3>{}].value, - // SpaceFillingCurve::ordered_access_lengths[Number<4>{}].value, - // SpaceFillingCurve::ordered_access_lengths[Number<5>{}].value, - // SpaceFillingCurve::ordered_access_lengths[Number<6>{}].value); + // TODO: Use SpaceFillingCurve::ScalarsPerAccess instread of DstScalarPerVector? 
static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); @@ -158,7 +136,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 static_for<0, DstScalarPerVector, 1>{}([&](auto i) { constexpr index_t src_offset = src_desc.CalculateOffset( src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); - // debug_hexprinter(0xffffffff, src_offset, "src_coord_iteration"); + SrcData v; // apply element-wise operation @@ -176,11 +154,10 @@ struct ThreadwiseTensorSliceTransfer_v1r3 dst_coord_.GetOffset(), is_dst_valid, dst_vector.template AsType()[Number<0>{}]); - // debug_hexprinter(0xffffffff, dst_coord_.GetOffset(), "dst_coord_iteration"); + if constexpr(idx_1d.value != num_access - 1) { constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); - // printf("move forward = (%d, %d, %d, %d, %d, %d, %d)\n", forward_step[Number<0>{}], forward_step[Number<1>{}], forward_step[Number<2>{}], forward_step[Number<3>{}], forward_step[Number<4>{}], forward_step[Number<5>{}], forward_step[Number<6>{}]); move_tensor_coordinate( dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index cb289d339fe..bb28c194f4b 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -96,7 +96,6 @@ struct ThreadwiseTensorSliceTransfer_v3r1 src_element_op_(src_element_op), dst_element_op_(dst_element_op) { - // printf("global desc: %s\n", __PRETTY_FUNCTION__); } __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) @@ -128,6 +127,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 detail::lambda_scalar_per_access{}, Number{}); constexpr auto 
src_access_lengths = SliceLengths{} / src_scalar_per_access; + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; constexpr auto ordered_src_access_lengths = diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index f85ab7e76c6..1378bbe448e 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -49,50 +49,3 @@ #ifdef CK_USE_AMD_MFMA #include "ck/utility/amd_xdlops.hpp" #endif - -#include - -template -constexpr auto type_name() { - std::string_view name, prefix, suffix; -#ifdef __clang__ - name = __PRETTY_FUNCTION__; - prefix = "auto type_name() [T = "; - suffix = "]"; -#elif defined(__GNUC__) - name = __PRETTY_FUNCTION__; - prefix = "constexpr auto type_name() [with T = "; - suffix = "]"; -#elif defined(_MSC_VER) - name = __FUNCSIG__; - prefix = "auto __cdecl type_name<"; - suffix = ">(void)"; -#endif - name.remove_prefix(prefix.size()); - name.remove_suffix(suffix.size()); - return name; -} - -// Accepet int, float, and Number<> as input -template -__host__ __device__ -void debug_hexprinter(const uint32_t v_target, const T v_val, const char* info){ - if constexpr(std::is_same_v || std::is_same_v ) - { - const uint32_t v_dbg = *(reinterpret_cast(&v_val)); - if(v_dbg != v_target) - printf("%s@Thread: %d, Val: %08x != Target: %08x\n", info, ck::get_thread_local_1d_id(), v_dbg, v_target); - } - else if constexpr(std::is_same_v) - { - const uint16_t v_dbg = *(reinterpret_cast(&v_val)); - if(v_dbg != v_target) - printf("%s@Thread: %d, Val: %04x != Target: %08x\n", info, ck::get_thread_local_1d_id(), v_dbg, v_target); - } - else - { - const uint32_t v_dbg = *(reinterpret_cast(&(v_val.value))); - if(v_dbg != v_target) - printf("%s@Thread: %d, Val: %08x != Target: %08x\n", info, ck::get_thread_local_1d_id(), v_dbg, v_target); - } -} diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 9fc55423750..40ee8b617e2 100644 --- a/include/ck/utility/data_type.hpp 
+++ b/include/ck/utility/data_type.hpp @@ -942,11 +942,6 @@ using int8x16_t = typename vector_type::type; using int8x32_t = typename vector_type::type; using int8x64_t = typename vector_type::type; -#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 -// i4 -using int4x16_t = typename vector_type::type; -#endif - // Convert X to Y template __host__ __device__ constexpr Y type_convert(X x) diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index 011b2728f28..ad286400b39 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -55,7 +55,7 @@ check_err(const Range& out, { max_err = err > max_err ? err : max_err; err_count++; - if(err_count < 16384) + if(err_count < 5) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; @@ -103,7 +103,7 @@ check_err(const Range& out, { max_err = err > max_err ? err : max_err; err_count++; - if(err_count < 16384) + if(err_count < 5) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; @@ -150,7 +150,7 @@ check_err(const Range& out, { max_err = err > max_err ? err : max_err; err_count++; - if(err_count < 16384) + if(err_count < 5) { std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; @@ -202,7 +202,7 @@ check_err(const Range& out, { max_err = err > max_err ? 
err : max_err; err_count++; - if(err_count < 16384) + if(err_count < 5) { std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r << std::endl; diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp index 854b30b2c6c..54d58f362cc 100644 --- a/library/include/ck/library/utility/fill.hpp +++ b/library/include/ck/library/utility/fill.hpp @@ -114,23 +114,5 @@ struct FillConstant } }; -template -struct FillMNID -{ - T step_{0.1}; - int k_num_{32}; - int mn_num_{128}; - - template - void operator()(ForwardIter first, ForwardIter last) const - { - std::generate(first, last, [=, iter = 0]() mutable { - auto tmp = ((iter/k_num_) % mn_num_ ) * step_; - iter ++; - return tmp; - }); - } -}; - } // namespace utils } // namespace ck From 3941bd1f1507d52f623b82c0b77e0eb640d9b8c3 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 15 Dec 2022 06:59:57 +0000 Subject: [PATCH 020/118] discard some codes --- example/01_gemm/run_gemm_example.inc | 3 ++- .../gpu/thread/threadwise_tensor_slice_transfer.hpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 91027f72d03..4e2cedb52ad 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -100,8 +100,9 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) return true; } + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); - + std::size_t flop = 2_uz * M * N * K; std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index be4c63ab0e6..b0f453b025f 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ 
b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -158,6 +158,7 @@ struct ThreadwiseTensorSliceTransfer_v1r3 if constexpr(idx_1d.value != num_access - 1) { constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); } From cfb397b1879f255edfbfb7f734517c1f3ccac52a Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 15 Dec 2022 07:04:31 +0000 Subject: [PATCH 021/118] clang format --- library/include/ck/library/utility/check_err.hpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp index ad286400b39..a89d03d324f 100644 --- a/library/include/ck/library/utility/check_err.hpp +++ b/library/include/ck/library/utility/check_err.hpp @@ -65,7 +65,6 @@ check_err(const Range& out, } if(!res) { - std::cerr << "err count: " << err_count << std::endl; std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; @@ -113,7 +112,6 @@ check_err(const Range& out, } if(!res) { - std::cerr << "err count: " << err_count << std::endl; std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; @@ -160,7 +158,6 @@ check_err(const Range& out, } if(!res) { - std::cerr << "err count: " << err_count << std::endl; std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; } return res; @@ -212,7 +209,6 @@ check_err(const Range& out, } if(!res) { - std::cerr << "err count: " << err_count << std::endl; std::cerr << "max err: " << max_err << std::endl; } return res; From 5d5891b0510ad562c2af2719f6a6363e13ff7520 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 15 Dec 2022 07:47:22 +0000 Subject: [PATCH 022/118] clang format --- example/01_gemm/gemm_wmma_fp16.cpp | 1 - .../gpu/block/blockwise_gemm_wmma.hpp | 160 +++++++++--------- 
.../gpu/device/impl/device_gemm_wmma.hpp | 21 ++- .../gpu/grid/gridwise_gemm_wmma.hpp | 28 +-- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 33 +++- include/ck/utility/amd_inline_asm.hpp | 8 +- include/ck/utility/amd_wmma.hpp | 11 +- 7 files changed, 144 insertions(+), 118 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index e36ff630c42..2a6ceca76ff 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -30,7 +30,6 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>; // clang-format on - using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 84c639391b9..d7cf6c6173b 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -52,7 +52,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); - static constexpr auto wmma_gemm = WmmaGemm{}; + static constexpr auto wmma_gemm = + WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -141,7 +142,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } - + // Provide dimension size __host__ __device__ static constexpr auto 
GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() @@ -279,7 +280,6 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; }; - // block wise level pipe designed for inline asm template {}; + static constexpr auto wmma_gemm = + WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -512,7 +513,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr auto RepeatDiff = MRepeat - NRepeat; // Read all Mrepeat, Nrepeat - static_for<0, NRepeat, 1>{}([&](auto iN){ + static_for<0, NRepeat, 1>{}([&](auto iN) { b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, make_tuple(I0, Number{}, I0, I0, I0), b_block_buf, @@ -521,7 +522,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO b_thread_buf); }); - static_for<0, MRepeat, 1>{}([&](auto iM){ + static_for<0, MRepeat, 1>{}([&](auto iM) { a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, make_tuple(I0, Number{}, I0, I0, I0), a_block_buf, @@ -531,35 +532,36 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO }); // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat - static_for<0, RepeatDiff, 1>{}([&](auto iCut){ - static_for<0, NRepeat, 1>{}([&](auto iN){ - + static_for<0, RepeatDiff, 1>{}([&](auto iCut) { + static_for<0, NRepeat, 1>{}([&](auto iN) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK / A_K1, iCut, 0, 0, iK % A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; - constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 
0)); + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); - if constexpr( KPerBlock > WmmaK ){ + if constexpr(KPerBlock > WmmaK) + { // Read Consumed Next inner loop A a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), + make_tuple(Number{}, Number{}, I0, I0, I0), a_block_buf, a_thread_desc_, make_tuple(I0, Number{}, I0, I0, I0), @@ -567,55 +569,57 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO } }); - static_for{}([&](auto iWmmaK){ + static_for{}([&](auto iWmmaK) { // Stage 2: Run FIFO fashion loopover in Square - static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) { // Row Repeatation - static_for{}([&](auto iN){ + static_for{}([&](auto iN) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; + a_thread_buf[Number{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); + constexpr index_t c_offset = c_thread_desc_.CalculateOffset( + make_tuple(WmmaInnerloop + RepeatDiff, iN, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); + a_thread_vec.template AsType()(Number<0>{}), + 
b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); // Read Consumed Next inner loop A - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple( + Number{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); // Col Repeatation - static_for{}([&](auto iM){ + static_for{}([&](auto iM) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK / A_K1, iM, 0, 0, iK % A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK / B_K1, WmmaInnerloop, 0, 0, iK % B_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; @@ -624,96 +628,100 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); // Read Consumed Next inner loop B - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, Number{}, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + b_thread_buf); }); // Stage 1: Cut to Repeat Retangle to Square, assume 
MRepeat > NRepeat - static_for<0, RepeatDiff, 1>{}([&](auto iCut){ - static_for<0, NRepeat, 1>{}([&](auto iN){ + static_for<0, RepeatDiff, 1>{}([&](auto iCut) { + static_for<0, NRepeat, 1>{}([&](auto iN) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK / A_K1, iCut, 0, 0, iK % A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; - constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); - if constexpr( KPerBlock > WmmaK ){ - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number<(iWmmaK+WmmaK)/A_K1>{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); + if constexpr(KPerBlock > WmmaK) + { + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number<(iWmmaK + WmmaK) / A_K1>{}, Number{}, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, Number{}, I0, I0, I0), + a_thread_buf); } }); }); // Stage 2: Run FIFO fashion loopover in Square - static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop){ + static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) { // Row Repeatation - static_for{}([&](auto iN){ + static_for{}([&](auto iN) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { 
a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK / A_K1, WmmaInnerloop + RepeatDiff, 0, 0, iK % A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK / B_K1, iN, 0, 0, iK % B_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop+RepeatDiff, iN, 0)); + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop + RepeatDiff, iN, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); // Col Repeatation - static_for{}([&](auto iM){ + static_for{}([&](auto iM) { vector_type a_thread_vec; vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto iK) { a_thread_vec.template AsType()(iK) = a_thread_buf[Number{}]; + make_tuple(iK / A_K1, iM, 0, 0, iK % A_K1))>{}]; b_thread_vec.template AsType()(iK) = b_thread_buf[Number{}]; + make_tuple(iK / B_K1, WmmaInnerloop, 0, 0, iK % B_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; @@ -722,9 +730,9 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); s_nop(); wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); s_nop(); }); }); diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index e5773144ac0..dbcceac68f2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -196,7 +196,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, remove_reference_t, - remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, remove_reference_t, - true>; // Last Option is W/O - + true>; // Last Option is W/O + ave_time = launch_and_time_kernel(stream_config, kernel, dim3(grid_size), @@ -391,7 +395,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, remove_reference_t, - remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, AElementwiseOperation, BElementwiseOperation, CElementwiseOperation, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 7b930bd7986..d70c5180da3 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -218,7 +218,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto b_block_space_size_aligned = math::integer_least_multiple( b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); - return (a_block_space_size_aligned * sizeof(FloatA) + b_block_space_size_aligned * sizeof(FloatB)); + return (a_block_space_size_aligned * sizeof(FloatA) + + b_block_space_size_aligned * sizeof(FloatB)); } // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} @@ -305,19 +306,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma remove_cvref_t; template - __device__ static void - Run(const FloatA* __restrict__ p_a_grid, - 
const FloatB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, - void* __restrict__ p_shared, - const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, - const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& - c_grid_desc_mblock_mperblock_nblock_nperblock, - const AElementwiseOperation& a_element_op, - const BElementwiseOperation& b_element_op, - const CElementwiseOperation& c_element_op, - const Block2CTileMap& block_2_ctile_map) + __device__ static void Run(const FloatA* __restrict__ p_a_grid, + const FloatB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) { // clang-format off /*******************************************************************************/ diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index a2685e659bc..0672bf8e5b2 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -283,10 +283,18 @@ struct wmma_type +template struct WmmaSelector { - template + template static constexpr auto GetWmma(); template <> @@ -424,13 +432,19 @@ struct WmmaGemm __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const { static_assert( - (is_same::value && is_same::value && is_same::value) || - (is_same::value && is_same::value && is_same::value) || - (is_same::value && is_same::value && is_same::value) || - (is_same::value && is_same::value && is_same::value) || - (is_same::value && is_same::value && is_same::value) + (is_same::value 
&& is_same::value && + is_same::value) || + (is_same::value && is_same::value && + is_same::value) || + (is_same::value && is_same::value && + is_same::value) || + (is_same::value && is_same::value && + is_same::value) || + (is_same::value && is_same::value && + is_same::value) #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 - || (is_same::value && is_same::value && is_same::value) + || (is_same::value && is_same::value && + is_same::value) #endif , "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), " @@ -479,7 +493,8 @@ struct WmmaGemm return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; } - static constexpr auto wmma = WmmaSelector{}; + static constexpr auto wmma = + WmmaSelector{}; static constexpr auto wmma_instr = wmma.selected_wmma; __host__ __device__ static constexpr auto diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp index 6f98f7924b4..4fc0be1fbd5 100644 --- a/include/ck/utility/amd_inline_asm.hpp +++ b/include/ck/utility/amd_inline_asm.hpp @@ -356,13 +356,9 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, } // Ranged input operand -__device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, - half16_t b, - float8_t& c) +__device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, half16_t b, float8_t& c) { - asm volatile("v_wmma_f32_16x16x16_f16 %0, %1, %2, %0" - : "=v"(c) - : "v"(a), "v"(b), "0"(c)); + asm volatile("v_wmma_f32_16x16x16_f16 %0, %1, %2, %0" : "=v"(c) : "v"(a), "v"(b), "0"(c)); } } // namespace ck diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index bf1d2a27d53..a0e79220e05 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -21,10 +21,13 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> template __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) { - // * Inline assembly need to elimate the duplicated data load, 
compiler won't help you delete them. - amd_assembly_wmma_f32_16x16x16_f16_w32(reg_a, reg_b, reg_c.template AsType()(Number<0>{})); - // reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( - // reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + // * Inline assembly need to elimate the duplicated data load, compiler won't help you + // delete them. + amd_assembly_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()(Number<0>{})); + // reg_c.template AsType()(Number<0>{}) = + // __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( reg_a, reg_b, reg_c.template + // AsType()[Number<0>{}]); } }; From 8efd363fa3d4402ebb63c9adc335ced4ae6b807f Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 11 Jan 2023 06:29:53 +0000 Subject: [PATCH 023/118] compiler issue fixed + increase tile size --- example/01_gemm/gemm_wmma_fp16.cpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 47 ++++++++++++++----- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 2a6ceca76ff..48bcca257a3 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle // ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave| _MBlock_MWaveMPerWmma| ScalarPerVector| // ######| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| // ######| | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | | | | | - < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>; + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 256, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, 1>; // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index d7cf6c6173b..d75f37d7b39 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -143,6 +143,29 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle "wrong!"); } + // Thread level, register decriptor. 
Vector-write + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + MSubGroup, + Number{}, + I1, + NThreadPerSubGroup, + MAccVgprs)); + } + // Provide dimension size __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() @@ -550,12 +573,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - s_nop(); + // s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); + // s_nop(); }); if constexpr(KPerBlock > WmmaK) { @@ -590,12 +613,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset( make_tuple(WmmaInnerloop + RepeatDiff, iN, 0)); - s_nop(); + // s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); + // s_nop(); }); // Read Consumed Next inner loop A @@ -626,12 +649,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iM, 
WmmaInnerloop, 0)); - s_nop(); + // s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); + // s_nop(); }); // Read Consumed Next inner loop B b_thread_copy_.Run( @@ -662,12 +685,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - s_nop(); + // s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); + // s_nop(); }); if constexpr(KPerBlock > WmmaK) { @@ -702,12 +725,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop + RepeatDiff, iN, 0)); - s_nop(); + // s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); + // s_nop(); }); // Col Repeatation @@ -728,12 +751,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); - s_nop(); + // s_nop(); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); - s_nop(); + // s_nop(); }); }); } From ccb94cea2da6d0daf22d0bd22083fe5ea9a13dcd Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 13 Jan 2023 07:51:15 +0000 Subject: [PATCH 024/118] navi3x_multipleD+example --- example/02_gemm_bilinear/CMakeLists.txt | 1 + .../gemm_bilinear_wmma_fp16.cpp | 304 +++++++ .../device_gemm_multiple_d_wmma_cshuffle.hpp | 654 +++++++++++++++ ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 742 ++++++++++++++++++ 4 files changed, 1701 insertions(+) 
create mode 100644 example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/example/02_gemm_bilinear/CMakeLists.txt index 10ec0f1a711..425029c0f6b 100644 --- a/example/02_gemm_bilinear/CMakeLists.txt +++ b/example/02_gemm_bilinear/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp) +add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp new file mode 100644 index 00000000000..422739f1202 --- /dev/null +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +struct AlphaBetaAdd +{ + AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const float& c, const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * c + beta_ * ck::type_convert(d)); + }; + + float alpha_; + float beta_; +}; + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using DDataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using DLayout = Row; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AlphaBetaAdd; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using DeviceOpInstance = + ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffle, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmSpec, + 
256, + 128, + 256, + 8, + 8, + 16, + 16, + 4, + 4, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD = 4096; + ck::index_t StrideE = 4096; + + float alpha = 1.0f; + float beta = 1.0f; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 6) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + alpha = std::stof(argv[4]); + beta = std::stof(argv[5]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideD = std::stoi(argv[9]); + StrideE = std::stoi(argv[10]); + + alpha = std::stof(argv[11]); + beta = std::stof(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, " + "beta\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, 
col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d_device_buf.ToDevice(d_m_n.mData.data()); + e_device_buf.ToDevice(e_m_n_device_result.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{alpha, beta}; + + // do GEMM + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + 
M, + N, + K, + StrideA, + StrideB, + std::array{StrideD}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + if(do_verification) + { + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n)); + } + } + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp new file mode 100644 index 00000000000..66c4de7f05c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD +{ + using DeviceOp = DeviceGemmMultipleD_Wmma_CShuffle; + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + // K1 = Max Vector Access Pixels + static constexpr auto K1Number = Number{}; + + static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } +#ifdef ENABLE_COLMAJOR + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } +#endif + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return 
transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + template + static auto MakeEGridDescriptor_M_N(index_t M, index_t N, index_t StrideE) + { + const auto e_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideE, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideE)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + e_grid_desc_m_n, + 
make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + + return transform_tensor_descriptor( + e_grid_desc_m_n, + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + } + + static auto MakeDsGridDescriptor_M_N(const std::array& Ms, + const std::array& Ns, + const std::array& DsStride) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(Ms[i], Ns[i], DsStride[i]); + }, + Number{}); + } + + // Gridwise descriptor, mapping to whole given provblem. + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); + + // GridwiseOp + using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< + // DataType Family + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + // InMemory Data Descriptor + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + DsGridDesc_M_N, + EGridDesc_M_N, + // ElementwiseOp Family + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + // Tiling Family + MPerBlock, + NPerBlock, + K0PerBlock, + MPerWMMA, + NPerWMMA, + K1, + MRepeat, + NRepeat, + // ThreadCluster Family + BlockSize, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + 
BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, + NumPrefetch, + LoopSched, + PipelineVer>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{}, + ds_grid_desc_mblock_mperblock_nblock_nperblock{}, + e_grid_desc_mblock_mperblock_nblock_nperblock{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(M, K, StrideA); + b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(K, N, StrideB); + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = + DeviceOp::MakeEGridDescriptor_M_N(M, N, StrideDs[i]); + }); + e_grid_desc_m_n_ = DeviceOp::MakeEGridDescriptor_M_N(M, N, StrideE); + + block_2_ctile_map_ = GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01); + + 
if(GridwiseOp::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_ctile_map_)) + { + ds_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + + e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + } + } + + // Pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseOp::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // Tensor Descriptors + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock; + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock; + + // Block to Tile mapping + typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map_; + + // Idle + index_t M01_; + index_t N01_; + + // ElementwiseOp + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) + 
<< ", " << arg.c_grid_desc_m_n_.GetLength(I1) << ", " + << arg.c_grid_desc_m_n_.GetLength(I2) << "}" << std::endl; + } +#endif + + if(!GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_k0mk1_k0nk1_m0nm1_wmma_v1r1 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_mupltipe_d_wmma_cshuffle< + GridwiseOp, + ADataType, + BDataType, + typename GridwiseOp::DsGridPointer, + EDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + remove_reference_t< + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + remove_reference_t, + true>; // Last Option is W/O + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_mupltipe_d_wmma_cshuffle< + GridwiseOp, + ADataType, + BDataType, + typename GridwiseOp::DsGridPointer, + EDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + remove_reference_t< + typename 
GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + remove_reference_t, + false>; + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx1100") + { + if constexpr(!(is_same_v || is_same_v)) + { + return false; + } + } + else + { + return false; + } + + return GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + M, + N, + K, + StrideA, + StrideB, + StrideDs, + StrideE, + 1, + 1, + a_element_op, + b_element_op, + cde_element_op}; + } + + // polymorphic + 
std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + M, + N, + K, + StrideA, + StrideB, + StrideDs, + StrideE, + 1, + 1, + a_element_op, + b_element_op, + cde_element_op); + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceGemmMultipleD_Wmma_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << MPerWMMA << ", " + << NPerWMMA << ", " + << MRepeat << ", " + << NRepeat + << ">" + << " NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp new file mode 100644 index 00000000000..2eff4c9745c --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -0,0 +1,742 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_mupltipe_d_wmma_cshuffle( + const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + + GridwiseOp::template Run(p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + 
e_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + cde_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx1100__)) +} + +template < // DataType Family + typename ADataType, + typename BDataType, + typename AccDataType, + typename CShuffleDataType, + typename DsDataType, + typename EDataType, + // InMemory Data Descriptor + typename AGridDesc_K0_M_K1, + typename BGridDesc_K0_N_K1, + typename DsGridDesc_M_N, + typename EGridDesc_M_N, + // ElementwiseOp Family + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CDEElementwiseOperation, + InMemoryDataOperationEnum EGlobalMemoryDataOperation, + // Tiling Family + index_t MPerBlock, + index_t NPerBlock, + index_t K0PerBlock, + index_t MPerWmma, + index_t NPerWmma, + index_t K1Value, + index_t MRepeat, + index_t NRepeat, + // ThreadCluster Family + index_t BlockSize, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + index_t ABlockTransferSrcVectorDim, + index_t ABlockTransferSrcScalarPerVector, + index_t ABlockTransferDstScalarPerVector_K1, + bool AThreadTransferSrcResetCoordinateAfterRun, + bool ABlockLdsExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + index_t BBlockTransferSrcVectorDim, + index_t BBlockTransferSrcScalarPerVector, + index_t BBlockTransferDstScalarPerVector_K1, + bool BThreadTransferSrcResetCoordinateAfterRun, + bool BBlockLdsExtraN, + index_t 
CShuffleMRepeatPerShuffle, + index_t CShuffleNRepeatPerShuffle, + typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock, + index_t NumGemmKPrefetchStage = 1, + LoopScheduler LoopSched = make_default_loop_scheduler(), + PipelineVersion PipelineVer = PipelineVersion::v1> +struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return a_block_desc_k0perblock_mperblock_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return 
make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return b_block_desc_k0perblock_nperblock_k1; + } + + __host__ __device__ static constexpr auto + // *Caution Here repeat is shuffle repeat + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + { + constexpr index_t MWave = MPerBlock / (MRepeat * MPerWmma); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerWmma); + + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; + } + + // ck::Tuple + static constexpr auto MakeDsGridPointer() + { + return generate_tuple( + [&](auto i) { + using DDataType = remove_cvref_t>; + + return static_cast(nullptr); + }, + Number{}); + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_k0perblock_mperblock_k1 = + GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto b_block_desc_k0perblock_nperblock_k1 = + GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size_aligned * sizeof(ADataType) + + b_block_space_size_aligned * sizeof(BDataType)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const DsGridDesc_M_N& 
ds_grid_desc_m_n, + const EGridDesc_M_N& e_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerWmma)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + bool valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + valid = valid && (M == ds_grid_desc_m_n[i].GetLength(I0) && + N == ds_grid_desc_m_n[i].GetLength(I1)); + }); + + if(!valid) + { + return false; + } + + if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(e_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / (K0PerBlock * K1); + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + // E desc for destination in blockwise copy + __host__ __device__ static constexpr auto + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N& e_grid_desc_m_n) + { + const auto M = e_grid_desc_m_n.GetLength(I0); + const auto N = e_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto 
e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + e_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return e_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // Ds desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N& ds_grid_desc_m_n) + { + return generate_tuple( + [&](auto i) { + return MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]); + }, + Number{}); + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const EGridDesc_M_N& e_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + e_grid_desc_m_n); + } + + using DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using DefaultBlock2CTileMap = + remove_cvref_t; + using DsGridPointer = decltype(MakeDsGridPointer()); + + template + __device__ static void Run(const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + DsGridPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + e_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const Block2CTileMap& block_2_ctile_map) + { + // clang-format off 
+/*******************************************************************************/ +// Memory buffer zone. + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + const auto ds_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_ds_grid[i], + ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize()); + }, + Number{}); + auto e_grid_buf = make_dynamic_buffer( + p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + +/*******************************************************************************/ +// BlockIdx.x -> [BlockId.m, BlockId.n] + const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { return; } + + // Store BlockId into SGPR + const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + +/*******************************************************************************/ +// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + constexpr auto max_lds_align = K1; + constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, +/* typename SrcElementwiseOperation, */ AElementwiseOperation, +/* typename DstElementwiseOperation, 
*/ ck::tensor_operation::element_wise::PassThrough, +/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, +/* typename BlockSliceLengths, */ Sequence, +/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, +/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ ADataType, +/* typename DstData, */ ADataType, +/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), +/* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), +/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, +/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, +/* index_t DstScalarPerVector, */ ABlockTransferDstScalarPerVector_K1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( + a_grid_desc_k0_m_k1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_k0perblock_mperblock_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BDataType, + BDataType, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc_k0perblock_nperblock_k1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_k0perblock_nperblock_k1, 
+ make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + +/*******************************************************************************/ + // GEMM + constexpr auto WmmaK = 16; + constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + + auto blockwise_gemm = + BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; + + // Prepare Register for C matrix + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + +/*******************************************************************************/ + constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); + // LDS allocation for A and B: be careful of alignment + auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + + // Shift Per SUB_K + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + + // gridwise GEMM pipeline + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + a_block_desc_k0perblock_mperblock_k1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_k0_n_k1, + b_block_desc_k0perblock_nperblock_k1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + K0BlockMainLoop); +/*******************************************************************************/ + // write out to C, implement shuffle + { + constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + 
blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + // This API Provide All dimension (size) you need + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = + blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); + constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); + constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I4); + constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I5); + constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I6); + + // LDS descriptor, shuffle and write out in MRepeat x NRepeat times + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); + + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // MRepeat per shuffle repeat + MWave, // MWave + MSubGroup, // MSubGroup * MAccVgprs = MPerWmma + MAccVgprs)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // NRepeat per shuffle repeat + NWave, // NWave + NThreadPerSubGroup))), // NThreadPerSubGroup = 
NPerWmma + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0, 1, 2, 6>{}, Sequence<>{}, Sequence<3, 4, 5>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + 1, // vector write pixel + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + make_multi_index(0, + m_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 0, + n_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // tuple of 
reference to C/Ds tensor descriptors + const auto c_ds_desc_refs = concat_tuple_of_reference( + tie(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; }, + Number{})); + + // tuple of reference to C/Ds tensor buffers + const auto c_ds_buf_refs = concat_tuple_of_reference( + tie(c_shuffle_block_buf), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return ds_grid_buf[i]; }, + Number{})); + + // tuple of starting index of C/Ds blockwise copy + const auto idx_c_ds_block_begin = container_concat( + make_tuple(make_multi_index(0, 0, 0, 0)), + generate_tuple( + [&](auto) { + return make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0); + }, + Number{})); + + // shuffle: blockwise copy C from LDS to global + auto cde_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v7< + ThisThreadBlock, // ThreadGroup + decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), + Tuple, + decltype(c_ds_desc_refs), + decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)), + CDEElementwiseOperation, // ElementwiseOperation, + Sequence(EGlobalMemoryDataOperation)>, // DstInMemOp, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths, + CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + sequence_merge_t< + Sequence, + uniform_sequence_gen_t>, // bool ThreadTransferSrcResetCoordinateAfterRun, + Sequence> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_ds_desc_refs, + idx_c_ds_block_begin, + 
tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + make_tuple(make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0)), + cde_element_op}; + + // space filling curve for local reg & global memory + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_cde_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_cde_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + cde_shuffle_block_copy_lds_to_global.Run( + c_ds_desc_refs, + c_ds_buf_refs, + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + tie(e_grid_buf)); + + if constexpr(access_id < num_access - 1) + { + constexpr auto cde_global_step = sfc_cde_global.GetForwardStep(access_id); + // move on Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + cde_shuffle_block_copy_lds_to_global.MoveSrcSliceWindow( + c_ds_desc_refs, i + I1, cde_global_step); + }); + + // move on E + cde_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + I0, + cde_global_step); + } + }); + } + // clang-format on + } +}; + +} // namespace ck 
From 2963dd9604b42b4ebfeb3319cd2349e54e0cb2cd Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 16 Jan 2023 11:35:32 +0000 Subject: [PATCH 025/118] temp save --- .../CMakeLists.txt | 1 + .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 439 +++++++ ...d_contraction_multiple_d_wmma_cshuffle.hpp | 1061 +++++++++++++++++ ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 92 ++ 4 files changed, 1593 insertions(+) create mode 100644 example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp diff --git a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt index 40470f27d42..ac54aebdc21 100644 --- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt +++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp new file mode 100644 index 00000000000..d508d4483c5 --- /dev/null +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 1; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceOpInstanceKKNN = + ck::tensor_operation::device::DeviceBatchedContractionMultipleD_Wmma_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple, + EDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmSpec, + ABSpec, + ABSpec, + DESpec, + 256, + 128, + 256, + 8, + 8, + 16, + 16, + 4, + 4, + S<4, 64, 
1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +// hardcoded for NumDimG == NumDimM == NumDimN == 2 and NumDimK == 1 +template = + false> +struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_gs_ms_ks_{a_gs_ms_ks}, + b_gs_ns_ks_{b_gs_ns_ks}, + e_gs_ms_ns_{e_gs_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_gs_ms_ks_; + const Tensor& b_gs_ns_ks_; + Tensor& e_gs_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_G2_M2_N2_K1::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) { + const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, + ck::type_convert(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0))); + arg.b_element_op_( + v_b, + ck::type_convert(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_gs_ms_ns_.mDesc.GetLengths()[0], + arg.e_gs_ms_ns_.mDesc.GetLengths()[1], + arg.e_gs_ms_ns_.mDesc.GetLengths()[2], + 
arg.e_gs_ms_ns_.mDesc.GetLengths()[3], + arg.e_gs_ms_ns_.mDesc.GetLengths()[4], + arg.e_gs_ms_ns_.mDesc.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{ + a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_G2_M2_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::index_t G0 = 1; + ck::index_t G1 = 1; + + ck::index_t M0 = 1; + ck::index_t M1 = 1; + + ck::index_t N0 = 1; + ck::index_t N1 = 1; + + ck::index_t K0 = 1; + + // A[G0, G1, M0, M1, K0] + std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1}; + // B[G0, G1, N0, N1, K0] + std::vector b_gs_ns_ks_lengths{G0, G1, N0, N1, K0}; + std::vector b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1}; + + // D[G0, G1, M0, N0, M1, N1] + std::vector d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, 
N1}; + std::vector d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1}; + // E[G0, G1, M0, N0, M1, N1] + std::vector e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector e_gs_ms_ns_strides{ + G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1}; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + std::cout<<"CP -4 "< a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + std::cout<<"CP -3 "<{-5, 5}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + std::cout<<"CP -2 "<{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + std::cout<<"CP 1 "<(e_gs_ms_ns_lengths.begin(), NumDimG, 1, std::multiplies<>{}); + + ck::index_t M = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() 
+ NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * G * M * N * K; + std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N + + sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0) + { + for(size_t g1 = 0; g1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++g1) + { + for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m0) + { + for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++m1) + { + for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n0) + { + for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; + ++n1) + { + cde_element_op(e_gs_ms_ns_host_result(g0, g1, m0, m1, n0, n1), + c_ms_ns_host_result(g0, g1, m0, m1, n0, n1), + d_gs_ms_ns(g0, g1, m0, m1, n0, n1)); + } + } + } + } + } + } + + return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 
0 : 1; + } + + return 0; +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp new file mode 100644 index 00000000000..1c1dfae6a53 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -0,0 +1,1061 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] +// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] +// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] + +// NOTE: TensorSpecialization::Packed specialized tensor is "packed" in a sense that each inner +// dimension in a dimension group (eg [G0, G1] in Gs, [M0, M1, M2] in Ms, etc.) are contiguous and +// ordered. Not in a sense that the tensor [G0, G1, ..., M0, M1, ..., N0, N1...] 
can be permuted +// while still being a contiguous, unpadded tensor. In other words, it merely degenerates into +// TensorSpecialization::Default with NumDimG/M/N/K = 1 +// +// Detail- Packed tensor satisfies +// stride_0 = 1 +// stride_i = stride_{i - 1} * extent_{i - 1} +// So tensor +// [G0, G1, G2, M, N] +// transposed into tensor +// [G0, G2, G1, M, N] +// with strides +// [G2 * G1 * M * N, G1 * M * N, M * N, N, 1] +// is again a packed tensor. MakeGridDescriptor() currently just merges dimensions and ignores some +// strides from input tensor extents so finer dimension information is lost. Merging dimensions is +// essentially a degenerated case of TensorSpecialization::Default with NumDimG/M/N/K = 1. +// +// Might need to expose dimension order to the interface to fully support +// TensorSpecialization::Packed in a traditional sense of "packed" tensor +template +struct DeviceBatchedContractionMultipleD_Wmma_CShuffle + : public DeviceBatchedContractionMultipleD +{ + using DeviceOp = DeviceBatchedContractionMultipleD_Wmma_CShuffle; + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + // K1 = Max Vector Access Pixels + static constexpr auto K1Number = Number{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, K0PerBlock* K1}; + + // Assume: A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] 
+ static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + assert(a_gs_ms_ks_lengths_vec.size() == NumDimG + NumDimM + NumDimK && + a_gs_ms_ks_strides_vec.size() == NumDimG + NumDimM + NumDimK); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto a_ms_ks_lengths = to_tuple( + a_gs_ms_ks_lengths_vec, Number{}, Number{}); + const auto a_ms_ks_strides = to_tuple( + a_gs_ms_ks_strides_vec, Number{}, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); + + if constexpr(ASpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor( + make_tuple(M, K), + make_tuple(a_ms_ks_strides[Number{}], + a_ms_ks_strides[Number{}])); + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + else + { + // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] + const auto a_grid_desc_ms_ks = + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); + + // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] 
+ const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( + a_grid_desc_ms_ks, + make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), + make_tuple(mDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + } + + // Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] + static auto MakeBGridDescriptor_N_K(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) + { + assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK && + b_gs_ns_ks_strides_vec.size() == NumDimG + NumDimN + NumDimK); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto b_ns_ks_lengths = to_tuple( + b_gs_ns_ks_lengths_vec, Number{}, Number{}); + const auto b_ns_ks_strides = to_tuple( + b_gs_ns_ks_strides_vec, Number{}, Number{}); + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen<0, NumDimN, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(b_ns_ks_lengths, kDimIds); + + // lengths for N0, N1, ... + const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); + + if constexpr(BSpec == TensorSpecialization::Packed) + { + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor( + make_tuple(N, K), + make_tuple(b_ns_ks_strides[Number{}], + b_ns_ks_strides[Number{}])); + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + else + { + // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] 
+ const auto b_grid_desc_ns_ks = + make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); + + // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] + const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( + b_grid_desc_ns_ks, + make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), + make_tuple(nDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + } + + // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + static auto MakeEGridDescriptor_M_N(const std::vector& e_gs_ms_ns_lengths_vec, + const std::vector& e_gs_ms_ns_strides_vec) + { + assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto e_ms_ns_lengths = to_tuple( + e_gs_ms_ns_lengths_vec, Number{}, Number{}); + const auto e_ms_ns_strides = to_tuple( + e_gs_ms_ns_strides_vec, Number{}, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_ms_ns_lengths, mDimIds); + + // lengths for N0, N1, ... 
+ const auto nLengths = get_container_subset(e_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(M, N), + make_tuple(e_ms_ns_strides[Number{}], + e_ms_ns_strides[Number{}])); + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + else + { + // naive tensor E[M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_ms_ns = + make_naive_tensor_descriptor(e_ms_ns_lengths, e_ms_ns_strides); + + // transformed tensor E[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...] + const auto e_grid_desc_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + } + + // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + static auto MakeEGridDescriptor_G_M_N(const std::vector& e_gs_ms_ns_lengths_vec, + const std::vector& e_gs_ms_ns_strides_vec) + { + assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto e_gs_ms_ns_lengths = + to_tuple(e_gs_ms_ns_lengths_vec, Number<0>{}, Number{}); + const auto e_gs_ms_ns_strides = + to_tuple(e_gs_ms_ns_strides_vec, Number<0>{}, Number{}); + + // dimension Ids for G0, G1, ... + constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{}; + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = + typename arithmetic_sequence_gen::type{}; + + // dimension Ids for N0, N1, ... 
+ constexpr auto nDimIds = typename arithmetic_sequence_gen::type{}; + + // lengths for G0, G1, ... + const auto gLengths = get_container_subset(e_gs_ms_ns_lengths, gDimIds); + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_gs_ms_ns_lengths, mDimIds); + + // lengths for N0, N1, ... + const auto nLengths = get_container_subset(e_gs_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto G = container_reduce(gLengths, math::multiplies{}, Number<1>{}); + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_g_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(G, M, N), + make_tuple(e_gs_ms_ns_strides[Number{}], + e_gs_ms_ns_strides[Number{}], + e_gs_ms_ns_strides[Number{}])); + // return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw); + return e_grid_desc_g_mraw_nraw; + } + else + { + // naive tensor E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_gs_ms_ns = + make_naive_tensor_descriptor(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + // transformed tensor E[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * + // N2 * ...] 
+ const auto e_grid_desc_g_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_gs_ms_ns, + make_tuple(make_merge_transform(gLengths), + make_merge_transform(mLengths), + make_merge_transform(nLengths)), + make_tuple(gDimIds, mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw); + return e_grid_desc_g_mraw_nraw; + } + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_gs_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_gs_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths_vec[i], + ds_gs_ms_ns_strides_vec[i]); + }, + Number{}); + } + + static auto MakeDsGridDescriptor_G_M_N( + const std::array, NumDTensor>& ds_gs_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_gs_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_G_M_N(ds_gs_ms_ns_lengths_vec[i], + ds_gs_ms_ns_strides_vec[i]); + }, + Number{}); + } + + // Gridwise descriptor, mapping to the whole given problem. 
+ using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + using DsGridDesc_G_M_N = remove_cvref_t; + using EGridDesc_G_M_N = decltype(MakeEGridDescriptor_G_M_N({}, {})); + + // A desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_K0_M_K1(const AGridDesc_M_K& a_grid_desc_m_k) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / K1; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // B desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeBGridDescriptor_K0_N_K1(const BGridDesc_N_K& b_grid_desc_n_k) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t batch_stride_A, + index_t batch_stride_B, + DsGridDesc_G_M_N ds_grid_desc_g_m_n, + EGridDesc_G_M_N e_grid_desc_g_m_n) + : batch_stride_A_(batch_stride_A), + batch_stride_B_(batch_stride_B), + ds_grid_desc_g_m_n_(ds_grid_desc_g_m_n), + e_grid_desc_g_m_n_(e_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * batch_stride_A_; + } + + __host__ __device__ constexpr long_index_t 
GetBPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * batch_stride_B_; + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + std::array ds_offset; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_offset[i] = static_cast(g_idx) * + ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(1, 0, 0)); + }); + + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * + e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(1, 0, 0)); + } + + private: + index_t batch_stride_A_; + index_t batch_stride_B_; + DsGridDesc_G_M_N ds_grid_desc_g_m_n_; + EGridDesc_G_M_N e_grid_desc_g_m_n_; + }; + + using AGridDesc_K0_M_K1 = decltype(DeviceOp::MakeAGridDescriptor_K0_M_K1(AGridDesc_M_K{})); + using BGridDesc_K0_N_K1 = decltype(DeviceOp::MakeBGridDescriptor_K0_N_K1(BGridDesc_N_K{})); + + // GridwiseOp + using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< + // DataType Family + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + // InMemory Data Descriptor + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + DsGridDesc_M_N, + EGridDesc_M_N, + // ElementwiseOp Family + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + // Tiling Family + MPerBlock, + NPerBlock, + K0PerBlock, + MPerWMMA, + NPerWMMA, + K1, + MRepeat, + NRepeat, + // ThreadCluster Family + BlockSize, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + 
BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, + NumPrefetch, + LoopSched, + PipelineVer>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& b_gs_ns_ks_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_strides, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_m_k_{}, + b_grid_desc_n_k_{}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{}, + ds_grid_desc_g_m_n_{ + DeviceOp::MakeDsGridDescriptor_G_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides)}, + e_grid_desc_g_m_n_{ + DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + ds_grid_desc_mblock_mperblock_nblock_nperblock{}, + e_grid_desc_mblock_mperblock_nblock_nperblock{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_mz_stride_{}, + a_kz_stride_{}, + b_nz_stride_{}, + b_kz_stride_{}, + ds_nz_stride_{}, + e_nz_stride_{}, + a_batch_stride_{a_gs_ms_ks_strides[NumDimG - 1]}, + b_batch_stride_{b_gs_ns_ks_strides[NumDimG - 1]}, + compute_ptr_offset_of_batch_{ + a_batch_stride_, 
b_batch_stride_, ds_grid_desc_g_m_n_, e_grid_desc_g_m_n_} + { + // raw [M, K] / [N, K] descriptors built from the user-provided lengths and strides + a_grid_desc_m_k_ = + DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + // the B descriptor must be stored in b_grid_desc_n_k_: it is the input of + // b_grid_desc_k0_n_k1_ below and must not overwrite the A descriptor + b_grid_desc_n_k_ = + DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(a_grid_desc_m_k_); + b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_grid_desc_n_k_); + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths[i], + ds_gs_ms_ns_strides[i]); + }); + e_grid_desc_m_n_ = + DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + block_2_ctile_map_ = GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01); + + // the blocked Ds/E descriptors are only built when the gridwise op accepts the problem; + // otherwise they stay default-constructed + if(GridwiseOp::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_ctile_map_)) + { + ds_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + + e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + } + + // for sanity check of vector memory access + a_mz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM - 1]; + a_kz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]; + b_nz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN - 1]; + b_kz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]; + + for(index_t i = 0; i < NumDTensor; ++i) + { + ds_nz_stride_[i] = ds_gs_ms_ns_strides[i][NumDimG + NumDimM + NumDimN - 1]; + } + + // stride of the last M dimension of E; this member was previously left uninitialized + e_mz_stride_ = e_gs_ms_ns_strides[NumDimG + NumDimM - 1]; + e_nz_stride_ = e_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]; + } + + // Pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseOp::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // 
Tensor Descriptors + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + DsGridDesc_G_M_N ds_grid_desc_g_m_n_; + EGridDesc_G_M_N e_grid_desc_g_m_n_; + + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock; + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock; + + // Block to Tile mapping + typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map_; + + // Idle + index_t M01_; + index_t N01_; + + // ElementwiseOp + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // Strides for the last M/N/K dimensions of A/B/Ds/E + // for sanity check of vector load/store + index_t a_mz_stride_; + index_t a_kz_stride_; + index_t b_nz_stride_; + index_t b_kz_stride_; + std::array ds_nz_stride_; + index_t e_mz_stride_; + index_t e_nz_stride_; + + index_t a_batch_stride_; + index_t b_batch_stride_; + + // Batch Offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) + << ", " << 
arg.c_grid_desc_m_n_.GetLength(I1) << ", " + << arg.c_grid_desc_m_n_.GetLength(I2) << "}" << std::endl; + } +#endif + + // refuse to launch when GridwiseOp::CheckValidity rejects the descriptors + if(!GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle has invalid setting"); + } + + // number of batches, taken from the merged G dimension of E + const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0); + + // grid size of one batch, as reported by the tile map, times the number of batches + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; + + // total K = K0 * K1 + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_contraction_multiple_d_wmma_cshuffle< + GridwiseOp, + ADataType, + BDataType, + typename GridwiseOp::DsGridPointer, + EDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + remove_reference_t< + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + remove_reference_t, + true>; // Last Option is W/O + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + G, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_contraction_multiple_d_wmma_cshuffle< + GridwiseOp, + ADataType, + BDataType, + typename GridwiseOp::DsGridPointer, + EDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + 
typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + remove_reference_t< + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + remove_reference_t, + false>; + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + G, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx1100") + { + if constexpr(!(is_same_v || is_same_v)) + { + return false; + } + } + else + { + return false; + } + + if(!GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + return false; + } + + // check vector access + static_assert((ABlockTransferSrcVectorDim == 1 || ABlockTransferSrcVectorDim == 2) && + (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), + "wrong!"); + + // vector memory access of A: could be on M or AK1 dimension + if constexpr(ABlockTransferSrcVectorDim == 1) + { + if(!(arg.a_mz_stride_ == 1 && + arg.a_grid_desc_k0_m_k1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + 
if(!(arg.a_kz_stride_ == 1 && + arg.a_grid_desc_k0_m_k1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of B: could be on N or BK1 dimension + if constexpr(BBlockTransferSrcVectorDim == 1) + { + if(!(arg.b_nz_stride_ == 1 && + arg.b_grid_desc_k0_n_k1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(arg.b_kz_stride_ == 1 && + arg.b_grid_desc_k0_n_k1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of Ds: always on NPerBlock dimension + bool valid_d_access = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + if(!(arg.ds_nz_stride_[i] == 1 && + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetLength(I3) % + CDEShuffleBlockTransferScalarPerVector_NPerBlock == + 0)) + { + valid_d_access = false; + } + }); + + if(valid_d_access == false) + { + return false; + } + + // vector memory access of E: always on NPerBlock dimension + if(!((arg.e_nz_stride_ == 1 && + arg.e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I3) % + CDEShuffleBlockTransferScalarPerVector_NPerBlock == + 0) || + CDEShuffleBlockTransferScalarPerVector_NPerBlock == 1)) + { + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto + MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation 
cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_gs_ms_ks_lengths, + b_gs_ns_ks_lengths, + ds_gs_ms_ns_lengths, + e_gs_ms_ns_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_strides, + ds_gs_ms_ns_strides, + e_gs_ms_ns_strides, + 1, + 1, + a_element_op, + b_element_op, + cde_element_op}; + } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_gs_ms_ks_lengths, + b_gs_ns_ks_lengths, + ds_gs_ms_ns_lengths, + e_gs_ms_ns_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_strides, + ds_gs_ms_ns_strides, + e_gs_ms_ns_strides, + 1, + 1, + a_element_op, + b_element_op, + cde_element_op); + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceBatchedContractionMultipleD_Wmma_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << MPerWMMA << ", " + << NPerWMMA << ", " + << MRepeat << ", " + << NRepeat + << ">" + << " 
NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 2eff4c9745c..33311dc8c3d 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -17,6 +17,98 @@ namespace ck { +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_contraction_multiple_d_wmma_cshuffle( + const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const index_t batch_count, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2CTileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const 
long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseOp::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + cde_element_op, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_etile_map; + ignore = compute_ptr_offset_of_batch; +#endif +} + template Date: Wed, 18 Jan 2023 06:00:08 +0000 Subject: [PATCH 026/118] workable --- .../CMakeLists.txt | 1 + .../common_wmma.hpp | 355 +++++++ ...ouped_conv_fwd_bias_relu_add_wmma_fp16.cpp | 26 + ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 286 ++++++ ...d_contraction_multiple_d_wmma_cshuffle.hpp | 2 +- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 870 ++++++++++++++++++ ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 101 +- 7 files changed, 
1637 insertions(+), 4 deletions(-) create mode 100644 example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt index 61b2b2f6f3a..c725dc8e8a1 100644 --- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt +++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt @@ -16,6 +16,7 @@ if(USE_BITINT_EXTENSION_INT4) add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4) endif() # USE_BITINT_EXTENSION_INT4 +add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp) add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp) diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp new file mode 100644 index 00000000000..201165775c4 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using BF16 = ck::bhalf_t; +using FP16 = ck::half_t; +using FP32 = float; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using I4 = ck::int4_t; +#endif +using I8 = std::int8_t; +using I32 = std::int32_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +struct CommonLayoutSetting +{ + using InputLayout = InputLay; + using WeightLayout = WeightLay; + using OutputLayout = OutputLay; +}; + +template +struct CommonLayoutSettingSelector; + +namespace ctl = ck::tensor_layout::convolution; + +template <> +struct CommonLayoutSettingSelector<1> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<2> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<3> final + : CommonLayoutSetting +{ +}; + +template +using InputLayout = typename CommonLayoutSettingSelector::InputLayout; + +template 
+using WeightLayout = typename CommonLayoutSettingSelector::WeightLayout; + +template +using OutputLayout = typename CommonLayoutSettingSelector::OutputLayout; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; +}; + +#define DefaultConvParam \ + ck::utils::conv::ConvParam \ + { \ + 2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \ + } + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool parse_cmd_args(int argc, + char* argv[], + ExecutionConfig& config, + ck::utils::conv::ConvParam& conv_param) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam arguments + else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + conv_param = ck::utils::conv::parse_conv_param( + num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + 
print_help_msg(); + return false; + } + + return true; +} + +inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.G_ * conv_param.C_ // wi + }); + + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1], + conv_param.input_spatial_lengths_[2]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * + conv_param.G_ * conv_param.C_, // di + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * 
conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k + 1, // c + conv_param.C_ // x + }); + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y + conv_param.C_ // x + }); + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1], + conv_param.filter_spatial_lengths_[2]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // z + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y + conv_param.C_ // x + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + 0, // k + 1, // c + 0 // x + }); + case 2: + return HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + 
case 3: + return HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // z + 0, // y + 0 // x + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.G_ * conv_param.K_ // wo + }); + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * + conv_param.G_ * conv_param.K_, // do + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} diff --git 
a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp new file mode 100644 index 00000000000..9d1d257a288 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common_wmma.hpp" + +// kernel data types +using InKernelDataType = FP16; +using WeiKernelDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using BiasKernelDataType = FP16; +using ResidualKernelDataType = FP16; +using OutKernelDataType = FP16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_wmma_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc new file mode 100644 index 00000000000..2297d247067 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +template +struct LayoutSetting +{ + using BiasLayout = BiasLay; + using ResidualLayout = ResidualLay; +}; + +template +struct LayoutSettingSelector; + +template <> +struct LayoutSettingSelector<1> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<2> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<3> final : LayoutSetting +{ +}; + +template +using BiasLayout = typename LayoutSettingSelector::BiasLayout; + +template +using ResidualLayout = typename LayoutSettingSelector::ResidualLayout; + +template +using DeviceConvFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle< + NDimSpatial, + InputLayout, + WeightLayout, + ck::Tuple, ResidualLayout>, + OutputLayout, + InKernelDataType, + WeiKernelDataType, + ck::Tuple, + OutKernelDataType, + AccDataType, + CShuffleDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 8, // K0PerBlock + 8, // K1 + 16, // MPerWMMA + 16, // NPerWMMA + 4, // MRepeat + 4, // NRepeat + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + true, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + true, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +template +using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +template +bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, + const 
ck::utils::conv::ConvParam& conv_param) +{ + static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial"); + + const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param); + const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param); + const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param); + const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(bias_g_n_k_wos_desc); + Tensor residual(bias_g_n_k_wos_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "residual: " << residual.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(OutKernelDataType) * bias.mDesc.GetElementSpaceSize()); + DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor in_converted(in); + const Tensor wei_converted(wei); + const Tensor bias_converted(bias); + const Tensor residual_converted(residual); + + 
in_device_buf.ToDevice(in_converted.mData.data()); + wei_device_buf.ToDevice(wei_converted.mData.data()); + bias_device_buf.ToDevice(bias_converted.mData.data()); + residual_device_buf.ToDevice(residual_converted.mData.data()); +#else + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + residual_device_buf.ToDevice(residual.mData.data()); +#endif + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array d0_g_n_k_wos_lengths{}; + std::array d0_g_n_k_wos_strides{}; + std::array d1_g_n_k_wos_lengths{}; + std::array d1_g_n_k_wos_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths); + copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides); + copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths); + copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = + 
conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{bias_device_buf.GetDeviceBuffer(), + residual_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 2>{ + {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}}, + std::array, 2>{ + {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(config.do_verification) + { + Tensor c_host(out_g_n_k_wos_desc); + + auto ref_conv = HostConvFwdInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + c_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + // TODO: implement elementwise operation for host + out_host.ForEach([&](auto&, auto idx) { + OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx)); + }); + + out_device_buf.FromDevice(out_device.mData.data()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor out_device_converted(out_device); + + 
return ck::utils::check_err( + out_device_converted, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); +#else + return ck::utils::check_err( + out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); +#endif + } + + return true; +} + +bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_param = DefaultConvParam; + + if(!parse_cmd_args(argc, argv, config, conv_param)) + { + return false; + } + + switch(conv_param.num_dim_spatial_) + { + case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); + case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); + case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); + } + + return false; +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 1c1dfae6a53..e627bb2d10f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -723,7 +723,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle arg.block_2_ctile_map_)) { throw std::runtime_error( - "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting"); + "wrong! 
GridwiseGemmMultipleD_wmma_cshuffle has invalid setting"); } const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp new file mode 100644 index 00000000000..d79c54fcc77 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -0,0 +1,870 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +template +struct ComputePtrOffsetOfStridedBatch +{ + ComputePtrOffsetOfStridedBatch() = default; + + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + Array BatchStrideDs, + index_t BatchStrideE) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideE_(BatchStrideE) + { + } + + __host__ __device__ constexpr long_index_t 
GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + Array ds_offset; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { ds_offset(i) = g_idx * static_cast(BatchStrideDs_[i]); }); + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE_); + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + Array BatchStrideDs_; + index_t BatchStrideE_; +}; + +} // namespace + +// +// @brief Device Convolution operation. +// +// Supports: +// @li Forward convolution with up to 3 spatial dimentions +// @li Input tensor in GNWC data format +// @li Weight tensor in GKXC data format +// @li Output tensor in GNWK data format +// +// 1D: +// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] +// 2D: +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +// 3D: +// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] +// Assume: +// AK1 == BK1 +template +struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle + : public DeviceGroupedConvFwdMultipleD +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD_Wmma_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr index_t KPerBlock = K0PerBlock * K1; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + template + static auto + MakeAGridDescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + 
const std::array& b_g_k_c_xs_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + return in_gemmm_gemmk_desc; + } + + template + static auto + MakeBGridDescriptor_N_K(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) + { + const auto wei_gemmnraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); + + const auto wei_gemmn_gemmk_desc = + matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + return wei_gemmn_gemmk_desc; + } + + template + static auto + MakeEGridDescriptor_M_N(const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides) + { + const auto out_gemmmraw_gemmnraw_desc = + conv_to_gemm_transformer.template MakeCDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides); + + const auto out_gemmm_gemmn_desc = + matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc); + + return out_gemmm_gemmn_desc; + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(ds_g_n_k_wos_lengths[i], + ds_g_n_k_wos_strides[i]); + }, + Number{}); + } + + // desc for problem definition + using AGridDesc_M_K = 
remove_cvref_t({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; + using BGridDesc_N_K = remove_cvref_t({}, {}))>; + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = remove_cvref_t({}, {}))>; + + // A desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK1 = K1; + const auto AK0 = K / AK1; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // B desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK1 = K1; + const auto BK0 = K / BK1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + using AGridDesc_AK0_M_AK1 = decltype(DeviceOp::MakeAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{})); + using BGridDesc_BK0_N_BK1 = decltype(DeviceOp::MakeBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{})); + + // GridwiseOp + using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< + // DataType Family + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + // InMemory Data Descriptor + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + DsGridDesc_M_N, + EGridDesc_M_N, + // ElementwiseOp Family + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + // Tiling Family 
+ MPerBlock, + NPerBlock, + K0PerBlock, + MPerWMMA, + NPerWMMA, + K1, + MRepeat, + NRepeat, + // ThreadCluster Family + BlockSize, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, + NumGemmKPrefetchStage, + LoopSched, + PipelineVer>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& + ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& + ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + index_t M01, + index_t N01, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + : p_a_grid_{static_cast(p_a)}, + p_b_grid_{static_cast(p_b)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e)}, + num_group_{a_g_n_c_wis_lengths[0]}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + 
b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides)}, + a_grid_desc_ak0_m_ak1_{ + DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01)}, + compute_ptr_offset_of_batch_{}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths}, + a_g_n_c_wis_strides_{a_g_n_c_wis_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + ds_g_n_k_wos_lengths_{ds_g_n_k_wos_lengths}, + ds_g_n_k_wos_strides_{ds_g_n_k_wos_strides}, + e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths}, + e_g_n_k_wos_strides_{e_g_n_k_wos_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // A/B/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + // using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds[i]); + + // D batch stride + compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0]; + }); + + // D desc + 
ds_grid_desc_m_n_ = DeviceOp::MakeDsGridDescriptor_M_N(ds_g_n_k_wos_lengths, ds_g_n_k_wos_strides); + + // populate desc for Ds/E + if(GridwiseOp::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + // e_grid_desc_mblock_mperblock_nblock_nperblock_ = + // GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + // e_grid_desc_m_n_); + + // ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + // GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + // ds_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseOp::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + index_t num_group_; + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + typename GridwiseOp::DefaultBlock2CTileMap block_2_etile_map_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + 
CDEElementwiseOperation cde_element_op_; + + // for checking IsSupportedArgument() + std::array a_g_n_c_wis_lengths_; + std::array a_g_n_c_wis_strides_; + std::array b_g_k_c_xs_lengths_; + std::array b_g_k_c_xs_strides_; + std::array, NumDTensor> ds_g_n_k_wos_lengths_; + std::array, NumDTensor> ds_g_n_k_wos_strides_; + std::array e_g_n_k_wos_lengths_; + std::array e_g_n_k_wos_strides_; + std::array conv_filter_strides_; + std::array conv_filter_dilations_; + std::array input_left_pads_; + std::array input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + if(!GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemmMultipleD_wmma_cshuffle has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.num_group_; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_grouped_conv_fwd_multiple_d_wmma_cshuffle< + GridwiseOp, + ADataType, + BDataType, + typename GridwiseOp::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + remove_reference_t, + ComputePtrOffsetOfStridedBatch, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + 
dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_, + arg.compute_ptr_offset_of_batch_); + }; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + namespace ctc = tensor_layout::convolution; + + // check device + if(get_device_name() == "gfx1100") + { + if constexpr(!(is_same_v || is_same_v)) + { + return false; + } + } + else + { + return false; + } + + // check ConvolutionForwardSpecialization + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t ConvStride = arg.conv_filter_strides_[i]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0)) + { + return false; + } + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && LeftPad == 0 && RightPad == 0)) + { + return 
false; + } + } + } + + // check vector access of A + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t C = arg.a_g_n_c_wis_lengths_[2]; + + if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of B + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + + { + const index_t C = arg.b_g_k_c_xs_lengths_[2]; + + if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of Ds + bool valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; + + if(!(K % CDEShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + valid = false; + } + } + else + { + valid = false; + } + }); + + if(!valid) + { + return false; + } + + // check vector access of E + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.e_g_n_k_wos_lengths_[2]; + + if(!(K % CDEShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + } + else + { + return false; + } + + // check Gridwise GEMM + return GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return 
IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& 
cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + a_element_op, + b_element_op, + cde_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 33311dc8c3d..630ae13f1ce 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -17,6 +17,99 @@ namespace ck { +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_conv_fwd_multiple_d_wmma_cshuffle( + const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, + const 
DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const Block2CTileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) + // offset base pointer for each work-group + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseOp::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + a_element_op, + b_element_op, + cde_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = 
ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; +#endif +} + template __host__ __device__ static constexpr auto - MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N& e_grid_desc_m_n) + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N_& e_grid_desc_m_n) { const auto M = e_grid_desc_m_n.GetLength(I0); const auto N = e_grid_desc_m_n.GetLength(I1); @@ -426,9 +520,10 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle } // Ds desc for source in blockwise copy + template __host__ __device__ static constexpr auto - MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N& ds_grid_desc_m_n) - { + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N_& ds_grid_desc_m_n) + { return generate_tuple( [&](auto i) { return MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]); From abfc94b223715c8e5931a50f6775c25aea4d4663 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 18 Jan 2023 10:48:01 +0000 Subject: [PATCH 027/118] batchedgemm[OK], groupconv[debug] --- .../gemm_bilinear_wmma_fp16.cpp | 2 +- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 28 +- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 251 +++++++----------- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 14 +- 4 files changed, 111 insertions(+), 184 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 422739f1202..ff99bf46411 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -112,7 +112,7 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = false; + bool time_kernel = true; // 
GEMM shape ck::index_t M = 3840; diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index d508d4483c5..2a2e8899d10 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -56,7 +56,7 @@ using DeviceOpInstanceKKNN = NumDimK, ADataType, BDataType, - ck::Tuple, + DsDataType, EDataType, AccDataType, CShuffleDataType, @@ -239,18 +239,18 @@ int main(int argc, char* argv[]) { bool do_verification = true; int init_method = 1; - bool time_kernel = false; + bool time_kernel = true; ck::index_t G0 = 1; - ck::index_t G1 = 1; + ck::index_t G1 = 2; - ck::index_t M0 = 1; - ck::index_t M1 = 1; + ck::index_t M0 = 4; + ck::index_t M1 = 128; - ck::index_t N0 = 1; - ck::index_t N1 = 1; + ck::index_t N0 = 16; + ck::index_t N1 = 256; - ck::index_t K0 = 1; + ck::index_t K0 = 2048; // A[G0, G1, M0, M1, K0] std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; @@ -284,13 +284,11 @@ int main(int argc, char* argv[]) printf("arg3: time kernel (0=no, 1=yes)\n"); exit(0); } - std::cout<<"CP -4 "< a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); - std::cout<<"CP -3 "<{-0.5, 0.5}); break; } - std::cout<<"CP -2 "<(e_gs_ms_ns_lengths.begin(), NumDimG, 1, std::multiplies<>{}); @@ -371,7 +363,7 @@ int main(int argc, char* argv[]) ck::index_t K = ck::accumulate_n( a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{}); - + std::cout<<"GMNK="<; using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + using DsGridDesc_G_M_N = remove_cvref_t; using 
EGridDesc_G_M_N = decltype(MakeEGridDescriptor_G_M_N({}, {})); - // A desc for source in blockwise copy - template - __host__ __device__ static constexpr auto - MakeAGridDescriptor_K0_M_K1(const AGridDesc_M_K& a_grid_desc_m_k) - { - const auto M = a_grid_desc_m_k.GetLength(I0); - const auto K = a_grid_desc_m_k.GetLength(I1); - - const auto AK0 = K / K1; - - return transform_tensor_descriptor( - a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - - // B desc for source in blockwise copy - template - __host__ __device__ static constexpr auto - MakeBGridDescriptor_K0_N_K1(const BGridDesc_N_K& b_grid_desc_n_k) - { - const auto N = b_grid_desc_n_k.GetLength(I0); - const auto K = b_grid_desc_n_k.GetLength(I1); - - const auto BK0 = K / K1; - - return transform_tensor_descriptor( - b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - struct ComputePtrOffsetOfStridedBatch { ComputePtrOffsetOfStridedBatch(index_t batch_stride_A, @@ -482,6 +449,40 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle EGridDesc_G_M_N e_grid_desc_g_m_n_; }; + // A desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_K0_M_K1(const AGridDesc_M_K& a_grid_desc_m_k) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / K1; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // B desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + 
MakeBGridDescriptor_K0_N_K1(const BGridDesc_N_K& b_grid_desc_n_k) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + using AGridDesc_K0_M_K1 = decltype(DeviceOp::MakeAGridDescriptor_K0_M_K1(AGridDesc_M_K{})); using BGridDesc_K0_N_K1 = decltype(DeviceOp::MakeBGridDescriptor_K0_N_K1(BGridDesc_N_K{})); @@ -592,41 +593,34 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle compute_ptr_offset_of_batch_{ a_batch_stride_, b_batch_stride_, ds_grid_desc_g_m_n_, e_grid_desc_g_m_n_} { - a_grid_desc_m_k_ = - DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); - a_grid_desc_m_k_ = - DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); - a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(a_grid_desc_m_k_); - b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_grid_desc_n_k_); static_for<0, NumDTensor, 1>{}([&](auto i) { using DDataType = remove_cvref_t>; // D pointer p_ds_grid_(i) = static_cast(p_ds_grid[i]); - - // D desc - ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths[i], - ds_gs_ms_ns_strides[i]); }); - e_grid_desc_m_n_ = - DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + a_grid_desc_m_k_ = + DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + b_grid_desc_n_k_ = + DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + + ds_grid_desc_m_n_ = DeviceOp::MakeDsGridDescriptor_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides); + + e_grid_desc_m_n_ = DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + a_grid_desc_k0_m_k1_ = 
DeviceOp::MakeAGridDescriptor_K0_M_K1(a_grid_desc_m_k_); + b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_grid_desc_n_k_); block_2_ctile_map_ = GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01); - if(GridwiseOp::CheckValidity(a_grid_desc_k0_m_k1_, - b_grid_desc_k0_n_k1_, - ds_grid_desc_m_n_, - e_grid_desc_m_n_, - block_2_ctile_map_)) - { - ds_grid_desc_mblock_mperblock_nblock_nperblock = - GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - ds_grid_desc_m_n_); + ds_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); - e_grid_desc_mblock_mperblock_nblock_nperblock = - GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - e_grid_desc_m_n_); - } + e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); // for sanity check of vector memory access a_mz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM - 1]; @@ -700,128 +694,61 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { -#if 0 - { - std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) - << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " - << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; - - std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) - << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " - << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; - - std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) - << ", " << arg.c_grid_desc_m_n_.GetLength(I1) << ", " - << arg.c_grid_desc_m_n_.GetLength(I2) << "}" << std::endl; - } -#endif - - if(!GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_, - arg.b_grid_desc_k0_n_k1_, - arg.ds_grid_desc_m_n_, - arg.e_grid_desc_m_n_, - 
arg.block_2_ctile_map_)) - { - throw std::runtime_error( - "wrong! GridwiseGemmMultipleD_wmma_cshuffle has invalid setting"); - } - const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0); - const index_t grid_size = - arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; + const index_t grid_size = arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; - const auto K = - arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + const auto K = arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); - float ave_time = 0; + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; - if(GridwiseOp::CalculateHasMainKBlockLoop(K)) - { const auto kernel = kernel_contraction_multiple_d_wmma_cshuffle< GridwiseOp, ADataType, BDataType, typename GridwiseOp::DsGridPointer, EDataType, - remove_reference_t, - remove_reference_t, - remove_reference_t< - typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, - remove_reference_t< - typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + DeviceOp::AGridDesc_K0_M_K1, + DeviceOp::BGridDesc_K0_N_K1, + typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, ComputePtrOffsetOfStridedBatch, - remove_reference_t, - true>; // Last Option is W/O - - ave_time = - launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_ds_grid_, - arg.p_e_grid_, - G, - arg.a_grid_desc_k0_m_k1_, - arg.b_grid_desc_k0_n_k1_, - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, - arg.e_grid_desc_mblock_mperblock_nblock_nperblock, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.compute_ptr_offset_of_batch_, - arg.block_2_ctile_map_); + 
typename GridwiseOp::DefaultBlock2CTileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + G, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_ctile_map_); + }; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); } else { - const auto kernel = kernel_contraction_multiple_d_wmma_cshuffle< - GridwiseOp, - ADataType, - BDataType, - typename GridwiseOp::DsGridPointer, - EDataType, - remove_reference_t, - remove_reference_t, - remove_reference_t< - typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, - remove_reference_t< - typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, - AElementwiseOperation, - BElementwiseOperation, - CDEElementwiseOperation, - ComputePtrOffsetOfStridedBatch, - remove_reference_t, - false>; - - ave_time = - launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_ds_grid_, - arg.p_e_grid_, - G, - arg.a_grid_desc_k0_m_k1_, - arg.b_grid_desc_k0_n_k1_, - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, - arg.e_grid_desc_mblock_mperblock_nblock_nperblock, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.compute_ptr_offset_of_batch_, - arg.block_2_ctile_map_); + return launch_kernel(integral_constant{}); } - - return ave_time; } // polymorphic diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 630ae13f1ce..c5ea67117e9 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -147,13 +147,15 @@ __global__ void const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) + //printf("entry kernel launch"); __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + //printf("before compute_ptr_offset call"); const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( @@ -163,14 +165,18 @@ __global__ void const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); - DsPointer p_ds_grid_grp; - static constexpr index_t NumDTensor = DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + DsPointer p_ds_grid_grp; + + //printf("before allocate pointer d"); static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + //printf("before entry"); + GridwiseOp::template Run(p_a_grid + a_batch_offset, p_b_grid + b_batch_offset, p_ds_grid_grp, @@ -564,6 +570,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle const CDEElementwiseOperation& cde_element_op, const Block2CTileMap& block_2_ctile_map) { + //printf("safe entry"); // clang-format off /*******************************************************************************/ // Memory buffer zone. 
@@ -709,6 +716,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle c_thread_buf, K0BlockMainLoop); /*******************************************************************************/ + //printf("safe 1"); // write out to C, implement shuffle { constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = From 9c3c435a0aeea6a807a9ac465237ad6717537426 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 18 Jan 2023 11:22:56 +0000 Subject: [PATCH 028/118] groupconv: Sanity check[OK], Performance[Bad] --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 2 +- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 30 ++++--------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 2297d247067..d59d1bc7025 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -54,7 +54,7 @@ using DeviceConvFwdInstance = 256, // BlockSize 128, // MPerBlock 256, // NPerBlock - 8, // K0PerBlock + 4, // K0PerBlock 8, // K1 16, // MPerWMMA 16, // NPerWMMA diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index d79c54fcc77..c4c05d03801 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -435,20 +435,12 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle ds_grid_desc_m_n_ = DeviceOp::MakeDsGridDescriptor_M_N(ds_g_n_k_wos_lengths, ds_g_n_k_wos_strides); // populate desc for Ds/E - 
if(GridwiseOp::CheckValidity(a_grid_desc_ak0_m_ak1_, - b_grid_desc_bk0_n_bk1_, - ds_grid_desc_m_n_, - e_grid_desc_m_n_, - block_2_etile_map_)) - { - // e_grid_desc_mblock_mperblock_nblock_nperblock_ = - // GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - // e_grid_desc_m_n_); - - // ds_grid_desc_mblock_mperblock_nblock_nperblock_ = - // GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - // ds_grid_desc_m_n_); - } + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseOp::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); } void Print() const @@ -520,16 +512,6 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle arg.Print(); } - if(!GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.ds_grid_desc_m_n_, - arg.e_grid_desc_m_n_, - arg.block_2_etile_map_)) - { - throw std::runtime_error( - "wrong! 
GridwiseGemmMultipleD_wmma_cshuffle has invalid setting"); - } - const index_t grid_size = arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.num_group_; From 0517cf084adf47c11f044399051f95c4fd5746a7 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 19 Jan 2023 07:04:16 +0000 Subject: [PATCH 029/118] navi3x_groupconv_need_optimization --- .../run_grouped_conv_fwd_bias_relu_add_wmma_example.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index d59d1bc7025..8161b1088ad 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -53,13 +53,13 @@ using DeviceConvFwdInstance = GemmSpec, // GemmSpecialization 256, // BlockSize 128, // MPerBlock - 256, // NPerBlock + 128, // NPerBlock 4, // K0PerBlock 8, // K1 16, // MPerWMMA 16, // NPerWMMA 4, // MRepeat - 4, // NRepeat + 2, // NRepeat S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder From 3ddd357898af31678acf4539ccc498106ea9d129 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 30 Jan 2023 10:15:07 +0000 Subject: [PATCH 030/118] create necessary files --- .../CMakeLists.txt | 4 + ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 160 +++ ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 855 +++++++++++++ ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 1131 +++++++++++++++++ 4 files changed, 2150 insertions(+) create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp 
create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 8d9aaec85a5..c253acbb67b 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -5,6 +5,7 @@ add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) add_custom_target(example_gemm_scale_softmax_gemm) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16) @@ -14,3 +15,6 @@ add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_soft add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) + +add_custom_target(example_gemm_scale_softmax_gemm_wmma) +add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp 
b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp new file mode 100644 index 00000000000..f091f456eff --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = 
PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + Acc0BiasDataType, + Acc1BiasDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization 
+ +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp new file mode 100644 index 00000000000..203be87b81e --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -0,0 +1,855 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle_v1.hpp" +#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const AccElementwiseOperation acc_element_op, + const B1ElementwiseOperation b1_element_op, + const CElementwiseOperation c_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap block_2_ctile_map, + const index_t batch_count, + const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch, + const C0MatrixMask c0_matrix_mask) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + const index_t 
num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetBBasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b1_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_ctile_map, + c0_matrix_mask); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = acc_element_op; + ignore = b1_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = b1_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_ctile_map; + ignore = batch_count; + ignore = compute_base_ptr_of_batch; + ignore = c0_matrix_mask; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// Computes C = A * B0 * B1 +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle + : public DeviceBatchedGemmSoftmaxGemmPermute +{ + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0, + "Number of dimension must 
be greater than 0"); + + static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); + static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size(); + + // TODO ANT: implement bias combination + static_assert(NumAcc0Bias == 0 && NumAcc1Bias == 0, "Bias addition is unimplemented"); + +#if 0 + // TODO ANT: use alias + static constexpr index_t NumDimGemm0M = NumDimM; + static constexpr index_t NumDimGemm0N = NumDimN; + static constexpr index_t NumDimGemm0K = NumDimK; + static constexpr index_t NumDimGemm1M = NumDimM; + static constexpr index_t NumDimGemm1N = NumDimO; + static constexpr index_t NumDimGemm1K = NumDimN; +#endif + + using DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< + Sequence, + Sequence, + GemmSpec, + ASpec, + BSpec, + B1Spec, + CSpec>; + + static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + return Transform::MakeAGridDescriptor_AK0_M_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}); + } + + static auto MakeBGridDescriptor_BK0_N_BK1(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) + { + return Transform::MakeB0GridDescriptor_BK0_N_BK1( + Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), + Number{}); + } + + static auto + MakeB1GridDescriptor_BK0_N_BK1(const std::vector& b1_gs_gemm1ns_gemm1ks_lengths_vec, + const std::vector& b1_gs_gemm1ns_gemm1ks_strides_vec) + { + return Transform::MakeB1GridDescriptor_BK0_N_BK1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, + b1_gs_gemm1ns_gemm1ks_strides_vec), + Number{}); + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1({}, {}));
+ using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1({}, {})); + using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1({}, {})); + using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); + using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); + using BGridDesc_G_N_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_K = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); + + constexpr static auto make_MaskOutPredicate() + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + return MaskDisabledPredicate{}; + } + else if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + return MaskOutUpperTrianglePredicate{}; + } + } + using C0MatrixMask = C0MatrixMask_impl; + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, + const BGridDesc_G_N_K& b_grid_desc_g_n_k, + const B1GridDesc_G_N_K& b1_grid_desc_g_n_k, + const CGridDesc_G_M_N& c_grid_desc_g_m_n) + : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), + b_grid_desc_g_n_k_(b_grid_desc_g_n_k), + b1_grid_desc_g_n_k_(b1_grid_desc_g_n_k), + c_grid_desc_g_m_n_(c_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return b_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return b1_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + 
private: + AGridDesc_G_M_K a_grid_desc_g_m_k_; + BGridDesc_G_N_K b_grid_desc_g_n_k_; + B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + }; + + // GridwiseGemm + using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + B1GridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + AK1, + BK1, + B1K1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + Gemm1NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + true, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + true, + BBlockLdsExtraN, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + false, + B1BlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched, + Transform::matrix_padder.PadN, + MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>; + + // Argument + // FIXME: constness + struct Argument : public BaseArgument + { + Argument( + const ADataType* 
p_a_grid, + const BDataType* p_b_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_ak0_m_ak1_{ + DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b_grid_desc_bk0_n_bk1_{ + DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, + b1_grid_desc_bk0_n_bk1_{DeviceOp::MakeB1GridDescriptor_BK0_N_BK1( + b1_gs_gemm1ns_gemm1ks_lengths, b1_gs_gemm1ns_gemm1ks_strides)}, + c_grid_desc_m_n_{Transform::MakeCGridDescriptor_M_N(c_gs_ms_gemm1ns_lengths, + c_gs_ms_gemm1ns_strides)}, + a_grid_desc_g_m_k_{ + Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b_grid_desc_g_n_k_{ + Transform::MakeB0GridDescriptor_G_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, + b1_grid_desc_g_n_k_{Transform::MakeB1GridDescriptor_G_N_K( + b1_gs_gemm1ns_gemm1ks_lengths, 
b1_gs_gemm1ns_gemm1ks_strides)}, + c_grid_desc_g_m_n_{Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_gemm1ns_lengths, + c_gs_ms_gemm1ns_strides)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op}, + c0_matrix_mask_{b_grid_desc_g_n_k_.GetLength(I1)}, + raw_lengths_mz_nz_kz_gemm1nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], + b_gs_ns_ks_lengths[NumDimG + NumDimN - 1], + b_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1], + b1_gs_gemm1ns_gemm1ks_lengths[NumDimG + NumDimO - 1]}, + a_mz_kz_strides_{a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, + b_nz_kz_strides_{b_gs_ns_ks_strides[NumDimG + NumDimN - 1], + b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]}, + b1_nz_kz_strides_{b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO - 1], + b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO + NumDimN - 1]}, + c_mz_gemm1nz_strides_{c_gs_ms_gemm1ns_strides[NumDimG + NumDimM - 1], + c_gs_ms_gemm1ns_strides[NumDimG + NumDimM + NumDimO - 1]}, + batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}, + compute_base_ptr_of_batch_{ + a_grid_desc_g_m_k_, b_grid_desc_g_n_k_, b1_grid_desc_g_n_k_, c_grid_desc_g_m_n_} + { + // TODO ANT: implement bias addition + ignore = p_acc0_biases; + ignore = p_acc1_biases; + ignore = acc0_biases_gs_ms_ns_lengths; + ignore = acc0_biases_gs_ms_ns_strides; + ignore = acc1_biases_gs_ms_gemm1ns_lengths; + ignore = acc1_biases_gs_ms_gemm1ns_strides; + + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + b1_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + void Print() const 
+ { + std::cout << "a_grid_desc_g_m_k_: " << a_grid_desc_g_m_k_.GetLength(I0) << ", " + << a_grid_desc_g_m_k_.GetLength(I1) << ", " + << a_grid_desc_g_m_k_.GetLength(I2) << '\n'; + std::cout << "b_grid_desc_g_n_k_: " << b_grid_desc_g_n_k_.GetLength(I0) << ", " + << b_grid_desc_g_n_k_.GetLength(I1) << ", " + << b_grid_desc_g_n_k_.GetLength(I2) << '\n'; + std::cout << "b1_grid_desc_g_n_k_: " << b1_grid_desc_g_n_k_.GetLength(I0) << ", " + << b1_grid_desc_g_n_k_.GetLength(I1) << ", " + << b1_grid_desc_g_n_k_.GetLength(I2) << '\n'; + std::cout << "c_grid_desc_g_m_n_: " << c_grid_desc_g_m_n_.GetLength(I0) << ", " + << c_grid_desc_g_m_n_.GetLength(I1) << ", " + << c_grid_desc_g_m_n_.GetLength(I2) << '\n'; + } + + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // tensor descriptor + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + AGridDesc_G_M_K a_grid_desc_g_m_k_; + BGridDesc_G_N_K b_grid_desc_g_n_k_; + B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-c-tile map + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + + // check C0 masking and padding + C0MatrixMask c0_matrix_mask_; + + // For robust IsSupportedArgument() check + std::vector raw_lengths_mz_nz_kz_gemm1nz_; + std::vector a_mz_kz_strides_; + std::vector b_nz_kz_strides_; + std::vector b1_nz_kz_strides_; + std::vector c_mz_gemm1nz_strides_; + + index_t batch_count_; + ComputeBasePtrOfStridedBatch 
compute_base_ptr_of_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!DeviceOp::IsSupportedArgument(arg)) + { + throw std::runtime_error("wrong! unsupported argument"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; + + // Gemm0_K + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distinguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::B1GridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + ComputeBasePtrOfStridedBatch, + C0MatrixMask, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_b1_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.b1_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_, + arg.batch_count_, + arg.compute_base_ptr_of_batch_, + arg.c0_matrix_mask_); + }; + + // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need + // to consider Gemm0's loop + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + 
ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { +#if DEBUG_LOG + arg.Print(); +#endif + + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + // TODO ANT: Check if tensor specialization & strides mismatch + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded + const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); + const index_t c_gemm1n = arg.c_grid_desc_m_n_.GetLength(I1); + const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); + const index_t b1_gemm1n = arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1); + + if(!(c_g == arg.batch_count_ && c_m == a_m && c_gemm1n == b1_gemm1n)) + { + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[0]; + const auto NzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[1]; + const auto KzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[2]; + const auto Gemm1NzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b_extent_lowest = BBlockTransferSrcVectorDim == 2 ? KzRaw : NzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? 
NzRaw : Gemm1NzRaw; + const auto c_extent_lowest = Gemm1NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; + const auto b_stride_lowest = + BBlockTransferSrcVectorDim == 2 ? arg.b_nz_kz_strides_[1] : arg.b_nz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_kz_strides_[1] : arg.b1_nz_kz_strides_[0]; + const auto c_stride_lowest = + arg.c_mz_gemm1nz_strides_[1]; // cshuffle assumes lowest dim in Gemm1Ns to be contiguous + + if(!(a_stride_lowest == 1 || b_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const ADataType* p_a, + const BDataType* p_b, + const B1DataType* p_b1, + CDataType* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, 
NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_b1, + p_c, + p_acc0_biases, + p_acc1_biases, + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + acc0_biases_gs_ms_ns_lengths, + acc0_biases_gs_ms_ns_strides, + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + // FIXME: constness + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const void* p_b1, + void* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> 
acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_b1), + static_cast(p_c), + p_acc0_biases, // cast in struct Argument + p_acc1_biases, // cast in struct Argument + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + acc0_biases_gs_ms_ns_lengths, + acc0_biases_gs_ms_ns_strides, + acc1_biases_gs_ms_gemm1ns_lengths, + acc1_biases_gs_ms_gemm1ns_strides, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << Gemm1NPerBlock << ", " + << Gemm1KPerBlock << ", " + << B1K1 << ", " + << getGemmSpecializationString(GemmSpec) << ", " + << "ASpec" << getTensorSpecializationString(ASpec) << ", " + << "B0Spec" << getTensorSpecializationString(BSpec) << ", " + << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " + << "CSpec" << getTensorSpecializationString(CSpec) << ", " + << 
getMaskingSpecializationString(MaskingSpec) << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp new file mode 100644 index 00000000000..0f6f160528a --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -0,0 +1,1131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_softmax.hpp" + +namespace ck { + +template +struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle +{ + static_assert(LoopSched == LoopScheduler::Default, + "Non-default loop scheduler is currently not supported"); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = 
Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + // Gemm0 + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + static constexpr auto Gemm0MWaves = MPerBlock / (MPerXdl * MXdlPerWave); + static constexpr auto Gemm0NWaves = NPerBlock / (NPerXdl * NXdlPerWave); + + // Gemm1 + static constexpr auto B1K0 = Number{}; + static constexpr auto B1K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + template + __host__ __device__ static constexpr auto + MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t Gemm1NWaves = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto 
GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B1 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(B1K0, Number{}, B1K1), + make_tuple(Number{} * B1K1, B1K1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + const index_t gemm0_bytes_end = (SharedMemTrait::a_block_space_size_aligned + + SharedMemTrait::b_block_space_size_aligned) * + sizeof(FloatAB); + const index_t gemm1_bytes_end = + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * + sizeof(FloatAB); + const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + + SharedMemTrait::reduction_space_size_aligned) * + sizeof(FloatGemmAcc); + const index_t c_block_bytes_end = + SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); + + return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1& 
b1_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + const auto Gemm1N = b1_grid_desc_bk0_n_bk1.GetLength(I1); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && Gemm1N == c_grid_desc_m_n.GetLength(I1))) + { + return false; + } + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0 && + Gemm1N % Gemm1NPerBlock == 0)) + { + return false; + } + + // check gemm0 gridwise gemm pipeline + const auto num_gemm0_k_loop = K / KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) + { + return false; + } + + // check gemm1 gridwise gemm pipeline + if(!(NPerBlock % Gemm1KPerBlock == 0)) + { + return false; + } + + const auto num_gemm1_k_inner_loop = NPerBlock / Gemm1KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / Gemm1NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, 
+ make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + static constexpr auto a_block_desc_ak0_m_ak1 = + GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + static constexpr auto b_block_desc_bk0_n_bk1 = + GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + static constexpr auto b1_block_desc_bk0_n_bk1 = + GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), B1K1); + + static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( + b1_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b_block_space_offset = a_block_space_size_aligned.value; + static constexpr auto b1_block_space_offset = 0; + + // LDS allocation for reduction + static constexpr index_t reduction_space_size_aligned = + math::integer_least_multiple(BlockSize, max_lds_align); + + static constexpr auto reduction_space_offset = 0; + + // LDS allocation for C shuffle in LDS 
+ static constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + static constexpr auto c_block_space_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + }; + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const AccElementwiseOperation& acc_element_op, + const B1ElementwiseOperation& b1_element_op, + const CElementwiseOperation& c_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap& block_2_ctile_map, + const C0MatrixMask& c0_matrix_mask) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + const auto b1_grid_buf = make_dynamic_buffer( + p_b1_grid, b1_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/gemm1_n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + 
__builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t gemm1_n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * Gemm1NPerBlock); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // + // set up Gemm0 + // + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), // will loop over GemmN dimension + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // Fused Gemm+Gemm 
pipeline + // for n in N0: + // for k in K0: + // acc[m][n] += A[m][k] * B0[k][n] + // acc1[m][o] += acc[m][n] * B1[n][o] + + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_v2< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + decltype(MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(a_block_desc_ak0_m_ak1)), + decltype(MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(b_block_desc_bk0_n_bk1)), + MPerBlock, + NPerBlock, + KPerBlock, + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + true>{}; // TransposeC + + auto acc_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a_block_space_offset, + a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + const auto a_block_reset_copy_step = + make_multi_index(-a_grid_desc_ak0_m_ak1.GetLength(I0), 0, 0); + const auto b_block_reset_copy_step = + make_multi_index(-b_grid_desc_bk0_n_bk1.GetLength(I0), NPerBlock, 0); + + // gridwise GEMM pipeline + // Only supports LoopScheduler::Default + const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + // + // set up Gemm1 + // + + // Acc matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type + constexpr auto acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + 
blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + constexpr auto m0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); + constexpr auto n0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); + constexpr auto m1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); + constexpr auto n1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); + constexpr auto m2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); + constexpr auto n2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); + constexpr auto n3 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); + constexpr auto n4 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + + constexpr auto b1_block_slice_copy_step = make_multi_index(Gemm1KPerBlock / B1K1, 0, 0); + + // acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 to acc_thread_desc_k0_m_k1 + // n0_n1_n2_n3 -> k0 + // m0_m1_m2 -> m + // n4 -> k1 + // NOTE: had to use merge_v3 or will spit out compilation errors + constexpr auto acc_thread_desc_k0_m_k1 = transform_tensor_descriptor( + acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(n0, n1, n2, n3)), + make_merge_transform_v3_division_mod(make_tuple(m0, m1, m2)), + make_pass_through_transform(n4)), + make_tuple(Sequence<1, 3, 5, 6>{}, Sequence<0, 2, 4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // A1 matrix in AccVGPR + // N2 num_groups_per_blk, N3 num_input_blks, N4 group_size + constexpr auto AccN3 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLength(I6); + + constexpr auto A1ThreadSlice_K0_M_K1 = + make_tuple(Number{}, Number{}, Number{}); + + constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0]; + constexpr auto A1ThreadSliceM = A1ThreadSlice_K0_M_K1[I1]; + constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2]; + constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor( + A1ThreadSlice_K0_M_K1, + make_tuple(A1ThreadSliceM * 
A1ThreadSliceK1, A1ThreadSliceK1, I1)); + + // B1 matrix in LDS memory, dst of blockwise copy + constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A1 matrix blockwise copy + auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< + FloatGemmAcc, + FloatAB, + decltype(acc_thread_desc_k0_m_k1), + decltype(a1_thread_desc_k0_m_k1), + tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<1, 0, 2>, + 2, + n4>{tensor_operation::element_wise::PassThrough{}}; + + // B1 matrix blockwise copy + auto b1_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b1_grid_desc_bk0_n_bk1), + decltype(b1_block_desc_bk0_n_bk1), + B1BlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + B1BlockTransferSrcVectorDim, + 2, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + 1, + 1, + B1ThreadTransferSrcResetCoordinateAfterRun, + true, // DstResetCoord + NumGemmKPrefetchStage>( + b1_grid_desc_bk0_n_bk1, + make_multi_index(0, gemm1_n_block_data_idx_on_grid, 0), + b1_element_op, + b1_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + auto a1_thread_buf = make_static_buffer( + a1_thread_desc_k0_m_k1.GetElementSpaceSize()); + + // reuse LDS space for gemm0's b_block_buf + auto b1_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b1_block_space_offset, + b1_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + // selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size + // selected_mfma.k_per_blk <= Gemm1KPack + // + // Following similar rationale behind Gemm0KPack, let Gemm1KPack be the lowest common + // multiples of A1K1 (predetermined by selected_mfma.group_size) and B1K1. 
But in this case + // Gemm1KPack can't be higher than A1K1 itself because A1 matrix is distributed in VGPRs + // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will + // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7]. + // therefore we may just as well assign Gemm1KPack = group_size + constexpr index_t Gemm1KPack = + MfmaSelector::selected_mfma.group_size; + + auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a1_thread_desc_k0_m_k1), + decltype(b1_block_desc_bk0_n_bk1), + decltype(MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(a1_thread_desc_k0_m_k1)), + decltype(MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(b1_block_desc_bk0_n_bk1)), + MPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + MPerXdl, + NPerXdl, + MXdlPerWave, + Gemm1NXdlPerWave, + Gemm1KPack, + true, // TransposeC + Gemm1KPack, // AMmaKStride + Gemm1KPack * XdlopsGemm{}.K0PerXdlops>{ + // BMmaKStride + make_tuple(0, 0, 0, 0)}; // A_origin + + auto acc1_thread_buf = gemm1_blockwise_gemm.GetCThreadBuffer(); + + // + // Blockwise softmax + // + auto workspace_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::reduction_space_offset, + SharedMemTrait::reduction_space_size_aligned); + + // get acc0 8D thread cluster + constexpr auto thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths() / + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + constexpr auto tm0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I0); + constexpr auto tn0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I1); + constexpr auto tm1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I2); + constexpr auto tn1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I3); + constexpr auto tm2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I4); + constexpr auto tn2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I5); + constexpr auto tn3 = 
thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I6); + constexpr auto tn4 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I7); + + // get acc0 thread map + constexpr auto m0_n_m1_to_m_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(tm0 * tm1, tm2)), + make_pass_through_transform(I1)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + constexpr auto threadid_to_m0_n_m1_adaptor = make_single_stage_tensor_adaptor( + make_tuple( + make_merge_transform(make_tuple(tm0 * tm1, tn0 * tn1 * tn2 * tn3 * tn4, tm2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + const auto threadid_to_m_n_thread_cluster_adaptor = + chain_tensor_adaptors(m0_n_m1_to_m_n_adaptor, threadid_to_m0_n_m1_adaptor); + + // get acc0 2D thread cluster & 2D thread slice + constexpr auto thread_cluster_desc_m_n = make_naive_tensor_descriptor_packed( + make_tuple(tm0 * tm1 * tm2, tn0 * tn1 * tn2 * tn3 * tn4)); + constexpr auto thread_slice_desc_m_n = + make_naive_tensor_descriptor_packed(make_tuple(m0 * m1 * m2, n0 * n1 * n2 * n3 * n4)); + + auto blockwise_softmax = BlockwiseSoftmax{}; + + const index_t num_gemm1_k_block_outer_loop = + b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock; + constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock; + + // Initialize C + StaticBuffer + c_thread_buf; + c_thread_buf.Clear(); + + // Initialize running sum and max of exponentiating row vectors + using SoftmaxBuf = typename decltype(blockwise_softmax)::BufferType; + SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; + running_sum = 0; + running_sum_new = 0; + running_max = NumericLimits::Lowest(); + running_max_new = NumericLimits::Lowest(); + + // gemm1 K loop + index_t gemm1_k_block_outer_index = 0; + do + { + auto n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock); + if(c0_matrix_mask.IsTileSkippable( + m_block_data_idx_on_grid, 
n_block_data_idx_on_grid, MPerBlock, NPerBlock)) + { + continue; + } + // gemm0 + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + acc_thread_buf, + num_k_block_main_loop); + + // do MNK padding or upper triangular masking + if constexpr(MaskOutUpperTriangle || PadN) + { + // 8d thread_desc in thread scope + constexpr auto c_thread_lengths = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + + // 8d block_desc in block scope + constexpr auto c_block_lengths = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + + constexpr auto M0 = c_block_lengths[I0]; + constexpr auto N0 = c_block_lengths[I1]; + constexpr auto M1 = c_block_lengths[I2]; + constexpr auto N1 = c_block_lengths[I3]; + constexpr auto M2 = c_block_lengths[I4]; + constexpr auto N2 = c_block_lengths[I5]; + constexpr auto N3 = c_block_lengths[I6]; + constexpr auto N4 = c_block_lengths[I7]; + + // works like multi-dimension static_for (static_ford), but provides both the linear + // index as well as n-d index + using Acc0TileIterator = SpaceFillingCurve< + decltype(c_thread_lengths), + typename arithmetic_sequence_gen<0, c_thread_lengths.Size(), 1>::type, + typename uniform_sequence_gen::type, + false>; // SnakeCurved + + auto acc0_thread_origin = blockwise_gemm.CalculateCThreadOriginDataIndex8D( + Number<0>{}, Number<0>{}, Number<0>{}, Number<0>{}); + + constexpr auto block_idx_to_m_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(M0, M1, M2)), + make_unmerge_transform(make_tuple(N0, N1, N2, N3, N4))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5, 6, 7>{})); + + static_for<0, 
Acc0TileIterator::GetNumOfAccess(), 1>{}([&](auto i) { + auto acc0_thread_idx = Acc0TileIterator::GetIndex(i) + acc0_thread_origin; + auto m_local = + block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; + auto n_local = + block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; + auto m_global = m_local + m_block_data_idx_on_grid; + auto n_global = n_local + n_block_data_idx_on_grid; + if(c0_matrix_mask.IsMaskedElement(m_global, n_global)) + { + acc_thread_buf(i) = -ck::NumericLimits::Infinity(); + } + else + { + acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); + } + }); + } + else + { + static_for<0, acc_thread_buf.Size(), 1>{}( + [&](auto i) { acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); }); + } + + block_sync_lds(); // wait for lds read in gemm0 blockwise gemm + + // softmax + SoftmaxBuf& max = blockwise_softmax.max_value_buf; + SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; + + blockwise_softmax.Run(acc_thread_buf, workspace_buf); + + // TODO: may convert to log domain + running_max_new = mathext::max(max, running_max); + running_sum_new = mathext::exp(running_max - running_max_new) * running_sum + + mathext::exp(max - running_max_new) * sum; + + // gemm1 + { + // TODO: explore using dynamic buffer for a1 thread buffer + // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(), + // RunWrite(), and MoveSliceWindow(). But it is impossible to implement given that + // the A1 source buffer is static buffer holding the output of first GEMM and + // requires constexpr offset by design. Therefore, we pass tensor coordinate offset + // explicitly in Run() below. 
+ + // Initialize acc1 + acc1_thread_buf.Clear(); + + // preload data into LDS + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + block_sync_lds(); // wait for reduction LDS read + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + + // main body + if constexpr(num_gemm1_k_block_inner_loop > 1) + { + static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) { + a1_blockwise_copy.Run(acc_thread_desc_k0_m_k1, + make_tuple(Number{}, I0, I0), + acc_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + block_sync_lds(); + + gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); + + block_sync_lds(); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + }); + } + // tail + { + a1_blockwise_copy.Run( + acc_thread_desc_k0_m_k1, + make_tuple( + Number<(num_gemm1_k_block_inner_loop - 1) * A1ThreadSliceK0>{}, I0, I0), + acc_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + block_sync_lds(); + + gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); + } + } // end gemm1 + + // workaround compiler issue; see ck/ck.hpp + if constexpr(CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE == 1 && + is_same_v && MPerBlock == 256 && NPerBlock == 128 && + Gemm1NPerBlock == 128) + { + __builtin_amdgcn_sched_barrier(0); + } + + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + constexpr auto cm0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); + constexpr auto cn0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); + constexpr auto cm1 = 
c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); + constexpr auto cn1 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); + constexpr auto cm2 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); + constexpr auto cn2 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); + constexpr auto cn3 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); + constexpr auto cn4 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + constexpr auto c_thread_slice_desc_m_n = make_naive_tensor_descriptor_packed( + make_tuple(cm0 * cm1 * cm2, cn0 * cn1 * cn2 * cn3 * cn4)); + constexpr auto c_thread_buf_slice_m = c_thread_slice_desc_m_n.GetLength(I0); + constexpr auto c_thread_buf_slice_n = c_thread_slice_desc_m_n.GetLength(I1); + + static_for<0, c_thread_buf_slice_m, 1>{}([&](auto iM) { + static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) { + auto I = Number{}; + FloatGemmAcc acc1 = acc1_thread_buf[I]; // P*V + FloatGemmAcc c = c_thread_buf[I]; // O + FloatGemmAcc c_new = + (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + + math::exp(max[iM] - running_max_new[iM]) * acc1) / + running_sum_new[iM]; // Formula by Dao et al., + // https://arxiv.org/pdf/2205.14135v2.pdf section 3.1 + + c_thread_buf(I) = c_new; // O_new + }); + }); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_ak0_m_ak1, + a_block_reset_copy_step); // rewind K + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_bk0_n_bk1, + b_block_reset_copy_step); // rewind K and step N + + // update before next j iteration + running_max = running_max_new; + running_sum = running_sum_new; + + block_sync_lds(); // wait for gemm1 LDS read + } while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); // end j loop + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + Gemm1NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = 
Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + // TODO: hacky, fix it! + // c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp = + gemm1_blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I4); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I5); + constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6); + constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2)), // M2 = MPerXdl + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2, // N2 * N3 * N4 = NPerXdl + N3, + N4))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4>{}, Sequence<>{}, 
Sequence<1, 3, 5, 6, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + gemm1_blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3, N4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + n_thread_data_on_block_idx[I2], + n_thread_data_on_block_idx[I3], + n_thread_data_on_block_idx[I4]), + tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + 
CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global 
+ c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } +}; + +} // namespace ck From a0a469e418589ad82c9cf5f6c83e3ef2a5559698 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 3 Feb 2023 10:06:37 +0000 Subject: [PATCH 031/118] save progress --- .../gpu/block/blockwise_gemm_wmma.hpp | 10 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 2 +- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 1109 ++++++++--------- .../gpu/grid/gridwise_gemm_wmma.hpp | 2 +- include/ck/utility/amd_wmma.hpp | 10 +- 5 files changed, 501 insertions(+), 632 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index d75f37d7b39..1cd5efd9bae 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -129,7 +129,11 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle return make_tuple(c_thread_m, c_thread_n); } - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle() + using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); // index-tuple type of the thread origins accepted by the constructor below + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle( + Tuple5 a_origin = CalculateAThreadOriginDataIndex(), + Tuple5 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin) { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && BK0NK1BlockDesc::IsKnownAtCompileTime(), @@ -299,8 +303,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle B_K1, B_K1>; - AThreadCopy
a_thread_copy_{CalculateAThreadOriginDataIndex()}; - BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; }; // block wise level pipe designed for inline asm diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 203be87b81e..6d09dd4bed2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -13,7 +13,7 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle_v1.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp" #include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 0f6f160528a..76fca261d69 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -18,72 +18,134 @@ namespace ck { -template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_wmma( + const FloatA* __restrict__ p_a_grid, + const FloatB* __restrict__ p_b0_grid, + FloatC* __restrict__ p_c_grid, + const 
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b0_grid_desc_k0_l_k1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + // const + // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup + // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b0_grid, + p_c_grid, + p_shared, + a_grid_desc_k0_m_k1, + b0_grid_desc_k0_l_k1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b0_grid; + ignore = p_c_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b0_grid_desc_k0_l_k1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx1100__)) +} + +// Gemm0: A [M x K] x B0 [K x L] = Acc [M x L] +// Gemm1: Acc [M x L] x B1 [L x N] = C [M x N] +template -struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle + index_t NumGemmKPrefetchStage = 1, + LoopScheduler LoopSched = make_default_loop_scheduler(), + PipelineVersion PipelineVer = PipelineVersion::v1> +struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle { - static_assert(LoopSched == LoopScheduler::Default, - "Non-default loop scheduler is currently not supported"); - static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; @@ -94,161 +156,127 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle static constexpr auto 
I7 = Number<7>{}; // K1 should be Number<...> - // Gemm0 - static constexpr auto AK0 = Number{}; - static constexpr auto BK0 = Number{}; - static constexpr auto AK1 = Number{}; - static constexpr auto BK1 = Number{}; - - static constexpr auto Gemm0MWaves = MPerBlock / (MPerXdl * MXdlPerWave); - static constexpr auto Gemm0NWaves = NPerBlock / (NPerXdl * NXdlPerWave); - - // Gemm1 - static constexpr auto B1K0 = Number{}; - static constexpr auto B1K1 = Number{}; + static constexpr auto K1 = Number{}; + static constexpr auto N1 = Number{}; using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = remove_cvref_t())>; + GridwiseGemmPipeline_Selector())>; - template - __host__ __device__ static constexpr auto - MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() { - constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr auto max_lds_align = K1; - return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( - ABlockDesc_AK0_M_AK1{}); - } - - template - __host__ __device__ static constexpr auto - MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) - { - constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); - - return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( - BBlockDesc_BK0_N_BK1{}); - } - - template - __host__ __device__ static constexpr auto - MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) - { - return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(ABlockDesc_AK0_M_AK1{}); - } + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); - template - __host__ __device__ static constexpr 
auto - MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) - { - constexpr index_t Gemm1NWaves = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); - return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( - BBlockDesc_BK0_N_BK1{}); + return a_block_desc_k0perblock_mperblock_k1; } - __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() { - // A matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(AK0, Number{}, AK1), - make_tuple(Number{} * AK1, AK1, I1)); - } + constexpr auto max_lds_align = K1; - __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() - { // B matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(BK0, Number{}, BK1), - make_tuple(Number{} * BK1, BK1, I1)); - } + constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); - __host__ __device__ static constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1() - { - // B1 matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(B1K0, Number{}, B1K1), - make_tuple(Number{} * B1K1, B1K1, I1)); + return b_block_desc_k0perblock_nperblock_k1; } __host__ __device__ static constexpr auto - GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + // *Caution Here repeat is shuffle repeat + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() { - constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); - constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + constexpr index_t MWave = 
MPerBlock / (MRepeat * MPerWmma); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerWmma); - constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = make_naive_tensor_descriptor_packed( make_tuple(I1, - Number{}, + Number{}, I1, - Number{})); + Number{})); - return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; } __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { - const index_t gemm0_bytes_end = (SharedMemTrait::a_block_space_size_aligned + - SharedMemTrait::b_block_space_size_aligned) * - sizeof(FloatAB); - const index_t gemm1_bytes_end = - (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * - sizeof(FloatAB); - const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + - SharedMemTrait::reduction_space_size_aligned) * - sizeof(FloatGemmAcc); - const index_t c_block_bytes_end = - SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); - - return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_k0perblock_mperblock_k1 = + GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto b_block_desc_k0perblock_nperblock_k1 = + GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size_aligned * sizeof(FloatA) + + b_block_space_size_aligned * sizeof(FloatB)); } // block_id to matrix tile idx (m0, n0) mapping are 
controlled by {M01, N01} template __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, - const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, - const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b0_grid_desc_k0_l_k1, const CGridDesc_M_N& c_grid_desc_m_n, const Block2CTileMap& block_2_ctile_map) { - static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && - (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerWmma)) == 0, "Invalid tuning param!"); - const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); - const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); - const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); - const auto Gemm1N = b1_grid_desc_bk0_n_bk1.GetLength(I1); + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b0_grid_desc_k0_l_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - if(!(M == c_grid_desc_m_n.GetLength(I0) && Gemm1N == c_grid_desc_m_n.GetLength(I1))) - { + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b0_grid_desc_k0_l_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b0_grid_desc_k0_l_k1.GetLength(I2))) return false; - } - if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0 && - Gemm1N % Gemm1NPerBlock == 0)) - { + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) return false; - } - // check gemm0 gridwise gemm pipeline - const auto num_gemm0_k_loop = K / KPerBlock; - if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) - { - return false; - } + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; - // check gemm1 gridwise gemm 
pipeline - if(!(NPerBlock % Gemm1KPerBlock == 0)) - { - return false; - } - - const auto num_gemm1_k_inner_loop = NPerBlock / Gemm1KPerBlock; - if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) { return false; } @@ -264,7 +292,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / KPerBlock; + const index_t num_loop = K / (K0PerBlock * K1); return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -276,12 +304,12 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle const auto N = c_grid_desc_m_n.GetLength(I1); const auto MBlock = M / MPerBlock; - const auto NBlock = N / Gemm1NPerBlock; + const auto NBlock = N / NPerBlock; const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( c_grid_desc_m_n, make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), - make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_unmerge_transform(make_tuple(NBlock, Number{}))), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); @@ -289,284 +317,225 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle } // return block_id to C matrix tile idx (m0, n0) mapping - __host__ __device__ static constexpr auto - MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) { - return BlockToCTileMap_M00_N0_M01Adapt( + return BlockToCTileMap_M00_N0_M01Adapt( c_grid_desc_m_n); } using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; - using DefaultBlock2CTileMap = - remove_cvref_t; + remove_cvref_t; - struct SharedMemTrait - { - // LDS allocation for A and B: be careful of alignment - static constexpr auto a_block_desc_ak0_m_ak1 = - 
GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); - static constexpr auto b_block_desc_bk0_n_bk1 = - GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); - static constexpr auto b1_block_desc_bk0_n_bk1 = - GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); - - static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), B1K1); - - static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - static constexpr auto b_block_space_size_aligned = math::integer_least_multiple( - b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); - static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( - b1_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); - - static constexpr auto a_block_space_offset = 0; - static constexpr auto b_block_space_offset = a_block_space_size_aligned.value; - static constexpr auto b1_block_space_offset = 0; - - // LDS allocation for reduction - static constexpr index_t reduction_space_size_aligned = - math::integer_least_multiple(BlockSize, max_lds_align); - - static constexpr auto reduction_space_offset = 0; - - // LDS allocation for C shuffle in LDS - static constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = - GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); - static constexpr auto c_block_space_size = - c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); - }; - - template - __device__ static void Run(const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - const FloatAB* __restrict__ p_b1_grid, + template + __device__ static void Run(const FloatA* __restrict__ p_a_grid, + const FloatB0* __restrict__ p_b0_grid, + const FloatB1* __restrict__ p_b1_grid, FloatC* __restrict__ p_c_grid, void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const B0GridDesc_K0_L_K1& b0_grid_desc_k0_l_k1, + const B1GridDesc_L0_N_L1& 
b1_grid_desc_l0_n_l1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation& a_element_op, - const BElementwiseOperation& b_element_op, + const B0ElementwiseOperation& b0_element_op, const AccElementwiseOperation& acc_element_op, const B1ElementwiseOperation& b1_element_op, const CElementwiseOperation& c_element_op, - const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, - const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, - const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& - c_grid_desc_mblock_mperblock_nblock_nperblock, - const Block2CTileMap& block_2_ctile_map, - const C0MatrixMask& c0_matrix_mask) + const C0MatrixMask& c0_matrix_mask, + const Block2CTileMap& block_2_ctile_map) { + // clang-format off +/*******************************************************************************/ +// Memory buffer zone. const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); - const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b0_grid_buf = make_dynamic_buffer( + p_b0_grid, b0_grid_desc_k0_l_k1.GetElementSpaceSize()); const auto b1_grid_buf = make_dynamic_buffer( - p_b1_grid, b1_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + p_b1_grid, b1_grid_desc_l0_n_l1.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - // divide block work by [M, N] - const auto block_work_idx = - block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); - +/*******************************************************************************/ +// BlockIdx.x -> [BlockId.m, BlockId.n] + const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); 
if(!block_2_ctile_map.ValidCTileIndex( block_work_idx, make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) - { - return; - } + { return; } - // HACK: this force m/gemm1_n_block_data_idx_on_grid into SGPR - const index_t m_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + // Store BlockId into SGPR + const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); - const index_t gemm1_n_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(block_work_idx[I1] * Gemm1NPerBlock); +/*******************************************************************************/ +// set up Gemm0 +/*******************************************************************************/ - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); - - // - // set up Gemm0 - // +/*******************************************************************************/ +// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + constexpr auto max_lds_align = K1; + constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); // A matrix blockwise copy auto a_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - ABlockTransferThreadClusterLengths_AK0_M_AK1, - ABlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(a_grid_desc_ak0_m_ak1), - decltype(a_block_desc_ak0_m_ak1), - 
ABlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - ABlockTransferSrcVectorDim, - 2, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_AK1, - 1, - 1, - true, // SrcResetCoord - true, // DstResetCoord - NumGemmKPrefetchStage>( - a_grid_desc_ak0_m_ak1, + ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, +/* typename SrcElementwiseOperation, */ AElementwiseOperation, +/* typename DstElementwiseOperation, */ ck::tensor_operation::element_wise::PassThrough, +/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, +/* typename BlockSliceLengths, */ Sequence, +/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, +/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ FloatA, +/* typename DstData, */ FloatA, +/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), +/* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), +/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, +/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, +/* index_t DstScalarPerVector, */ ABlockTransferDstScalarPerVector_K1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( + a_grid_desc_k0_m_k1, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, - a_block_desc_ak0_m_ak1, + a_block_desc_k0perblock_mperblock_k1, make_multi_index(0, 0, 0), - tensor_operation::element_wise::PassThrough{}); + ck::tensor_operation::element_wise::PassThrough{}); // B matrix blockwise copy auto b_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1, - BBlockTransferThreadClusterLengths_BK0_N_BK1, + Sequence, + 
BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b_grid_desc_bk0_n_bk1), - decltype(b_block_desc_bk0_n_bk1), + FloatB, + FloatB, + decltype(b0_grid_desc_k0_l_k1), + decltype(b_block_desc_k0perblock_nperblock_k1), BBlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, + Sequence<0, 1, 2>, BBlockTransferSrcVectorDim, 2, BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_BK1, + BBlockTransferDstScalarPerVector_K1, 1, 1, - true, // SrcResetCoord - true, // DstResetCoord - NumGemmKPrefetchStage>( - b_grid_desc_bk0_n_bk1, - make_multi_index(0, 0, 0), // will loop over GemmN dimension + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b0_grid_desc_k0_l_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, - b_block_desc_bk0_n_bk1, + b_block_desc_k0perblock_nperblock_k1, make_multi_index(0, 0, 0), - tensor_operation::element_wise::PassThrough{}); - - // Fused Gemm+Gemm pipeline - // for n in N0: - // for k in K0: - // acc[m][n] += A[m][k] * B0[k][n] - // acc1[m][o] += acc[m][n] * B1[n][o] - - // sanity check - constexpr index_t KPack = math::max( - math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); - - auto blockwise_gemm = BlockwiseGemmXdlops_v2< - BlockSize, - FloatAB, - FloatGemmAcc, - decltype(a_block_desc_ak0_m_ak1), - decltype(b_block_desc_bk0_n_bk1), - decltype(MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(a_block_desc_ak0_m_ak1)), - decltype(MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(b_block_desc_bk0_n_bk1)), - MPerBlock, - NPerBlock, - KPerBlock, - MPerXdl, - NPerXdl, - MXdlPerWave, - NXdlPerWave, - KPack, - true>{}; // TransposeC - + ck::tensor_operation::element_wise::PassThrough{}); + +/*******************************************************************************/ + // Gemm0 + constexpr auto WmmaK = 16; + constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + + auto blockwise_gemm0 = + BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; 
+ + // Prepare Register for A*B0 matrix auto acc_thread_buf = blockwise_gemm.GetCThreadBuffer(); +/*******************************************************************************/ + constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); // LDS allocation for A and B: be careful of alignment - auto a_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::a_block_space_offset, - a_block_desc_ak0_m_ak1.GetElementSpaceSize()); - - auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::b_block_space_offset, - b_block_desc_bk0_n_bk1.GetElementSpaceSize()); - - constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); - const auto a_block_reset_copy_step = - make_multi_index(-a_grid_desc_ak0_m_ak1.GetLength(I0), 0, 0); - const auto b_block_reset_copy_step = - make_multi_index(-b_grid_desc_bk0_n_bk1.GetLength(I0), NPerBlock, 0); - - // gridwise GEMM pipeline - // Only supports LoopScheduler::Default - const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_Selector(); - - const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( - (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / - KPerBlock); - - // - // set up Gemm1 - // - - // Acc matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type - constexpr auto acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = - blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); - - constexpr auto m0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); - constexpr auto n0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); - constexpr auto m1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); - constexpr auto n1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); - constexpr auto m2 = 
acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); - constexpr auto n2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); - constexpr auto n3 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); - constexpr auto n4 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); - - constexpr auto b1_block_slice_copy_step = make_multi_index(Gemm1KPerBlock / B1K1, 0, 0); - - // acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 to acc_thread_desc_k0_m_k1 - // n0_n1_n2_n3 -> k0 - // m0_m1_m2 -> m - // n4 -> k1 - // NOTE: had to use merge_v3 or will spit out compilation errors - constexpr auto acc_thread_desc_k0_m_k1 = transform_tensor_descriptor( - acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, - make_tuple(make_merge_transform_v3_division_mod(make_tuple(n0, n1, n2, n3)), - make_merge_transform_v3_division_mod(make_tuple(m0, m1, m2)), - make_pass_through_transform(n4)), - make_tuple(Sequence<1, 3, 5, 6>{}, Sequence<0, 2, 4>{}, Sequence<7>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - - // A1 matrix in AccVGPR - // N2 num_groups_per_blk, N3 num_input_blks, N4 group_size - constexpr auto AccN3 = - blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLength(I6); - - constexpr auto A1ThreadSlice_K0_M_K1 = - make_tuple(Number{}, Number{}, Number{}); - - constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0]; - constexpr auto A1ThreadSliceM = A1ThreadSlice_K0_M_K1[I1]; - constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2]; - constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor( - A1ThreadSlice_K0_M_K1, - make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1)); + auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + + // Shift Per SUB_K + constexpr auto a_block_slice_copy_step = 
make_multi_index(K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + const auto a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); + const auto b_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); +/*******************************************************************************/ +// softmax +/*******************************************************************************/ + auto workspace_buf = make_dynamic_buffer(static_cast(p_shared), math::integer_least_multiple(BlockSize, max_lds_align)); + // get acc0 8D thread cluster + constexpr auto thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths() / + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + constexpr auto tm0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I0); + constexpr auto tn0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I1); + constexpr auto tm1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I2); + constexpr auto tn1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I3); + constexpr auto tm2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I4); + constexpr auto tn2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I5); + constexpr auto tn3 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I6); + constexpr auto tn4 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I7); + // get acc0 thread map + constexpr auto m0_n_m1_to_m_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(tm0 * tm1, tm2)), + make_pass_through_transform(I1)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + constexpr auto threadid_to_m0_n_m1_adaptor = make_single_stage_tensor_adaptor( + make_tuple( + make_merge_transform(make_tuple(tm0 * tm1, tn0 * tn1 * tn2 * tn3 * tn4, tm2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + const auto 
threadid_to_m_n_thread_cluster_adaptor = + chain_tensor_adaptors(m0_n_m1_to_m_n_adaptor, threadid_to_m0_n_m1_adaptor); + + // get acc0 2D thread cluster & 2D thread slice + constexpr auto thread_cluster_desc_m_n = make_naive_tensor_descriptor_packed( + make_tuple(tm0 * tm1 * tm2, tn0 * tn1 * tn2 * tn3 * tn4)); + constexpr auto thread_slice_desc_m_n = + make_naive_tensor_descriptor_packed(make_tuple(m0 * m1 * m2, n0 * n1 * n2 * n3 * n4)); + + auto blockwise_softmax = BlockwiseSoftmax{}; + + // Initialize running sum and max of exponentiating row vectors + using SoftmaxBuf = typename decltype(blockwise_softmax)::BufferType; + SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; + running_sum = 0; + running_sum_new = 0; + running_max = NumericLimits::Lowest(); + running_max_new = NumericLimits::Lowest(); +/*******************************************************************************/ +// set up Gemm1 +/*******************************************************************************/ // B1 matrix in LDS memory, dst of blockwise copy - constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_L0PerBlock_NPerBlock_L1(); // A1 matrix blockwise copy auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< - FloatGemmAcc, - FloatAB, + FloatAcc, + FloatA, decltype(acc_thread_desc_k0_m_k1), decltype(a1_thread_desc_k0_m_k1), tensor_operation::element_wise::PassThrough, @@ -574,7 +543,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle Sequence<1, 0, 2>, 2, n4>{tensor_operation::element_wise::PassThrough{}}; - + // B1 matrix blockwise copy auto b1_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1( - a1_thread_desc_k0_m_k1.GetElementSpaceSize()); - - // reuse LDS space for gemm0's b_block_buf - auto b1_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::b1_block_space_offset, - 
b1_block_desc_bk0_n_bk1.GetElementSpaceSize()); - - // selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size - // selected_mfma.k_per_blk <= Gemm1KPack - // - // Following similar rationale behind Gemm0KPack, let Gemm1KPack be the lowest common - // multiples of A1K1 (predetermined by selected_mfma.group_size) and B1K1. But in this case - // Gemm1KPack can't be higher than A1K1 itself because A1 matrix is distributed in VGPRs - // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will - // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7]. - // therefore we may just as well assign Gemm1KPack = group_size - constexpr index_t Gemm1KPack = - MfmaSelector::selected_mfma.group_size; - - auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2< - BlockSize, - FloatAB, - FloatGemmAcc, - decltype(a1_thread_desc_k0_m_k1), - decltype(b1_block_desc_bk0_n_bk1), - decltype(MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(a1_thread_desc_k0_m_k1)), - decltype(MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(b1_block_desc_bk0_n_bk1)), - MPerBlock, - Gemm1NPerBlock, - Gemm1KPerBlock, - MPerXdl, - NPerXdl, - MXdlPerWave, - Gemm1NXdlPerWave, - Gemm1KPack, - true, // TransposeC - Gemm1KPack, // AMmaKStride - Gemm1KPack * XdlopsGemm{}.K0PerXdlops>{ - // BMmaKStride - make_tuple(0, 0, 0, 0)}; // A_origin - - auto acc1_thread_buf = gemm1_blockwise_gemm.GetCThreadBuffer(); - - // - // Blockwise softmax - // - auto workspace_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::reduction_space_offset, - SharedMemTrait::reduction_space_size_aligned); - - // get acc0 8D thread cluster - constexpr auto thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4 = - blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths() / - blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); - constexpr auto tm0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I0); - constexpr auto tn0 = 
thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I1); - constexpr auto tm1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I2); - constexpr auto tn1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I3); - constexpr auto tm2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I4); - constexpr auto tn2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I5); - constexpr auto tn3 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I6); - constexpr auto tn4 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I7); - - // get acc0 thread map - constexpr auto m0_n_m1_to_m_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(tm0 * tm1, tm2)), - make_pass_through_transform(I1)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - constexpr auto threadid_to_m0_n_m1_adaptor = make_single_stage_tensor_adaptor( - make_tuple( - make_merge_transform(make_tuple(tm0 * tm1, tn0 * tn1 * tn2 * tn3 * tn4, tm2))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - const auto threadid_to_m_n_thread_cluster_adaptor = - chain_tensor_adaptors(m0_n_m1_to_m_n_adaptor, threadid_to_m0_n_m1_adaptor); - - // get acc0 2D thread cluster & 2D thread slice - constexpr auto thread_cluster_desc_m_n = make_naive_tensor_descriptor_packed( - make_tuple(tm0 * tm1 * tm2, tn0 * tn1 * tn2 * tn3 * tn4)); - constexpr auto thread_slice_desc_m_n = - make_naive_tensor_descriptor_packed(make_tuple(m0 * m1 * m2, n0 * n1 * n2 * n3 * n4)); - - auto blockwise_softmax = BlockwiseSoftmax{}; - - const index_t num_gemm1_k_block_outer_loop = - b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock; + + auto a1_thread_buf = make_static_buffer(a1_thread_desc_k0_m_k1.GetElementSpaceSize()); + auto b1_block_buf = make_dynamic_buffer(static_cast(p_shared), b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + + auto blockwise_gemm1 = + BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle{make_tuple(0, 0, 0, 0, 0)}; + + auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); 
+ + const index_t num_gemm1_k_block_outer_loop = b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock; constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock; // Initialize C - StaticBuffer - c_thread_buf; + StaticBuffer c_thread_buf; c_thread_buf.Clear(); - // Initialize running sum and max of exponentiating row vectors - using SoftmaxBuf = typename decltype(blockwise_softmax)::BufferType; - SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; - running_sum = 0; - running_sum_new = 0; - running_max = NumericLimits::Lowest(); - running_max_new = NumericLimits::Lowest(); - - // gemm1 K loop +/*******************************************************************************/ + // Flash Attention + // Dao, Tri, et al. "Flashattention: Fast and memory-efficient exact attention with io-awareness." arXiv preprint arXiv:2205.14135 (2022). index_t gemm1_k_block_outer_index = 0; - do - { - auto n_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock); - if(c0_matrix_mask.IsTileSkippable( - m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock)) - { - continue; - } - // gemm0 - gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, - a_block_desc_ak0_m_ak1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - b_grid_desc_bk0_n_bk1, - b_block_desc_bk0_n_bk1, - b_blockwise_copy, - b_grid_buf, - b_block_buf, - b_block_slice_copy_step, - blockwise_gemm, - acc_thread_buf, - num_k_block_main_loop); - + // Outer loop, along GEMM_L + // Inner loop, along GEMM_K + do{ + // gemm0 start, A-B swaped + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + a_block_desc_k0perblock_mperblock_k1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b0_grid_desc_k0_l_k1, + b_block_desc_k0perblock_nperblock_k1, + b_blockwise_copy, + b0_grid_buf, + b_block_buf, + 
b_block_slice_copy_step, + blockwise_gemm, + acc_thread_buf, + K0BlockMainLoop); // do MNK padding or upper triangular masking if constexpr(MaskOutUpperTriangle || PadN) { @@ -797,13 +680,15 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle }); } else - { - static_for<0, acc_thread_buf.Size(), 1>{}( + { static_for<0, acc_thread_buf.Size(), 1>{}( [&](auto i) { acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); }); } - block_sync_lds(); // wait for lds read in gemm0 blockwise gemm + block_sync_lds(); + // gemm0 end + + // Tiled softmax start // softmax SoftmaxBuf& max = blockwise_softmax.max_value_buf; SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; @@ -814,7 +699,13 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle running_max_new = mathext::max(max, running_max); running_sum_new = mathext::exp(running_max - running_max_new) * running_sum + mathext::exp(max - running_max_new) * sum; - + + // Intra-Row data permutation, make swizzled A input for WMMA + __builtin_amdgcn_permlane16(0xeca86420, 0xfdb97531); + // Low/high row move data to low/high half of thread buffer + /* thread copy*/ + // Inter-Row data permutation, fullfill data duplication requirement + __builtin_amdgcn_permlanex16(0x76543210, 0xfedcba98); // gemm1 { // TODO: explore using dynamic buffer for a1 thread buffer @@ -841,6 +732,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle if constexpr(num_gemm1_k_block_inner_loop > 1) { static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) { + // Data cast from FloatAcc to FloatA happen here a1_blockwise_copy.Run(acc_thread_desc_k0_m_k1, make_tuple(Number{}, I0, I0), acc_thread_buf, @@ -879,14 +771,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle } } // end gemm1 - // workaround compiler issue; see ck/ck.hpp - if constexpr(CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE == 1 && - is_same_v && MPerBlock == 256 && NPerBlock == 128 && - Gemm1NPerBlock == 128) - { - __builtin_amdgcn_sched_barrier(0); - } - constexpr auto 
c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); constexpr auto cm0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); @@ -910,8 +794,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle FloatGemmAcc c_new = (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + math::exp(max[iM] - running_max_new[iM]) * acc1) / - running_sum_new[iM]; // Formula by Dao et al., - // https://arxiv.org/pdf/2205.14135v2.pdf section 3.1 + running_sum_new[iM]; c_thread_buf(I) = c_new; // O_new }); @@ -927,120 +810,102 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle running_sum = running_sum_new; block_sync_lds(); // wait for gemm1 LDS read - } while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); // end j loop - - // shuffle C and write out + }while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); +/*******************************************************************************/ + // write out to C, implement shuffle { - static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && - Gemm1NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, - "wrong!"); - - constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); - constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - // TODO: hacky, fix it! - constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = - gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + // This API Provide All dimension (size) you need + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = + blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - // TODO: hacky, fix it! 
- // c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp is only used to get lengths - constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp = - gemm1_blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); + constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); + constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I4); + constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I5); + constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I6); - constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I0); - constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I1); - constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I2); - constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I3); - constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I4); - constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I5); - constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6); - constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7); - - constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = - GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + // LDS descriptor, shuffle and write out in MRepeat x NRepeat times + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); auto c_shuffle_block_buf = make_dynamic_buffer( static_cast(p_shared), - c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + 
c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); - constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4 = transform_tensor_descriptor( - c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, make_tuple( make_freeze_transform(I0), make_unmerge_transform(make_tuple( - Number{}, // M0 (MXdlPerWave) per shuffle - M1, // M1 = MWave - M2)), // M2 = MPerXdl + Number{}, // MRepeat per shuffle repeat + MWave, // MWave + MSubGroup, // MSubGroup * MAccVgprs = MPerWmma + MAccVgprs)), make_freeze_transform(I0), make_unmerge_transform(make_tuple( - Number{}, // N0 (NXdlPerWave) per shuffle - N1, // N1 = NWave - N2, // N2 * N3 * N4 = NPerXdl - N3, - N4))), + Number{}, // NRepeat per shuffle repeat + NWave, // NWave + NThreadPerSubGroup))), // NThreadPerSubGroup = NPerWmma make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple( - Sequence<>{}, Sequence<0, 2, 4>{}, Sequence<>{}, Sequence<1, 3, 5, 6, 7>{})); + make_tuple(Sequence<>{}, Sequence<0, 1, 2, 6>{}, Sequence<>{}, Sequence<3, 4, 5>{})); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = - gemm1_blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; - const auto m_thread_data_on_block_to_m0_m1_m2_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(M0, M1, M2))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto m_thread_data_on_block_idx = - 
m_thread_data_on_block_to_m0_m1_m2_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_block)); - - const auto n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3, N4))), - make_tuple(Sequence<0, 1, 2, 3, 4>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_block_idx = - n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_block)); + const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); // shuffle: threadwise copy C from VGPR to LDS auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1, 2, 3, 4, 5, 6, 7>, - 7, - 1, + MAccVgprs>, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + 1, // vector write pixel InMemoryDataOperationEnum::Set, 1, true>{ - c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4, + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, make_multi_index(0, - 0, m_thread_data_on_block_idx[I1], - n_thread_data_on_block_idx[I1], m_thread_data_on_block_idx[I2], + 0, + n_thread_data_on_block_idx[I1], 
n_thread_data_on_block_idx[I2], - n_thread_data_on_block_idx[I3], - n_thread_data_on_block_idx[I4]), - tensor_operation::element_wise::PassThrough{}}; + m_thread_data_on_block_idx[I3]), + ck::tensor_operation::element_wise::PassThrough{}}; // shuffle: blockwise copy C from LDS to global auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< @@ -1048,47 +913,47 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle CElementwiseOperation, // ElementwiseOperation, CGlobalMemoryDataOperation, // DstInMemOp, Sequence<1, - CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, 1, - CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, FloatCShuffle, // typename SrcData, FloatC, // typename DstData, - decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), Sequence<0, 1, 2, 3>, // typename DimAccessOrder, 3, // index_t VectorDim, CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, true, // bool ThreadTransferSrcResetCoordinateAfterRun, false> // bool ThreadTransferDstResetCoordinateAfterRun> - {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, make_multi_index(0, 0, 0, 0), c_grid_desc_mblock_mperblock_nblock_nperblock, make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), c_element_op}; + // space filling curve for local reg & global memory // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = - SpaceFillingCurve, - Sequence<0, 1, 2, 3, 4, 5, 6, 7>, - Sequence, + Sequence<0, 1, 2, 3, 4, 5, 6>, + Sequence>{}; + 
MAccVgprs>>{}; // space filling curve for shuffled blockwise C in global mem constexpr auto sfc_c_global = - SpaceFillingCurve, + SpaceFillingCurve, Sequence<0, 2, 1, 3>, Sequence<1, - CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, 1, - CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{}; constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); @@ -1099,10 +964,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle block_sync_lds(); // each thread write its data from VGPR to LDS - c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, sfc_c_vgpr.GetIndexTupleOfNumber(access_id), c_thread_buf, - c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4, + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, c_shuffle_block_buf); // make sure it's safe to read from LDS @@ -1110,7 +975,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle // each block copy its data from LDS to global c_shuffle_block_copy_lds_to_global.Run( - c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, c_shuffle_block_buf, c_grid_desc_mblock_mperblock_nblock_nperblock, c_grid_buf); @@ -1118,13 +983,13 @@ struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle if constexpr(access_id < num_access - 1) { constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); - // move on C c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); } }); } + // clang-format on } }; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index d70c5180da3..fda0464caa5 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -414,7 +414,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO { // * Inline assembly need to elimate the duplicated data load, compiler won't help you // delete them. - amd_assembly_wmma_f32_16x16x16_f16_w32( - reg_a, reg_b, reg_c.template AsType()(Number<0>{})); - // reg_c.template AsType()(Number<0>{}) = - // __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( reg_a, reg_b, reg_c.template - // AsType()[Number<0>{}]); + // amd_assembly_wmma_f32_16x16x16_f16_w32( + // reg_a, reg_b, reg_c.template AsType()(Number<0>{})); + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( reg_a, reg_b, reg_c.template + AsType()[Number<0>{}]); } }; From a6b2f1c1b96411cd45f453acd243adcb50c30f06 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 9 Feb 2023 03:24:27 +0000 Subject: [PATCH 032/118] Add Inter-Row thread transfer --- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 14 +-- .../threadwise_tensor_slice_transfer.hpp | 116 ++++++++++++++++++ 2 files changed, 122 insertions(+), 8 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 76fca261d69..7a78acf4b91 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -533,7 +533,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_L0PerBlock_NPerBlock_L1(); // A1 matrix blockwise copy - auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< + auto a1_blockwise_copy = 
ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< FloatAcc, FloatA, decltype(acc_thread_desc_k0_m_k1), @@ -542,7 +542,11 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle Sequence, Sequence<1, 0, 2>, 2, - n4>{tensor_operation::element_wise::PassThrough{}}; + n4, + // dst Rowlane + // 0x76543210 0xfedcba98 + // src Rowlane + 0x76543210, 0xfedcba98>{tensor_operation::element_wise::PassThrough{}}; // B1 matrix blockwise copy auto b1_blockwise_copy = @@ -700,12 +704,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle running_sum_new = mathext::exp(running_max - running_max_new) * running_sum + mathext::exp(max - running_max_new) * sum; - // Intra-Row data permutation, make swizzled A input for WMMA - __builtin_amdgcn_permlane16(0xeca86420, 0xfdb97531); - // Low/high row move data to low/high half of thread buffer - /* thread copy*/ - // Inter-Row data permutation, fullfill data duplication requirement - __builtin_amdgcn_permlanex16(0x76543210, 0xfedcba98); // gemm1 { // TODO: explore using dynamic buffer for a1 thread buffer diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index b0f453b025f..28aab97f023 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1298,4 +1298,120 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic ElementwiseOperation element_op_; }; +// Specilized for WMMA +// A single Wave32 is composed by double row +// Data exchange allowed between these two rows +// This RowLane Dst buf will be filled from two Src buf +// SrcA: From specific thread buffer hold by This RowLane on This Row +// SrcB: From specific thread buffer hold by This RowLane on The other Row +template ::type = false> +struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow +{ + static constexpr index_t nDim = 
SliceLengths::Size(); + + using Index = MultiIndex; + + __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow( + const ElementwiseOperation& element_op) + : element_op_{element_op} + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! Desc need to known at compile-time"); + + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); + } + + template + __device__ void Run(const SrcDesc&, + const SrcSliceOriginIdx&, + const SrcBuffer& src_buf, + const DstDesc&, + const DstSliceOriginIdx&, + DstBuffer& dst_buf) + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! Desc need to known at compile-time"); + + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! SliceOrigin need to known at compile-time"); + + static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(), + "wrong! Buffer need to be StaticBuffer"); + + // SrcDesc and src_slice_origin_idx are known at compile-time + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; + constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); + constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{}); + + // scalar per access on each dim + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, + "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + static_for<0, num_access, 1>{}([&](auto idx_1d) { + constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); + + // copy data from 
src_buf into dst_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t src_offset = src_desc.CalculateOffset( + src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); + + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); + + SrcData v; + + // apply element-wise operation + element_op_(v, src_buf[Number{}]); + + if(get_thread_local_1d_id() % 32 > 16){ + // apply type convert + dst_buf(Number{}) = type_convert(v); + dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), + type_convert(v), + LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); + } + else{ + // apply type convert + dst_buf(Number{}) = type_convert(v); + dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), + type_convert(v), + LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); + } + }); + }); + } + + ElementwiseOperation element_op_; +}; + } // namespace ck From 5df713ef0f41754b578b76f10fa16166cd5bec4c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Sat, 11 Feb 2023 10:14:06 +0000 Subject: [PATCH 033/118] save progress --- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 60 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 77 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 808 ++++++++---------- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 689 +++++++++------ .../threadwise_tensor_slice_transfer.hpp | 4 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 50 +- 7 files changed, 923 insertions(+), 767 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index f091f456eff..7c771f23c10 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ 
b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -2,7 +2,7 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. /* -Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o +Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g_k_l) * B1_g_l_n |-----------------| Gemm0 |-------------------------------------| @@ -39,7 +39,8 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ADataType = F16; using B0DataType = F16; using B1DataType = F16; -using AccDataType = F32; +using Acc0DataType = F32; +using Acc1DataType = F32; using CShuffleDataType = F32; using CDataType = F16; using Acc0BiasDataType = ck::Tuple<>; @@ -67,7 +68,7 @@ static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecial static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; using DeviceGemmInstance = - ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimN, @@ -76,11 +77,12 @@ using DeviceGemmInstance = ADataType, B0DataType, B1DataType, - CDataType, Acc0BiasDataType, + Acc0DataType, Acc1BiasDataType, - AccDataType, + Acc1DataType, CShuffleDataType, + CDataType, AElementOp, B0ElementOp, Acc0ElementOp, @@ -91,21 +93,21 @@ using DeviceGemmInstance = TensorSpecB0, TensorSpecB1, TensorSpecC, - 1, 256, 128, // MPerBlock - 128, // NPerBlock - 32, // KPerBlock - 64, // Gemm1NPerBlock - 32, // Gemm1KPerBlock - 8, // AK1 - 8, // BK1 - 2, // B1K1 - 32, // MPerXDL - 32, // NPerXDL - 1, // MXdlPerWave - 4, // NXdlPerWave - 2, // Gemm1NXdlPerWave + 128, // LPerBlock + 4, // K0PerBlock + 8, // K1 + 64, // NPerBlock + 4, // L0PerBlock + 8, // L1 + 16, // MPerWMMA + 16, // LPerWMMA + 16, // NPerWMMA + //Per repeat = wave_m = wave_num, wave_n = 1 + 1, // 
MRepeat + 8, // LRepeat + 4, // NRepeat S<4, 64, 1>, // ABlockTransfer S<1, 0, 2>, S<1, 0, 2>, @@ -113,44 +115,44 @@ using DeviceGemmInstance = 8, 8, true, - S<4, 64, 1>, // BBlockTransfer + S<4, 64, 1>, // B0BlockTransfer S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<16, 16, 1>, // B1BlockTransfer - S<0, 2, 1>, - S<0, 2, 1>, + S<4, 64, 1>, // B1BlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, 1, - 4, - 2, + 8, + 8, false, 1, // CShuffleMXdlPerWavePerShuffle 2, // CShuffleNXdlPerWavePerShuffle S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock - 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + 4, // CShuffleBlockTransferScalarPerVector_NPerBlock MaskingSpec>; // MaskingSpecialization // Ref Gemm0: fp16 in, fp32 out using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; // Ref Softmax: fp32 in, fp16 out using ReferenceSoftmaxInstance = - ck::tensor_operation::host::ReferenceSoftmax; + ck::tensor_operation::host::ReferenceSoftmax; // Ref Gemm1: fp16 in, fp16 out using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 0b876af952f..8aba9ccdcfa 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -198,7 +198,7 @@ int run(int argc, char* argv[]) Tensor a_g_m_k({BatchCount, M, K}); Tensor b0_g_k_n({BatchCount, K, N}); Tensor b1_g_n_o({BatchCount, N, O}); - Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 1cd5efd9bae..3cf62a0db66 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -129,11 +129,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle return make_tuple(c_thread_m, c_thread_n); } - using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle( - Tuple4 a_origin = CalculateAThreadOriginDataIndex(), - Tuple4 b_origin = CalculateBThreadOriginDataIndex()) - : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + // using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); + // __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle( + // Tuple4 a_origin = CalculateAThreadOriginDataIndex(), + // Tuple4 b_origin = CalculateBThreadOriginDataIndex()) + // : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle() { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && BK0NK1BlockDesc::IsKnownAtCompileTime(), @@ -303,8 +304,10 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle B_K1, B_K1>; - AThreadCopy a_thread_copy_; - BThreadCopy b_thread_copy_; + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; + // AThreadCopy a_thread_copy_; + // BThreadCopy b_thread_copy_; }; // block wise level pipe designed for inline asm @@ -425,6 +428,25 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO return make_tuple(c_thread_m, c_thread_n); } + template + __device__ static auto CalculateCThreadOriginDataIndex7D(Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto 
blk_idx = wmma_gemm.GetBeginOfThreadBlk3D(); + + return make_tuple(Number{}, + blk_idx[I0], + waveId_m, + Number{}, + waveId_n, + blk_idx[I1], + blk_idx[I2]); + } + __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO() { static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && @@ -438,6 +460,30 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } + + // transposed WMMA output C' = B' * A' + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + // constexpr auto NSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + // constexpr auto MThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + I1, + Number{}, + I1, + I1, + NAccVgprs)); + } + // Thread level, register decriptor. 
Vector-write __host__ __device__ static constexpr auto GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() @@ -483,6 +529,23 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); } + // transposed WMMA output C' = B' * A' + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() + { + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MThreadPerSubGroup_NBlockxRepeat_NWave_NSubGroup_NAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + // Provide dimension size __host__ __device__ static constexpr auto GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 6d09dd4bed2..84ef50b6895 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -22,186 +22,97 @@ namespace ck { namespace tensor_operation { namespace device { -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1( - const FloatAB* __restrict__ p_a_grid, - const FloatAB* __restrict__ p_b_grid, - const FloatAB* __restrict__ p_b1_grid, - FloatC* __restrict__ p_c_grid, - const AElementwiseOperation a_element_op, - const BElementwiseOperation 
b_element_op, - const AccElementwiseOperation acc_element_op, - const B1ElementwiseOperation b1_element_op, - const CElementwiseOperation c_element_op, - const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, - const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, - const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock - c_grid_desc_mblock_mperblock_nblock_nperblock, - const Block2CTileMap block_2_ctile_map, - const index_t batch_count, - const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch, - const C0MatrixMask c0_matrix_mask) -{ -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) - __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - const index_t num_blocks_per_batch = - __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); - const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - - const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); - const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetBBasePtr(g_idx))); - const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); - const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); - - GridwiseGemm::template Run(p_a_grid + a_batch_offset, - p_b_grid + b_batch_offset, - p_b1_grid + b1_batch_offset, - p_c_grid + c_batch_offset, - p_shared, - a_element_op, - b_element_op, - acc_element_op, - b1_element_op, - c_element_op, - a_grid_desc_ak0_m_ak1, - b_grid_desc_bk0_n_bk1, - b1_grid_desc_bk0_n_bk1, - c_grid_desc_mblock_mperblock_nblock_nperblock, - block_2_ctile_map, - c0_matrix_mask); -#else - ignore = p_a_grid; - ignore = p_b_grid; - ignore = p_b1_grid; - ignore = p_c_grid; - ignore = 
a_element_op; - ignore = b_element_op; - ignore = acc_element_op; - ignore = b1_element_op; - ignore = c_element_op; - ignore = a_grid_desc_ak0_m_ak1; - ignore = b_grid_desc_bk0_n_bk1; - ignore = b1_grid_desc_bk0_n_bk1; - ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; - ignore = block_2_ctile_map; - ignore = batch_count; - ignore = compute_base_ptr_of_batch; - ignore = c0_matrix_mask; -#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) -} - -// Computes C = A * B0 * B1 +// Computes C = A * B0 * B1 +// MN = MK * KL * LN // ^^^^^^ (Acc0) // ^^^^^^^^^^^ (Acc1) template -struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle + ck::index_t NumPrefetch = 1, + ck::LoopScheduler LoopSched = make_default_loop_scheduler(), + ck::PipelineVersion PipelineVer = ck::PipelineVersion::v1> +struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle : public DeviceBatchedGemmSoftmaxGemmPermute { - static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0, + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimL > 0 && NumDimK > 0 && NumDimN > 0, "Number of dimension must be greater than 0"); static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); @@ -210,64 +121,69 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle // TODO ANT: implement bias combination static_assert(NumAcc0Bias == 0 && NumAcc0Bias == 0, "Bias addition is unimplemented"); -#if 0 - // TODO ANT: use alias static constexpr index_t NumDimGemm0M = NumDimM; - static constexpr index_t NumDimGemm0N = NumDimN; + static constexpr index_t NumDimGemm0N = NumDimL; static constexpr index_t NumDimGemm0K = NumDimK; static constexpr index_t NumDimGemm1M = NumDimM; - static constexpr index_t NumDimGemm1N = NumDimO; - static constexpr index_t NumDimGemm1K = NumDimN; -#endif + static constexpr index_t NumDimGemm1N = NumDimN; + static constexpr index_t NumDimGemm1K = NumDimL; + + static constexpr index_t KPerBlock = K0PerBlock * K1; - using DeviceOp = 
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle; + using DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; - + static constexpr auto I3 = Number<3>{}; + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< - Sequence, - Sequence, + Sequence, + Sequence, GemmSpec, ASpec, - BSpec, + B0Spec, B1Spec, CSpec>; + // K1 = Max Vector Access Pixels + static constexpr auto K1Number = Number{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, K0PerBlock* K1}; static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + const std::vector& a_gs_ms_ks_strides_vec) { return Transform::MakeAGridDescriptor_AK0_M_AK1( Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), - Number{}); + Number{}); } - static auto MakeBGridDescriptor_BK0_N_BK1(const std::vector& b_gs_ns_ks_lengths_vec, - const std::vector& b_gs_ns_ks_strides_vec) + static auto MakeB0GridDescriptor_BK0_L_BK1(const std::vector& b0_gs_ls_ks_lengths_vec, + const std::vector& b0_gs_ls_ks_strides_vec) { return Transform::MakeB0GridDescriptor_BK0_N_BK1( - Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), - Number{}); + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, b0_gs_ls_ks_strides_vec), + Number{}); } static auto - MakeB1GridDescriptor_BK0_N_BK1(const std::vector& b1_gs_gemm1ns_gemm1ks_lengths_vec, - const std::vector& b1_gs_gemm1ns_gemm1ks_strides_vec) + MakeB1GridDescriptor_BL0_N_BL1(const std::vector& b1_gs_ns_ls_lengths_vec, + const std::vector& b1_gs_ns_ls_strides_vec) { return Transform::MakeB1GridDescriptor_BK0_N_BK1( - Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, - b1_gs_gemm1ns_gemm1ks_strides_vec), - Number{}); + 
Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}); } using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1({}, {})); - using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1({}, {})); - using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1({}, {})); + using B0GridDesc_BK0_L_BK1 = decltype(MakeB0GridDescriptor_BK0_L_BK1({}, {})); + using B1GridDesc_BL0_N_BL1 = decltype(MakeB1GridDescriptor_BL0_N_BL1({}, {})); using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); - using BGridDesc_G_N_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); - using B1GridDesc_G_N_K = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); constexpr static auto make_MaskOutPredicate() @@ -286,12 +202,12 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle struct ComputeBasePtrOfStridedBatch { ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, - const BGridDesc_G_N_K& b_grid_desc_g_n_k, - const B1GridDesc_G_N_K& b1_grid_desc_g_n_k, + const B0GridDesc_G_L_K& b0_grid_desc_g_l_k, + const B1GridDesc_G_N_L& b1_grid_desc_g_n_l, const CGridDesc_G_M_N& c_grid_desc_g_m_n) : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), - b_grid_desc_g_n_k_(b_grid_desc_g_n_k), - b1_grid_desc_g_n_k_(b1_grid_desc_g_n_k), + b0_grid_desc_g_l_k_(b0_grid_desc_g_l_k), + b1_grid_desc_g_n_l_(b1_grid_desc_g_n_l), c_grid_desc_g_m_n_(c_grid_desc_g_m_n) { } @@ -301,14 +217,14 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle return a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); } - __host__ __device__ constexpr long_index_t 
GetBBasePtr(index_t g_idx) const + __host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const { - return b_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + return b0_grid_desc_g_l_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); } __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const { - return b1_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + return b1_grid_desc_g_n_l_.CalculateOffset(make_multi_index(g_idx, 0, 0)); } __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const @@ -318,208 +234,202 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle private: AGridDesc_G_M_K a_grid_desc_g_m_k_; - BGridDesc_G_N_K b_grid_desc_g_n_k_; - B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + B0GridDesc_G_L_K b0_grid_desc_g_l_k_; + B1GridDesc_G_N_L b1_grid_desc_g_n_l_; CGridDesc_G_M_N c_grid_desc_g_m_n_; }; - // GridwiseGemm - using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle< - ADataType, // TODO: distinguish A/B datatype - GemmAccDataType, + // GridwiseOp + using GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle< + // DataType Family + ADataType, + B0DataType, + Acc0DataType, + B1DataType, + Acc1DataType, CShuffleDataType, CDataType, + // ElementwiseOp Family AElementwiseOperation, - BElementwiseOperation, + B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, InMemoryDataOperationEnum::Set, + // InMemory Data Descriptor AGridDesc_AK0_M_AK1, - BGridDesc_BK0_N_BK1, - B1GridDesc_BK0_N_BK1, + B0GridDesc_BK0_L_BK1, + B1GridDesc_BL0_N_BL1, CGridDesc_M_N, - NumGemmKPrefetchStage, - BlockSize, + // Tiling Family MPerBlock, + LPerBlock, + K0PerBlock, // K0 * K1 = Gemm0 GEMM_K Dim + K1, // NPerBlock, - KPerBlock, - Gemm1NPerBlock, - Gemm1KPerBlock, - AK1, - BK1, - B1K1, - MPerXDL, - NPerXDL, - MXdlPerWave, - NXdlPerWave, - Gemm1NXdlPerWave, - ABlockTransferThreadClusterLengths_AK0_M_AK1, + L0PerBlock, + L1, + MPerWMMA, + 
LPerWMMA, + NPerWMMA, + MRepeat, + LRepeat, + NRepeat, + // ThreadCluster Family + BlockSize, + ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_AK1, + ABlockTransferDstScalarPerVector_K1, true, - ABlockLdsExtraM, - BBlockTransferThreadClusterLengths_BK0_N_BK1, - BBlockTransferThreadClusterArrangeOrder, - BBlockTransferSrcAccessOrder, - BBlockTransferSrcVectorDim, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_BK1, + ABlockLdsAddExtraM, + B0BlockTransferThreadClusterLengths_K0_L_K1, + B0BlockTransferThreadClusterArrangeOrder, + B0BlockTransferSrcAccessOrder, + B0BlockTransferSrcVectorDim, + B0BlockTransferSrcScalarPerVector, + B0BlockTransferDstScalarPerVector_K1, true, - BBlockLdsExtraN, - B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B0BlockLdsAddExtraL, + B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, B1BlockTransferSrcAccessOrder, B1BlockTransferSrcVectorDim, B1BlockTransferSrcScalarPerVector, - B1BlockTransferDstScalarPerVector_BK1, + B1BlockTransferDstScalarPerVector_L1, false, - B1BlockLdsExtraN, - CShuffleMXdlPerWavePerShuffle, - CShuffleNXdlPerWavePerShuffle, + B1BlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CShuffleBlockTransferScalarPerVector_NPerBlock, - LoopSched, Transform::matrix_padder.PadN, - MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>; + MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle, + NumPrefetch, + LoopSched, + PipelineVer>; // Argument - // FIXME: constness struct Argument : public BaseArgument { Argument( const ADataType* p_a_grid, - const BDataType* p_b_grid, + const B0DataType* p_b0_grid, const B1DataType* p_b1_grid, CDataType* p_c_grid, const std::array p_acc0_biases, const std::array 
p_acc1_biases, const std::vector& a_gs_ms_ks_lengths, const std::vector& a_gs_ms_ks_strides, - const std::vector& b_gs_ns_ks_lengths, - const std::vector& b_gs_ns_ks_strides, - const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, - const std::array, NumAcc1Bias> - acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths - const std::array, NumAcc1Bias> - acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + const std::vector& b0_gs_ls_ks_lengths, + const std::vector& b0_gs_ls_ks_strides, + const std::vector& b1_gs_ns_ls_lengths, + const std::vector& b1_gs_ns_ls_strides, + const std::vector& c_gs_ms_ns_lengths, + const std::vector& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + const index_t M01, + const index_t N01, AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, + B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, CElementwiseOperation c_element_op) : p_a_grid_{p_a_grid}, - p_b_grid_{p_b_grid}, + p_b0_grid_{p_b0_grid}, p_b1_grid_{p_b1_grid}, p_c_grid_{p_c_grid}, a_grid_desc_ak0_m_ak1_{ DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b_grid_desc_bk0_n_bk1_{ - DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, - b1_grid_desc_bk0_n_bk1_{DeviceOp::MakeB1GridDescriptor_BK0_N_BK1( - b1_gs_gemm1ns_gemm1ks_lengths, 
b1_gs_gemm1ns_gemm1ks_strides)}, - c_grid_desc_m_n_{Transform::MakeCGridDescriptor_M_N(c_gs_ms_gemm1ns_lengths, - c_gs_ms_gemm1ns_strides)}, + b0_grid_desc_bk0_l_bk1_{ + DeviceOp::MakeB0GridDescriptor_BK0_L_BK1(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc_bl0_n_bl1_{ + DeviceOp::MakeB1GridDescriptor_BL0_N_BL1(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + c_grid_desc_m_n_{ + Transform::MakeCGridDescriptor_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, a_grid_desc_g_m_k_{ Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b_grid_desc_g_n_k_{ - Transform::MakeB0GridDescriptor_G_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, - b1_grid_desc_g_n_k_{Transform::MakeB1GridDescriptor_G_N_K( - b1_gs_gemm1ns_gemm1ks_lengths, b1_gs_gemm1ns_gemm1ks_strides)}, - c_grid_desc_g_m_n_{Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_gemm1ns_lengths, - c_gs_ms_gemm1ns_strides)}, + b0_grid_desc_g_l_k_{ + Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc_g_n_l_{ + Transform::MakeB1GridDescriptor_G_N_K(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + c_grid_desc_g_m_n_{ + Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, c_grid_desc_mblock_mperblock_nblock_nperblock_{}, - block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + block_2_ctile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)}, a_element_op_{a_element_op}, - b_element_op_{b_element_op}, + b0_element_op_{b0_element_op}, acc_element_op_{acc_element_op}, b1_element_op_{b1_element_op}, c_element_op_{c_element_op}, - c0_matrix_mask_{b_grid_desc_g_n_k_.GetLength(I1)}, - raw_lengths_mz_nz_kz_gemm1nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], - b_gs_ns_ks_lengths[NumDimG + NumDimN - 1], - b_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1], - b1_gs_gemm1ns_gemm1ks_lengths[NumDimG + NumDimO - 1]}, + c0_matrix_mask_{b0_grid_desc_g_l_k_.GetLength(I1)}, + 
raw_lengths_mz_lz_kz_nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], + b0_gs_ls_ks_lengths[NumDimG + NumDimL - 1], + b0_gs_ls_ks_lengths[NumDimG + NumDimL + NumDimK - 1], + b1_gs_ns_ls_lengths[NumDimG + NumDimN - 1]}, a_mz_kz_strides_{a_gs_ms_ks_strides[NumDimG + NumDimM - 1], a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, - b_nz_kz_strides_{b_gs_ns_ks_strides[NumDimG + NumDimN - 1], - b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]}, - b1_nz_kz_strides_{b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO - 1], - b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO + NumDimN - 1]}, - c_mz_gemm1nz_strides_{c_gs_ms_gemm1ns_strides[NumDimG + NumDimM - 1], - c_gs_ms_gemm1ns_strides[NumDimG + NumDimM + NumDimO - 1]}, + b0_lz_kz_strides_{b0_gs_ls_ks_strides[NumDimG + NumDimL - 1], + b0_gs_ls_ks_strides[NumDimG + NumDimL + NumDimK - 1]}, + b1_nz_lz_strides_{b1_gs_ns_ls_strides[NumDimG + NumDimN - 1], + b1_gs_ns_ls_strides[NumDimG + NumDimN + NumDimL - 1]}, + c_mz_nz_strides_{c_gs_ms_ns_strides[NumDimG + NumDimM - 1], + c_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]}, batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}, - compute_base_ptr_of_batch_{ - a_grid_desc_g_m_k_, b_grid_desc_g_n_k_, b1_grid_desc_g_n_k_, c_grid_desc_g_m_n_} + compute_ptr_offset_of_batch_{ + a_grid_desc_g_m_k_, b0_grid_desc_g_l_k_, b1_grid_desc_g_n_l_, c_grid_desc_g_m_n_} { // TODO ANT: implement bias addition ignore = p_acc0_biases; ignore = p_acc1_biases; - ignore = acc0_biases_gs_ms_ns_lengths; - ignore = acc0_biases_gs_ms_ns_strides; - ignore = acc1_biases_gs_ms_gemm1ns_lengths; - ignore = acc1_biases_gs_ms_gemm1ns_strides; - - if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, - b_grid_desc_bk0_n_bk1_, - b1_grid_desc_bk0_n_bk1_, - c_grid_desc_m_n_, - block_2_ctile_map_)) + ignore = acc0_biases_gs_ms_ls_lengths; + ignore = acc0_biases_gs_ms_ls_strides; + ignore = acc1_biases_gs_ms_ns_lengths; + ignore = acc1_biases_gs_ms_ns_strides; + + if(GridwiseOp::CheckValidity(a_grid_desc_ak0_m_ak1_, + 
b0_grid_desc_bk0_l_bk1_, + b1_grid_desc_bl0_n_bl1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) { c_grid_desc_mblock_mperblock_nblock_nperblock_ = - GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( c_grid_desc_m_n_); } } - void Print() const - { - std::cout << "a_grid_desc_g_m_k_: " << a_grid_desc_g_m_k_.GetLength(I0) << ", " - << a_grid_desc_g_m_k_.GetLength(I1) << ", " - << a_grid_desc_g_m_k_.GetLength(I2) << '\n'; - std::cout << "b_grid_desc_g_n_k_: " << b_grid_desc_g_n_k_.GetLength(I0) << ", " - << b_grid_desc_g_n_k_.GetLength(I1) << ", " - << b_grid_desc_g_n_k_.GetLength(I2) << '\n'; - std::cout << "b1_grid_desc_g_n_k_: " << b1_grid_desc_g_n_k_.GetLength(I0) << ", " - << b1_grid_desc_g_n_k_.GetLength(I1) << ", " - << b1_grid_desc_g_n_k_.GetLength(I2) << '\n'; - std::cout << "c_grid_desc_g_m_n_: " << c_grid_desc_g_m_n_.GetLength(I0) << ", " - << c_grid_desc_g_m_n_.GetLength(I1) << ", " - << c_grid_desc_g_m_n_.GetLength(I2) << '\n'; - } - - // pointers + // Pointers const ADataType* p_a_grid_; - const BDataType* p_b_grid_; + const B0DataType* p_b0_grid_; const B1DataType* p_b1_grid_; CDataType* p_c_grid_; - // tensor descriptor + // Tensor Descriptors AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; - BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; - B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1_; + B1GridDesc_BL0_N_BL1 b1_grid_desc_bl0_n_bl1_; CGridDesc_M_N c_grid_desc_m_n_; + AGridDesc_G_M_K a_grid_desc_g_m_k_; - BGridDesc_G_N_K b_grid_desc_g_n_k_; - B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + B0GridDesc_G_L_K b0_grid_desc_g_l_k_; + B1GridDesc_G_N_L b1_grid_desc_g_n_l_; CGridDesc_G_M_N c_grid_desc_g_m_n_; - typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + + typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_; - // block-to-c-tile map - typename 
GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + // Block to Tile mapping + typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map_; - // element-wise op + // ElementwiseOp AElementwiseOperation a_element_op_; - BElementwiseOperation b_element_op_; + B0ElementwiseOperation b0_element_op_; AccElementwiseOperation acc_element_op_; B1ElementwiseOperation b1_element_op_; CElementwiseOperation c_element_op_; @@ -527,15 +437,17 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle // check C0 masking and padding C0MatrixMask c0_matrix_mask_; - // For robust IsSupportedArgument() check - std::vector raw_lengths_mz_nz_kz_gemm1nz_; + // Strides for the last M/N/K dimensions of A/B0/B1/C + // for sanity check of vector load/store + std::vector raw_lengths_mz_lz_kz_nz_; std::vector a_mz_kz_strides_; - std::vector b_nz_kz_strides_; - std::vector b1_nz_kz_strides_; - std::vector c_mz_gemm1nz_strides_; + std::vector b0_lz_kz_strides_; + std::vector b1_nz_lz_strides_; + std::vector c_mz_nz_strides_; index_t batch_count_; - ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + // Batch Offset + ComputeBasePtrOfStridedBatch compute_ptr_offset_of_batch_; }; // Invoker @@ -545,38 +457,32 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - if(!DeviceOp::IsSupportedArgument(arg)) - { - throw std::runtime_error("wrong! 
unsupported argument"); - } - - const index_t grid_size = - arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; + const index_t grid_size = arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; - // Gemm0_K - const auto K = - arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + const auto K = arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); - float ave_time = 0; + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; - auto launch_kernel = [&](auto has_main_k_block_loop_) { - const auto kernel = kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1< - GridwiseGemm, - ADataType, // TODO: distiguish A/B datatype + const auto kernel = kernel_batched_gemm_softmax_gemm_wmma_cshuffle< + GridwiseOp, + ADataType, + B0DataType, + B1DataType, CDataType, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::B0GridDesc_BK0_L_BK1, + DeviceOp::B1GridDesc_BL0_N_BL1, + typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, AElementwiseOperation, - BElementwiseOperation, + B0ElementwiseOperation, AccElementwiseOperation, B1ElementwiseOperation, CElementwiseOperation, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::BGridDesc_BK0_N_BK1, - DeviceOp::B1GridDesc_BK0_N_BK1, - typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, - typename GridwiseGemm::DefaultBlock2CTileMap, ComputeBasePtrOfStridedBatch, C0MatrixMask, - has_main_k_block_loop_>; + typename GridwiseOp::DefaultBlock2CTileMap, + has_main_loop>; return launch_and_time_kernel(stream_config, kernel, @@ -584,36 +490,32 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle dim3(BlockSize), 0, arg.p_a_grid_, - arg.p_b_grid_, + arg.p_b0_grid_, arg.p_b1_grid_, arg.p_c_grid_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b0_grid_desc_bk0_l_bk1_, + arg.b1_grid_desc_bl0_n_bl1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, 
arg.a_element_op_, - arg.b_element_op_, + arg.b0_element_op_, arg.acc_element_op_, arg.b1_element_op_, arg.c_element_op_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.b1_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.block_2_ctile_map_, arg.batch_count_, - arg.compute_base_ptr_of_batch_, - arg.c0_matrix_mask_); + arg.compute_ptr_offset_of_batch_, + arg.c0_matrix_mask_, + arg.block_2_ctile_map_); }; - // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need - // to concern Gemm0's loop - if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) { - ave_time = launch_kernel(integral_constant{}); + return launch_kernel(integral_constant{}); } else { - ave_time = launch_kernel(integral_constant{}); + return launch_kernel(integral_constant{}); } - - return ave_time; } // polymorphic @@ -632,25 +534,40 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle static bool IsSupportedArgument(const Argument& arg) { -#if DEBUG_LOG - arg.Print(); -#endif + if(ck::get_device_name() == "gfx1100") + { + if constexpr(!(is_same_v || is_same_v)) + { + return false; + } - if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + if constexpr(!(is_same_v || is_same_v)) + { + return false; + } + } + else { return false; } - // TODO ANT: Check if tensor specialization & strides mismatch + if(!GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b0_grid_desc_bk0_l_bk1_, + arg.b1_grid_desc_bl0_n_bl1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + return false; + } // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); - const index_t c_gemm1n = arg.c_grid_desc_m_n_.GetLength(I1); - const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); - const index_t b1_gemm1n = 
arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1); + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded + const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); + const index_t c_n = arg.c_grid_desc_m_n_.GetLength(I1); + const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); + const index_t b1_n = arg.b1_grid_desc_bl0_n_bl1_.GetLength(I1); - if(!(c_g == arg.batch_count_ && c_m == a_m && c_gemm1n == b1_gemm1n)) + if(!(c_g == arg.batch_count_ && c_m == a_m && c_n == b1_n)) { return false; } @@ -658,19 +575,19 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle // Note: we need raw lengths since threadwise copy can not handle vector load when part of // vector is out of bounds // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O - const auto MzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[0]; - const auto NzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[1]; - const auto KzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[2]; - const auto Gemm1NzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[3]; + const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; + const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1]; + const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; + const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; // Check scalar per vector requirement const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; - const auto b_extent_lowest = BBlockTransferSrcVectorDim == 2 ? KzRaw : NzRaw; - const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? NzRaw : Gemm1NzRaw; - const auto c_extent_lowest = Gemm1NzRaw; + const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? 
LzRaw : NzRaw; + const auto c_extent_lowest = NzRaw; if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && - b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 && + b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) { @@ -680,24 +597,20 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle // Check vector load/store requirement const auto a_stride_lowest = ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; - const auto b_stride_lowest = - BBlockTransferSrcVectorDim == 2 ? arg.b_nz_kz_strides_[1] : arg.b_nz_kz_strides_[0]; + const auto b0_stride_lowest = + B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; const auto b1_stride_lowest = - B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_kz_strides_[1] : arg.b1_nz_kz_strides_[0]; - const auto c_stride_lowest = - arg.c_mz_gemm1nz_strides_[1]; // cshuffle assumes lowest dim in Gemm1Ns to be contiguous + B1BlockTransferSrcVectorDim == 2 ? 
arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; + const auto c_stride_lowest = + arg.c_mz_nz_strides_[1]; - if(!(a_stride_lowest == 1 || b_stride_lowest == 1 || b1_stride_lowest == 1 || + if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || c_stride_lowest == 1)) { return false; } - return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, - arg.b_grid_desc_bk0_n_bk1_, - arg.b1_grid_desc_bk0_n_bk1_, - arg.c_grid_desc_m_n_, - arg.block_2_ctile_map_); + return true; } // polymorphic @@ -706,114 +619,115 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle return IsSupportedArgument(*dynamic_cast(p_arg)); } - static auto MakeArgument( + static auto + MakeArgument( const ADataType* p_a, - const BDataType* p_b, + const B0DataType* p_b0, const B1DataType* p_b1, CDataType* p_c, const std::array p_acc0_biases, const std::array p_acc1_biases, const std::vector& a_gs_ms_ks_lengths, const std::vector& a_gs_ms_ks_strides, - const std::vector& b_gs_ns_ks_lengths, - const std::vector& b_gs_ns_ks_strides, - const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, - const std::array, NumAcc1Bias> - acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths - const std::array, NumAcc1Bias> - acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + const std::vector& b0_gs_ls_ks_lengths, + const std::vector& b0_gs_ls_ks_strides, + const std::vector& b1_gs_ns_ls_lengths, + const std::vector& b1_gs_ns_ls_strides, + const std::vector& c_gs_ms_ns_lengths, + const std::vector& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, 
NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, + B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) + CElementwiseOperation c_element_op) { return Argument{p_a, - p_b, + p_b0, p_b1, p_c, p_acc0_biases, p_acc1_biases, a_gs_ms_ks_lengths, a_gs_ms_ks_strides, - b_gs_ns_ks_lengths, - b_gs_ns_ks_strides, - b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides - acc0_biases_gs_ms_ns_lengths, - acc0_biases_gs_ms_ns_strides, - acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths - acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + b0_gs_ls_ks_lengths, + b0_gs_ls_ks_strides, + b1_gs_ns_ls_lengths, + b1_gs_ns_ls_strides, + c_gs_ms_ns_lengths, + c_gs_ms_ns_strides, + acc0_biases_gs_ms_ls_lengths, + acc0_biases_gs_ms_ls_strides, + acc1_biases_gs_ms_ns_lengths, + acc1_biases_gs_ms_ns_strides, + 1, + 1, a_element_op, - b_element_op, + b0_element_op, acc_element_op, b1_element_op, c_element_op}; } - static auto MakeInvoker() { return Invoker{}; } - // polymorphic - // FIXME: constness - std::unique_ptr MakeArgumentPointer( + std::unique_ptr + MakeArgumentPointer( const void* p_a, - const void* p_b, + const void* p_b0, const void* p_b1, void* p_c, const std::array p_acc0_biases, const std::array p_acc1_biases, const std::vector& a_gs_ms_ks_lengths, const std::vector& a_gs_ms_ks_strides, - const std::vector& b_gs_ns_ks_lengths, - const std::vector& b_gs_ns_ks_strides, - const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // 
b1_gs_os_ns_strides - const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, - const std::array, NumAcc1Bias> - acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths - const std::array, NumAcc1Bias> - acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + const std::vector& b0_gs_ls_ks_lengths, + const std::vector& b0_gs_ls_ks_strides, + const std::vector& b1_gs_ns_ls_lengths, + const std::vector& b1_gs_ns_ls_strides, + const std::vector& c_gs_ms_ns_lengths, + const std::vector& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, AElementwiseOperation a_element_op, - BElementwiseOperation b_element_op, + B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, CElementwiseOperation c_element_op) override { return std::make_unique(static_cast(p_a), - static_cast(p_b), + static_cast(p_b0), static_cast(p_b1), static_cast(p_c), - p_acc0_biases, // cast in struct Argument - p_acc1_biases, // cast in struct Argument + p_acc0_biases, + p_acc1_biases, a_gs_ms_ks_lengths, a_gs_ms_ks_strides, - b_gs_ns_ks_lengths, - b_gs_ns_ks_strides, - b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides - acc0_biases_gs_ms_ns_lengths, - acc0_biases_gs_ms_ns_strides, - acc1_biases_gs_ms_gemm1ns_lengths, - acc1_biases_gs_ms_gemm1ns_strides, + b0_gs_ls_ks_lengths, + b0_gs_ls_ks_strides, + b1_gs_ns_ls_lengths, + b1_gs_ns_ls_strides, + 
c_gs_ms_ns_lengths, + c_gs_ms_ns_strides, + acc0_biases_gs_ms_ls_lengths, + acc0_biases_gs_ms_ls_strides, + acc1_biases_gs_ms_ns_lengths, + acc1_biases_gs_ms_ns_strides, + 1, + 1, a_element_op, - b_element_op, + b0_element_op, acc_element_op, b1_element_op, c_element_op); } + static auto MakeInvoker() { return Invoker{}; } + // polymorphic std::unique_ptr MakeInvokerPointer() override { @@ -825,25 +739,33 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle { auto str = std::stringstream(); + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + // clang-format off - str << "DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle" + str << "DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle" << "<" << BlockSize << ", " << MPerBlock << ", " - << NPerBlock << ", " - << KPerBlock << ", " - << AK1 << ", " - << BK1 << ", " + << LPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << MPerBlock << ", " + << NPerWMMA << ", " << MPerBlock << ", " - << Gemm1NPerBlock << ", " - << Gemm1KPerBlock << ", " - << B1K1 << ", " - << getGemmSpecializationString(GemmSpec) << ", " - << "ASpec" << getTensorSpecializationString(ASpec) << ", " - << "B0Spec" << getTensorSpecializationString(BSpec) << ", " - << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " - << "CSpec" << getTensorSpecializationString(CSpec) << ", " - << getMaskingSpecializationString(MaskingSpec) << ">"; + << NPerBlock << ", " + << L0PerBlock << ", " + << L1 + << ">" + << " NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; // clang-format on return str.str(); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 7a78acf4b91..9300c1df595 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -20,71 +20,106 @@ namespace ck { template __global__ void #if CK_USE_LAUNCH_BOUNDS __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) #endif - kernel_gemm_wmma( + kernel_batched_gemm_softmax_gemm_wmma_cshuffle( const FloatA* __restrict__ p_a_grid, - const FloatB* __restrict__ p_b0_grid, + const FloatB0* __restrict__ p_b0_grid, + const FloatB1* __restrict__ p_b1_grid, FloatC* __restrict__ p_c_grid, - const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, - const BGridDesc_K0_N_K1 b0_grid_desc_k0_l_k1, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1, + const B1GridDesc_BL0_N_BL1 b1_grid_desc_l0_n_l1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock, - // const - // CGridDescriptor_MBlockxRepeat_MWave_MSubGroup_MAccVgprs_NBlockxRepeat_NWave_NThreadPerSubGroup - // c_grid_desc_mblockxrepeat_mwave_msubgroup_maccvgprs_nblockxrepeat_nwave_nthreadpersubgroup, const AElementwiseOperation a_element_op, - const BElementwiseOperation b_element_op, + const B0ElementwiseOperation b0_element_op, + const AccElementwiseOperation acc_element_op, + const B1ElementwiseOperation b1_element_op, const CElementwiseOperation c_element_op, + const index_t batch_count, + const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch, + const C0MatrixMask c0_matrix_mask, const Block2CTileMap block_2_ctile_map) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - GridwiseGemm::template Run(p_a_grid, - p_b0_grid, - p_c_grid, + const index_t num_blocks_per_batch = + 
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b0_grid + b0_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, p_shared, - a_grid_desc_k0_m_k1, - b0_grid_desc_k0_l_k1, + a_grid_desc_ak0_m_ak1, + b0_grid_desc_bk0_l_bk1, + b1_grid_desc_l0_n_l1, c_grid_desc_mblock_mperblock_nblock_nperblock, a_element_op, - b_element_op, + b0_element_op, + acc_element_op, + b1_element_op, c_element_op, + c0_matrix_mask, block_2_ctile_map); #else ignore = p_a_grid; ignore = p_b0_grid; + ignore = p_b1_grid; ignore = p_c_grid; - ignore = a_grid_desc_k0_m_k1; - ignore = b0_grid_desc_k0_l_k1; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b0_grid_desc_bk0_l_bk1; + ignore = b1_grid_desc_l0_n_l1; ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; ignore = a_element_op; - ignore = b_element_op; + ignore = b0_element_op; + ignore = acc_element_op; + ignore = b1_element_op; ignore = c_element_op; + ignore = batch_count; + ignore = compute_base_ptr_of_batch; + ignore = c0_matrix_mask; ignore = block_2_ctile_map; #endif // end of if (defined(__gfx1100__)) } // Gemm0: A [M x K] x B0 [K x L] = Acc [M x L] // Gemm1: Acc [M x L] x B1 [L x N] = C [M x N] -template @@ -155,57 +190,44 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static constexpr auto I6 = Number<6>{}; 
static constexpr auto I7 = Number<7>{}; - // K1 should be Number<...> - static constexpr auto K1 = Number{}; - static constexpr auto N1 = Number{}; + // K1Value should be Number<...> + static constexpr auto AK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto BK1 = Number{}; + static constexpr auto L0 = Number{}; + static constexpr auto L1 = Number{}; + + static constexpr auto Gemm0MWaves = MPerBlock / (MPerWmma * MRepeat); + static constexpr auto Gemm0LWaves = L0PerBlock * L1Value / (LPerWmma * LRepeat); using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = remove_cvref_t())>; - __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() { - constexpr auto max_lds_align = K1; - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { - if constexpr(ABlockLdsExtraM) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); - - return a_block_desc_k0perblock_mperblock_k1; + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); } - __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() + __host__ __device__ static constexpr auto GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1() { - constexpr auto max_lds_align = K1; - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { - if constexpr(BBlockLdsExtraN) - { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); - } - else - { - return make_naive_tensor_descriptor_aligned( - 
make_tuple(Number{}, Number{}, K1), max_lds_align); - } - }(); + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } - return b_block_desc_k0perblock_nperblock_k1; + __host__ __device__ static constexpr auto GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1() + { + // B1 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(L0, Number{}, L1), + make_tuple(Number{} * L1, L1, I1)); } __host__ __device__ static constexpr auto @@ -228,55 +250,68 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_desc_k0perblock_mperblock_k1 = - GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - - constexpr auto b_block_desc_k0perblock_nperblock_k1 = - GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - - constexpr auto max_lds_align = K1; - - constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); + const index_t gemm0_bytes_end = (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + + SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); - constexpr auto b_block_space_size_aligned = math::integer_least_multiple( - b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); + const index_t gemm1_bytes_end = + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * + sizeof(FloatB1); + + const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + + SharedMemTrait::reduction_space_size_aligned) * + sizeof(FloatAcc0); + + const index_t c_block_bytes_end = + SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); - return (a_block_space_size_aligned * sizeof(FloatA) + - b_block_space_size_aligned * sizeof(FloatB)); + return math::max(gemm0_bytes_end, 
gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); } // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, - const BGridDesc_K0_N_K1& b0_grid_desc_k0_l_k1, + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const B0GridDesc_BK0_L_BK1& b0_grid_desc_bk0_l_bk1, + const B1GridDesc_BL0_N_BL1& b1_grid_desc_l0_n_l1, const CGridDesc_M_N& c_grid_desc_m_n, const Block2CTileMap& block_2_ctile_map) { - static_assert(is_known_at_compile_time>::value, - "wrong! K1 need to be known at compile-time"); - static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && - (NPerBlock % (NRepeat * NPerWmma)) == 0, + (LPerBlock % (LPerWmma * LRepeat)) == 0, "Invalid tuning param!"); - const auto M = a_grid_desc_k0_m_k1.GetLength(I1); - const auto N = b0_grid_desc_k0_l_k1.GetLength(I1); - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto L = b0_grid_desc_bk0_l_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + const auto N = b1_grid_desc_l0_n_l1.GetLength(I1); - if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && - K0 == b0_grid_desc_k0_l_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && - K1 == b0_grid_desc_k0_l_k1.GetLength(I2))) + const auto KPerBlock = K0PerBlock * K1Value; + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) + { return false; + } + + if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && + N % NPerBlock == 0)) + { + return false; + } - if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + // check gemm0 gridwise gemm pipeline + const auto num_gemm0_k_loop = K / KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) + { return false; + } - // check gridwise gemm pipeline - 
const auto num_k_loop = K0 / K0PerBlock; + // check gemm1 gridwise gemm pipeline + if(!(LPerBlock % (L0PerBlock * L1Value) == 0)) + { + return false; + } - if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + const auto num_gemm1_k_inner_loop = LPerBlock / (L0PerBlock * L1Value); + if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) { return false; } @@ -292,7 +327,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / (K0PerBlock * K1); + const index_t num_loop = K / (K0PerBlock * K1Value); return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -328,6 +363,42 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>; using DefaultBlock2CTileMap = remove_cvref_t; + + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + static constexpr auto a_block_desc_ak0_m_ak1 = + GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + static constexpr auto b0_block_desc_bk0_l_bk1 = + GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); + static constexpr auto b1_block_desc_bl0_n_bl1 = + GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); + + static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), L1); + + static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b0_block_space_size_aligned = math::integer_least_multiple( + b0_block_desc_bk0_l_bk1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( + b1_block_desc_bl0_n_bl1.GetElementSpaceSize(), max_lds_align); + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b0_block_space_offset = a_block_space_size_aligned.value; + static constexpr auto b1_block_space_offset = 0; + + // LDS allocation for 
reduction + static constexpr index_t reduction_space_size_aligned = + math::integer_least_multiple(BlockSize, max_lds_align); + + static constexpr auto reduction_space_offset = 0; + + // LDS allocation for C shuffle in LDS + static constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); + static constexpr auto c_block_space_size = + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize(); + }; template __device__ static void Run(const FloatA* __restrict__ p_a_grid, @@ -335,9 +406,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle const FloatB1* __restrict__ p_b1_grid, FloatC* __restrict__ p_c_grid, void* __restrict__ p_shared, - const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, - const B0GridDesc_K0_L_K1& b0_grid_desc_k0_l_k1, - const B1GridDesc_L0_N_L1& b1_grid_desc_l0_n_l1, + const AGridDesc_AK0_M_AK1& a_grid_desc_k0_m_k1, + const B0GridDesc_BK0_L_BK1& b0_grid_desc_k0_l_k1, + const B1GridDesc_BL0_N_BL1& b1_grid_desc_l0_n_l1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation& a_element_op, @@ -380,9 +451,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - constexpr auto max_lds_align = K1; - constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + // constexpr auto max_lds_align = K1Value; + constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b0_block_desc_k0perblock_lperblock_k1 = 
GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); // A matrix blockwise copy auto a_blockwise_copy = @@ -390,7 +461,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /* typename SrcElementwiseOperation, */ AElementwiseOperation, /* typename DstElementwiseOperation, */ ck::tensor_operation::element_wise::PassThrough, /* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, -/* typename BlockSliceLengths, */ Sequence, +/* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, /* typename SrcData, */ FloatA, @@ -415,134 +486,177 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle ck::tensor_operation::element_wise::PassThrough{}); // B matrix blockwise copy - auto b_blockwise_copy = + auto b0_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatB, - FloatB, + Sequence, + B0BlockTransferThreadClusterLengths_K0_L_K1, + B0BlockTransferThreadClusterArrangeOrder, + FloatB0, + FloatB0, decltype(b0_grid_desc_k0_l_k1), - decltype(b_block_desc_k0perblock_nperblock_k1), - BBlockTransferSrcAccessOrder, + decltype(b0_block_desc_k0perblock_lperblock_k1), + B0BlockTransferSrcAccessOrder, Sequence<0, 1, 2>, - BBlockTransferSrcVectorDim, + B0BlockTransferSrcVectorDim, 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, + B0BlockTransferSrcScalarPerVector, + B0BlockTransferDstScalarPerVector_K1, 1, 1, - BThreadTransferSrcResetCoordinateAfterRun, + B0ThreadTransferSrcResetCoordinateAfterRun, true>( b0_grid_desc_k0_l_k1, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_element_op, - b_block_desc_k0perblock_nperblock_k1, + make_multi_index(0, 0, 0), + b0_element_op, + b0_block_desc_k0perblock_lperblock_k1, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); 
/*******************************************************************************/ // Gemm0 constexpr auto WmmaK = 16; - constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + constexpr auto KPack = math::integer_least_multiple(K1Value, WmmaK); auto blockwise_gemm0 = BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; // Prepare Register for A*B0 matrix - auto acc_thread_buf = blockwise_gemm.GetCThreadBuffer(); + auto acc0_thread_buf = blockwise_gemm0.GetCThreadBuffer(); + + // Acc matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type + constexpr auto acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = + blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); + + constexpr auto mrepeat = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I0); + constexpr auto mwave = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I1); + constexpr auto mthreadpersubgroup = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I2); + constexpr auto lrepeat = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I3); + constexpr auto lwave = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I4); + constexpr auto lsubgroup = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I5); + constexpr auto laccvgprs = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I6); + + constexpr auto acc0_thread_desc_l0perblock_mperblock_l1 = transform_tensor_descriptor( + acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(lrepeat, lrepeat, lsubgroup)), + 
make_merge_transform_v3_division_mod(make_tuple(mrepeat, mwave, mthreadpersubgroup)), + make_pass_through_transform(laccvgprs)), + make_tuple(Sequence<3, 4, 5>{}, Sequence<0, 1, 2>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); /*******************************************************************************/ - constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); // LDS allocation for A and B: be careful of alignment - auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + auto a_block_buf = make_dynamic_buffer(static_cast(p_shared) + SharedMemTrait::a_block_space_offset, + a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); + auto b0_block_buf = make_dynamic_buffer(static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, + b0_block_desc_k0perblock_lperblock_k1.GetElementSpaceSize()); // Shift Per SUB_K constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - const auto a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); - const auto b_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); + constexpr auto b0_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + const auto a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); + const auto b0_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); + + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); /*******************************************************************************/ 
// softmax /*******************************************************************************/ - auto workspace_buf = make_dynamic_buffer(static_cast(p_shared), math::integer_least_multiple(BlockSize, max_lds_align)); - // get acc0 8D thread cluster - constexpr auto thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4 = - blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths() / - blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); - constexpr auto tm0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I0); - constexpr auto tn0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I1); - constexpr auto tm1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I2); - constexpr auto tn1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I3); - constexpr auto tm2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I4); - constexpr auto tn2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I5); - constexpr auto tn3 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I6); - constexpr auto tn4 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I7); + auto workspace_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::reduction_space_offset, + SharedMemTrait::reduction_space_size_aligned); + // get acc0 7D thread cluster + constexpr auto thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = + blockwise_gemm0.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths() / + blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths(); + constexpr auto t_mrepeat = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I0); + constexpr auto t_mwave = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I1); + constexpr auto t_mthreadpersubgroup = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I2); + constexpr auto t_lrepeat = 
thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I3); + constexpr auto t_lwave = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I4); + constexpr auto t_lsubgroup = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I5); + constexpr auto t_laccvgprs = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I6); // get acc0 thread map - constexpr auto m0_n_m1_to_m_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(tm0 * tm1, tm2)), + constexpr auto m0_l_m1_to_m_l_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(t_mrepeat * t_mwave, t_mthreadpersubgroup)), make_pass_through_transform(I1)), make_tuple(Sequence<0>{}, Sequence<1>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - constexpr auto threadid_to_m0_n_m1_adaptor = make_single_stage_tensor_adaptor( + constexpr auto threadid_to_m0_l_m1_adaptor = make_single_stage_tensor_adaptor( make_tuple( - make_merge_transform(make_tuple(tm0 * tm1, tn0 * tn1 * tn2 * tn3 * tn4, tm2))), + make_merge_transform( + make_tuple(t_mrepeat * t_mwave, t_lrepeat * t_lwave * t_lsubgroup * t_laccvgprs, t_mthreadpersubgroup))), make_tuple(Sequence<0, 1, 2>{}), make_tuple(Sequence<0>{})); - const auto threadid_to_m_n_thread_cluster_adaptor = - chain_tensor_adaptors(m0_n_m1_to_m_n_adaptor, threadid_to_m0_n_m1_adaptor); + const auto threadid_to_l_n_thread_cluster_adaptor = + chain_tensor_adaptors(m0_l_m1_to_m_l_adaptor, threadid_to_m0_l_m1_adaptor); // get acc0 2D thread cluster & 2D thread slice - constexpr auto thread_cluster_desc_m_n = make_naive_tensor_descriptor_packed( - make_tuple(tm0 * tm1 * tm2, tn0 * tn1 * tn2 * tn3 * tn4)); - constexpr auto thread_slice_desc_m_n = - make_naive_tensor_descriptor_packed(make_tuple(m0 * m1 * m2, n0 * n1 * n2 * n3 * n4)); + constexpr auto thread_cluster_desc_m_l = 
make_naive_tensor_descriptor_packed( + make_tuple(t_mrepeat * t_mwave * t_mthreadpersubgroup, t_lrepeat * t_lwave * t_lsubgroup * t_laccvgprs)); + constexpr auto thread_slice_desc_m_l = make_naive_tensor_descriptor_packed( + make_tuple(mrepeat * mwave * mthreadpersubgroup, lrepeat * lwave * lsubgroup * laccvgprs)); + auto blockwise_softmax = BlockwiseSoftmax{}; + FloatAcc0, + decltype(threadid_to_l_n_thread_cluster_adaptor), + decltype(thread_cluster_desc_m_l), + decltype(thread_slice_desc_m_l)>{}; // Initialize running sum and max of exponentiating row vectors using SoftmaxBuf = typename decltype(blockwise_softmax)::BufferType; SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; running_sum = 0; running_sum_new = 0; - running_max = NumericLimits::Lowest(); - running_max_new = NumericLimits::Lowest(); + running_max = NumericLimits::Lowest(); + running_max_new = NumericLimits::Lowest(); /*******************************************************************************/ // set up Gemm1 /*******************************************************************************/ // B1 matrix in LDS memory, dst of blockwise copy - constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_L0PerBlock_NPerBlock_L1(); + constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); + constexpr auto b1_block_slice_copy_step = make_multi_index(L0PerBlock, 0, 0); + + // A1 matrix in VGPR + constexpr auto A1ThreadSlice_L0PerBlock_MPerBlock_L1 = make_tuple( + Number{}, + Number{}, + Number{}); // Data duplicated dimension + + constexpr auto A1ThreadSliceL0PerBlock = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I0]; + constexpr auto A1ThreadSliceMPerBlock = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I1]; + constexpr auto A1ThreadSliceL1 = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I2]; + + // A1 has duplicated data + constexpr auto A1ThreadDuplicatedDim = I2 * A1ThreadSliceL1; + constexpr auto a1_thread_desc_l0perblock_mperblock_l1 
= make_naive_tensor_descriptor( + make_tuple(A1ThreadSliceL0PerBlock, A1ThreadSliceMPerBlock, A1ThreadDuplicatedDim), + make_tuple(A1ThreadSliceMPerBlock * A1ThreadDuplicatedDim, A1ThreadDuplicatedDim, I1)); // A1 matrix blockwise copy auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< - FloatAcc, + FloatAcc0, FloatA, - decltype(acc_thread_desc_k0_m_k1), - decltype(a1_thread_desc_k0_m_k1), + decltype(acc0_thread_desc_l0perblock_mperblock_l1), + decltype(a1_thread_desc_l0perblock_mperblock_l1), tensor_operation::element_wise::PassThrough, - Sequence, - Sequence<1, 0, 2>, + Sequence, + Sequence<0, 1, 2>, 2, - n4, + laccvgprs, // dst Rowlane // 0x76543210 0xfedcba98 // src Rowlane @@ -551,68 +665,77 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // B1 matrix blockwise copy auto b1_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1, - B1BlockTransferThreadClusterLengths_BK0_N_BK1, + Sequence, + B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, - FloatAB, - FloatAB, - decltype(b1_grid_desc_bk0_n_bk1), - decltype(b1_block_desc_bk0_n_bk1), + FloatB1, + FloatB1, + decltype(b1_grid_desc_l0_n_l1), + decltype(b1_block_desc_l0perblock_nperblock_l1), B1BlockTransferSrcAccessOrder, Sequence<1, 0, 2>, B1BlockTransferSrcVectorDim, 2, B1BlockTransferSrcScalarPerVector, - B1BlockTransferDstScalarPerVector_BK1, + B1BlockTransferDstScalarPerVector_L1, 1, 1, B1ThreadTransferSrcResetCoordinateAfterRun, true, // DstResetCoord NumGemmKPrefetchStage>( - b1_grid_desc_bk0_n_bk1, - make_multi_index(0, gemm1_n_block_data_idx_on_grid, 0), + b1_grid_desc_l0_n_l1, + make_multi_index(0, n_block_data_idx_on_grid, 0), b1_element_op, - b1_block_desc_bk0_n_bk1, + b1_block_desc_l0perblock_nperblock_l1, make_multi_index(0, 0, 0), tensor_operation::element_wise::PassThrough{}); - auto a1_thread_buf = make_static_buffer(a1_thread_desc_k0_m_k1.GetElementSpaceSize()); - auto b1_block_buf = make_dynamic_buffer(static_cast(p_shared), 
b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); + auto a1_thread_buf = make_static_buffer( + a1_thread_desc_l0perblock_mperblock_l1.GetElementSpaceSize()); + auto b1_block_buf = make_dynamic_buffer( + static_cast(p_shared)+ SharedMemTrait::b1_block_space_offset, + b1_block_desc_l0perblock_nperblock_l1.GetElementSpaceSize()); auto blockwise_gemm1 = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle{make_tuple(0, 0, 0, 0, 0)}; + KPack>{}; auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); - const index_t num_gemm1_k_block_outer_loop = b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock; - constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock; + const index_t num_gemm1_l_block_outer_loop = b0_grid_desc_k0_l_k1.GetLength(I1) / LPerBlock; + constexpr index_t num_gemm1_l_block_inner_loop = LPerBlock / (L0PerBlock * L1Value); // Initialize C - StaticBuffer c_thread_buf; + StaticBuffer c_thread_buf; c_thread_buf.Clear(); /*******************************************************************************/ // Flash Attention // Dao, Tri, et al. "Flashattention: Fast and memory-efficient exact attention with io-awareness." arXiv preprint arXiv:2205.14135 (2022). 
- index_t gemm1_k_block_outer_index = 0; + index_t gemm1_l_block_outer_index = 0; // Outer loop, along GEMM_L // Inner loop, along GEMM_K do{ + auto l_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(gemm1_l_block_outer_index * LPerBlock); + if(c0_matrix_mask.IsTileSkippable( + m_block_data_idx_on_grid, l_block_data_idx_on_grid, MPerBlock, LPerBlock)) + { + continue; + } // gemm0 start, A-B swaped - const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, a_block_desc_k0perblock_mperblock_k1, a_blockwise_copy, @@ -620,33 +743,32 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle a_block_buf, a_block_slice_copy_step, b0_grid_desc_k0_l_k1, - b_block_desc_k0perblock_nperblock_k1, - b_blockwise_copy, + b0_block_desc_k0perblock_lperblock_k1, + b0_blockwise_copy, b0_grid_buf, - b_block_buf, - b_block_slice_copy_step, - blockwise_gemm, - acc_thread_buf, + b0_block_buf, + b0_block_slice_copy_step, + blockwise_gemm0, + acc0_thread_buf, K0BlockMainLoop); // do MNK padding or upper triangular masking if constexpr(MaskOutUpperTriangle || PadN) { - // 8d thread_desc in thread scope + // 7d thread_desc in thread scope constexpr auto c_thread_lengths = - blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths(); - // 8d block_desc in block scope + // 7d block_desc in block scope constexpr auto c_block_lengths = - blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + blockwise_gemm0.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths(); - constexpr auto M0 = c_block_lengths[I0]; - constexpr auto N0 = c_block_lengths[I1]; - constexpr auto M1 = c_block_lengths[I2]; - constexpr auto N1 = c_block_lengths[I3]; - constexpr auto M2 = c_block_lengths[I4]; - constexpr auto N2 = 
c_block_lengths[I5]; - constexpr auto N3 = c_block_lengths[I6]; - constexpr auto N4 = c_block_lengths[I7]; + constexpr auto MREPEAT = c_block_lengths[I0]; + constexpr auto MWAVE = c_block_lengths[I1]; + constexpr auto MTHREADSubGroup = c_block_lengths[I2]; + constexpr auto LREPEAT = c_block_lengths[I3]; + constexpr auto LWAVE = c_block_lengths[I4]; + constexpr auto LSUBGROUP = c_block_lengths[I5]; + constexpr auto LACCVGPRS = c_block_lengths[I6]; // works like multi-dimension static_for (static_ford), but provides both the linear // index as well as n-d index @@ -656,36 +778,34 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle typename uniform_sequence_gen::type, false>; // SnakeCurved - auto acc0_thread_origin = blockwise_gemm.CalculateCThreadOriginDataIndex8D( - Number<0>{}, Number<0>{}, Number<0>{}, Number<0>{}); + auto acc0_thread_origin = blockwise_gemm0.CalculateCThreadOriginDataIndex7D( + Number<0>{}, Number<0>{}); - constexpr auto block_idx_to_m_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(M0, M1, M2)), - make_unmerge_transform(make_tuple(N0, N1, N2, N3, N4))), + constexpr auto block_idx_to_m_l_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MREPEAT, MWAVE, MTHREADSubGroup)), + make_unmerge_transform(make_tuple(LREPEAT, LWAVE, LSUBGROUP, LACCVGPRS))), make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5, 6, 7>{})); + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5, 6>{})); static_for<0, Acc0TileIterator::GetNumOfAccess(), 1>{}([&](auto i) { auto acc0_thread_idx = Acc0TileIterator::GetIndex(i) + acc0_thread_origin; - auto m_local = - block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; - auto n_local = - block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; + auto m_local = block_idx_to_m_l_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; + auto l_local = 
block_idx_to_m_l_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; auto m_global = m_local + m_block_data_idx_on_grid; - auto n_global = n_local + n_block_data_idx_on_grid; - if(c0_matrix_mask.IsMaskedElement(m_global, n_global)) + auto l_global = l_local + l_block_data_idx_on_grid; + if(c0_matrix_mask.IsMaskedElement(m_global, l_global)) { - acc_thread_buf(i) = -ck::NumericLimits::Infinity(); + acc0_thread_buf(i) = -ck::NumericLimits::Infinity(); } else { - acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); + acc_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); } }); } else - { static_for<0, acc_thread_buf.Size(), 1>{}( - [&](auto i) { acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); }); + { static_for<0, acc0_thread_buf.Size(), 1>{}( + [&](auto i) { acc_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); }); } @@ -697,7 +817,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle SoftmaxBuf& max = blockwise_softmax.max_value_buf; SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; - blockwise_softmax.Run(acc_thread_buf, workspace_buf); + blockwise_softmax.Run(acc0_thread_buf, workspace_buf); // TODO: may convert to log domain running_max_new = mathext::max(max, running_max); @@ -717,79 +837,80 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle acc1_thread_buf.Clear(); // preload data into LDS - b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + b1_blockwise_copy.RunRead(b1_grid_desc_l0_n_l1, b1_grid_buf); - b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_l0_n_l1, b1_block_slice_copy_step); block_sync_lds(); // wait for reduction LDS read - b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + b1_blockwise_copy.RunWrite(b1_block_desc_l0perblock_nperblock_l1, b1_block_buf); // main body - if constexpr(num_gemm1_k_block_inner_loop > 1) + if constexpr(num_gemm1_l_block_inner_loop > 1) { - static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) { - 
// Data cast from FloatAcc to FloatA happen here - a1_blockwise_copy.Run(acc_thread_desc_k0_m_k1, - make_tuple(Number{}, I0, I0), - acc_thread_buf, - a1_thread_desc_k0_m_k1, + static_for<0, num_gemm1_l_block_inner_loop - 1, 1>{}([&](auto i) { + // Data cast from FloatAcc0 to FloatA happen here + a1_blockwise_copy.Run(acc0_thread_desc_l0perblock_mperblock_l1, + make_tuple(Number{}, I0, I0), + acc0_thread_buf, + a1_thread_desc_l0perblock_mperblock_l1, make_tuple(I0, I0, I0), a1_thread_buf); - b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + b1_blockwise_copy.RunRead(b1_grid_desc_l0_n_l1, b1_grid_buf); block_sync_lds(); - gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); + blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); block_sync_lds(); - b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_l0_n_l1, b1_block_slice_copy_step); - b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + b1_blockwise_copy.RunWrite(b1_block_desc_l0perblock_nperblock_l1, b1_block_buf); }); } // tail { a1_blockwise_copy.Run( - acc_thread_desc_k0_m_k1, + acc0_thread_desc_l0perblock_mperblock_l1, make_tuple( - Number<(num_gemm1_k_block_inner_loop - 1) * A1ThreadSliceK0>{}, I0, I0), - acc_thread_buf, - a1_thread_desc_k0_m_k1, + Number<(num_gemm1_l_block_inner_loop - 1) * A1ThreadSliceL0PerBlock>{}, I0, I0), + acc0_thread_buf, + a1_thread_desc_l0perblock_mperblock_l1, make_tuple(I0, I0, I0), a1_thread_buf); block_sync_lds(); - gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); + blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); } } // end gemm1 - constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = - gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); - constexpr auto cm0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); - constexpr auto cn0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); - 
constexpr auto cm1 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); - constexpr auto cn1 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); - constexpr auto cm2 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); - constexpr auto cn2 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); - constexpr auto cn3 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); - constexpr auto cn4 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + constexpr auto c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = + blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); + constexpr auto c_mrepeat = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I0); + constexpr auto c_mwave = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I1); + constexpr auto c_mthreadpersubgroup = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I2); + constexpr auto c_nrepeat = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I3); + constexpr auto c_nwave = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I4); + constexpr auto c_nsubgroup = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I5); + constexpr auto c_naccvgprs = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I6); + constexpr auto c_thread_slice_desc_m_n = make_naive_tensor_descriptor_packed( - make_tuple(cm0 * cm1 * cm2, cn0 * cn1 * cn2 * cn3 * cn4)); + make_tuple(c_mrepeat * c_mwave * c_mthreadpersubgroup, + c_nrepeat * c_nwave * c_nsubgroup * c_naccvgprs)); constexpr auto c_thread_buf_slice_m = c_thread_slice_desc_m_n.GetLength(I0); constexpr auto c_thread_buf_slice_n = c_thread_slice_desc_m_n.GetLength(I1); static_for<0, c_thread_buf_slice_m, 
1>{}([&](auto iM) { static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) { auto I = Number{}; - FloatGemmAcc acc1 = acc1_thread_buf[I]; // P*V - FloatGemmAcc c = c_thread_buf[I]; // O - FloatGemmAcc c_new = + FloatAcc1 acc1 = acc1_thread_buf[I]; // P*V + FloatAcc1 c = c_thread_buf[I]; // O + FloatAcc1 c_new = (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + math::exp(max[iM] - running_max_new[iM]) * acc1) / running_sum_new[iM]; @@ -798,26 +919,26 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle }); }); - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_ak0_m_ak1, + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, a_block_reset_copy_step); // rewind K - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_bk0_n_bk1, - b_block_reset_copy_step); // rewind K and step N + b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc_k0_l_k1, + b0_block_reset_copy_step); // rewind K and step N // update before next j iteration running_max = running_max_new; running_sum = running_sum_new; block_sync_lds(); // wait for gemm1 LDS read - }while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); + }while(++gemm1_l_block_outer_index < num_gemm1_l_block_outer_loop); /*******************************************************************************/ // write out to C, implement shuffle { constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); // This API Provide All dimension (size) you need constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = - blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + 
blockwise_gemm0.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); @@ -852,7 +973,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); + const auto c_thread_mtx_on_block = blockwise_gemm0.CalculateCThreadOriginDataIndex(I0, I0); const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; @@ -877,7 +998,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // shuffle: threadwise copy C from VGPR to LDS auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3::type = false> struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 0672bf8e5b2..26ac87ea7f9 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -369,7 +369,7 @@ struct WmmaGemm static constexpr auto I5 = Number<5>{}; using CIndex = MultiIndex<2>; - using CIndex4D = MultiIndex<4>; + using CIndex3D = MultiIndex<3>; __host__ __device__ constexpr WmmaGemm() { @@ -421,6 +421,46 @@ struct WmmaGemm Sequence<5>{})); } + // Transposed WMMA Output C' = B' * A' + template + __host__ __device__ static constexpr auto + MakeCDesc_MBlockxRepeat_MWave_MThreadPerSubGroup_NBlockxRepeat_NWave_NSubGroup_NAccVgprs( + const CDesc_MBlockxRepeat_MWave_MPerWMMA_NBlockxRepeat_NWave_NPerWMMA& + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma) + { + const 
auto MBlockxRepeat = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I0); + const auto NBlockxRepeat = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I3); + const auto MWave = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I1); + const auto NWave = + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma.GetLength(I4); + + return transform_tensor_descriptor( + c_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma, + make_tuple( + make_pass_through_transform(MBlockxRepeat), + make_pass_through_transform(MWave), + make_pass_through_transform(Number{}), + make_pass_through_transform(NBlockxRepeat), + make_pass_through_transform(NWave), + make_unmerge_transform(make_tuple(Number{}, + Number{}))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + } + __device__ static constexpr index_t GetRegSizePerWmma() { return wmma_instr.num_acc_vgprs_per_wave; @@ -493,6 +533,14 @@ struct WmmaGemm return TransposeC ? CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; } + __device__ static CIndex3D GetBeginOfThreadBlk3D() + { + index_t n_offset = GetLaneIdUnderSubGroup(); + index_t m_offset = GetSubGroupId(); + + return TransposeC ? 
CIndex3D{n_offset, m_offset, I0} : CIndex3D{m_offset, n_offset, I0}; + } + static constexpr auto wmma = WmmaSelector{}; static constexpr auto wmma_instr = wmma.selected_wmma; From 74f0d5dee0cfb9719ef2640b3c5462a15117b1ae Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 14 Feb 2023 13:38:16 +0000 Subject: [PATCH 034/118] save debugging progress --- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 10 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 26 +++ include/ck/host_utility/kernel_launch.hpp | 6 +- .../gpu/block/blockwise_gemm_wmma.hpp | 190 ++++++++++++------ ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 29 ++- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 189 ++++++++++++----- .../gpu/grid/gridwise_gemm_wmma.hpp | 58 +++++- .../threadwise_tensor_slice_transfer.hpp | 5 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 4 +- .../library/utility/host_tensor_generator.hpp | 12 ++ 10 files changed, 383 insertions(+), 146 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index 7c771f23c10..759507185b6 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -39,8 +39,8 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; using ADataType = F16; using B0DataType = F16; using B1DataType = F16; -using Acc0DataType = F32; -using Acc1DataType = F32; +using Acc0DataType = F32; +using Acc1DataType = F32; using CShuffleDataType = F32; using CDataType = F16; using Acc0BiasDataType = ck::Tuple<>; @@ -125,12 +125,12 @@ using DeviceGemmInstance = S<4, 64, 1>, // B1BlockTransfer S<1, 0, 2>, S<1, 0, 2>, - 1, + 2, 8, 8, false, - 1, // CShuffleMXdlPerWavePerShuffle - 2, // CShuffleNXdlPerWavePerShuffle + 1, // 
CShuffleMWmmaPerWavePerShuffle + 2, // CShuffleNWmmaPerWavePerShuffle S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock 4, // CShuffleBlockTransferScalarPerVector_NPerBlock MaskingSpec>; // MaskingSpecialization diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 8aba9ccdcfa..4be9d908a88 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -117,6 +117,26 @@ int run(int argc, char* argv[]) b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); break; + case 4: // A, B0, B1 1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 5: // Rand: b1 ; unit: a b0 fail + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 6: // Rand: b0 ; unit: a b1 fail + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 7: // Rand: a ; unit: b0 b1 pass + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); @@ -220,6 +240,12 @@ int run(int argc, char* argv[]) a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); 
ref_gemm0_invoker.Run(ref_gemm0_argument); + // for(int i =0; i< 128; i++){ + // for(int j =0; j< 128; j++){ + // printf("%0.2lf ", acc0_g_m_n.mData[i*128 +j]); + // } + // printf("\n"); + // } // masking const auto mask = DeviceGemmInstance::C0MatrixMask(N); diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index ed6e2f0ba1d..8015eaf5df4 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -29,12 +29,12 @@ float launch_and_time_kernel(const StreamConfig& stream_config, block_dim.y, block_dim.z); - const int nrepeat = 10; + const int nrepeat = 1; - printf("Warm up 1 time\n"); + // printf("Warm up 1 time\n"); // warm up - kernel<<>>(args...); + // kernel<<>>(args...); printf("Start running %d times...\n", nrepeat); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 3cf62a0db66..b3f65aeabab 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -16,19 +16,36 @@ template -/* A: K0PerBlock x MPerBlock x K1 + index_t KPack, + bool TransposeC = false> +/* Option: Read from LDS, big buffer hold all threads required data + * Source + * A: K0PerBlock x MPerBlock x K1 * B: K0PerBlock x NPerBlock x K1 - * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs + * Destination + * C, non-transpose + * thread level: MRepeat x NRepeat x MAccVgprs + * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs * KPACK == WMMA_K = 16 + * + * Option: Read from VMEM, small buffer hold each thread own required data (Skip LDS) + * Source: + * A(if skip LDS): MRepeat x KPack + * B(if skip LDS): NRepeat x KPack + * Destination + * C, non-transpose + * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs */ 
-struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle +struct BlockwiseGemmWMMA { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -42,18 +59,10 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. static constexpr index_t WaveSize = 32; - static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); - static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); - static constexpr index_t KPerBlock = - BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); + static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I4); + static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I4); - static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); - static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); - static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); - static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); - - static constexpr auto wmma_gemm = - WmmaGemm{}; + static constexpr auto wmma_gemm = WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -79,6 +88,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); } + // Default, Block buffer in LDS, thread level offset enabled __device__ static auto CalculateAThreadOriginDataIndex() { const auto wave_idx = GetWaveIdx(); @@ -129,23 +139,63 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle return make_tuple(c_thread_m, c_thread_n); } - // using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); - // __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle( - // Tuple4 a_origin = CalculateAThreadOriginDataIndex(), - // Tuple4 b_origin = CalculateBThreadOriginDataIndex()) - // : 
a_thread_copy_(a_origin), b_thread_copy_(b_origin) - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle() + template + __device__ static auto CalculateCThreadOriginDataIndex7D(Number, Number) { - static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && - BK0NK1BlockDesc::IsKnownAtCompileTime(), + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk3D(); + + return make_tuple(Number{}, + blk_idx[I0], + waveId_m, + Number{}, + waveId_n, + blk_idx[I1], + blk_idx[I2]); + } + + using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); + __host__ __device__ BlockwiseGemmWMMA(Tuple5 a_origin = CalculateAThreadOriginDataIndex(), + Tuple5 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + { + static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time"); static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); - static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && - NPerBlock % (NPerWMMA * NRepeat) == 0, + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); + + // printf("tid %03d, Mat-B offset %d\n", get_thread_local_1d_id()%32, CalculateBThreadOriginDataIndex().At(Number<3>{})); + } + + // transposed WMMA output C' = B' * A' + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + // constexpr auto NSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; + // constexpr auto MThreadPerSubGroup = 
c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + I1, + Number{}, + I1, + I1, + NAccVgprs)); } // Thread level, register decriptor. Vector-write @@ -171,9 +221,31 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle MAccVgprs)); } - // Provide dimension size + template __host__ __device__ static constexpr auto - GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // transposed WMMA output C' = B' * A' + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() { constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = make_naive_tensor_descriptor_packed(make_tuple(Number{}, @@ -184,37 +256,31 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle Number{})); return wmma_gemm - 
.MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + .MakeCDesc_MBlockxRepeat_MWave_MThreadPerSubGroup_NBlockxRepeat_NWave_NSubGroup_NAccVgprs( c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); } - __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() + // Provide dimension size + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() { - return transform_tensor_descriptor( - AK0MK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); - __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() - { - return transform_tensor_descriptor( - BK0NK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); } + // Describe how data allocated in thread copy src buffer // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma - static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); - static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); + static constexpr 
ABlockDesc a_block_desc_k0_m0_m1_m2_k1; + static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1; template __device__ void Run(const ABlockBuffer& a_block_buf, @@ -235,6 +301,9 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle a_thread_desc_, make_tuple(I0, m0, I0, I0, I0), a_thread_buf); + // static_for<0, a_thread_buf.size(), 1>{}([&](auto i) { + // a_thread_buf(i) = 1; + // }); static_for<0, NRepeat, 1>{}([&](auto n0) { // read B @@ -254,6 +323,9 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle b_thread_vec.template AsType()(i) = b_thread_buf[Number{}]; + + // a_thread_vec.template AsType()(i) = 1; + // b_thread_vec.template AsType()(i) = 1; }); using wmma_input_type_a = typename vector_type::type; @@ -261,6 +333,12 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + + // printf("GPU Gemm0 input, Tid %03d, A%2d = %04x, B%2d = %0x4\n", + // get_thread_local_1d_id(), + // i.value, *(reinterpret_cast(&a_thread_vec.template AsType()(i))), + // i.value, *(reinterpret_cast(&b_thread_vec.template AsType()(i)))); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), @@ -304,10 +382,8 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle B_K1, B_K1>; - AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; - BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; - // AThreadCopy a_thread_copy_; - // BThreadCopy b_thread_copy_; + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; }; // block wise level pipe designed for inline asm @@ -601,7 +677,9 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); + // TODO: Fix it, MRepeat < NRepeat constexpr auto RepeatDiff = MRepeat - NRepeat; + // Read all Mrepeat, Nrepeat static_for<0, NRepeat, 1>{}([&](auto iN) { b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, 
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 84ef50b6895..d609ba4290d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -145,14 +145,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B0Spec, B1Spec, CSpec>; - // K1 = Max Vector Access Pixels - static constexpr auto K1Number = Number{}; - - static constexpr auto matrix_padder = - MatrixPadder{MPerBlock, NPerBlock, K0PerBlock* K1}; static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + const std::vector& a_gs_ms_ks_strides_vec) { return Transform::MakeAGridDescriptor_AK0_M_AK1( Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), @@ -160,20 +155,18 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } static auto MakeB0GridDescriptor_BK0_L_BK1(const std::vector& b0_gs_ls_ks_lengths_vec, - const std::vector& b0_gs_ls_ks_strides_vec) + const std::vector& b0_gs_ls_ks_strides_vec) { return Transform::MakeB0GridDescriptor_BK0_N_BK1( Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, b0_gs_ls_ks_strides_vec), Number{}); } - static auto - MakeB1GridDescriptor_BL0_N_BL1(const std::vector& b1_gs_ns_ls_lengths_vec, - const std::vector& b1_gs_ns_ls_strides_vec) + static auto MakeB1GridDescriptor_BL0_N_BL1(const std::vector& b1_gs_ns_ls_lengths_vec, + const std::vector& b1_gs_ns_ls_strides_vec) { return Transform::MakeB1GridDescriptor_BK0_N_BK1( - Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, - b1_gs_ns_ls_strides_vec), + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, b1_gs_ns_ls_strides_vec), Number{}); } @@ 
-462,8 +455,6 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle const auto K = arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); auto launch_kernel = [&](auto has_main_k_block_loop) { - constexpr bool has_main_loop = has_main_k_block_loop.value; - const auto kernel = kernel_batched_gemm_softmax_gemm_wmma_cshuffle< GridwiseOp, ADataType, @@ -482,7 +473,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle ComputeBasePtrOfStridedBatch, C0MatrixMask, typename GridwiseOp::DefaultBlock2CTileMap, - has_main_loop>; + has_main_k_block_loop>; return launch_and_time_kernel(stream_config, kernel, @@ -754,11 +745,15 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle << K0PerBlock << ", " << K1 << ", " << MPerBlock << ", " - << NPerWMMA << ", " - << MPerBlock << ", " << NPerBlock << ", " << L0PerBlock << ", " << L1 + << getGemmSpecializationString(GemmSpec) << ", " + << "ASpec" << getTensorSpecializationString(ASpec) << ", " + << "B0Spec" << getTensorSpecializationString(B0Spec) << ", " + << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " + << "CSpec" << getTensorSpecializationString(CSpec) << ", " + << getMaskingSpecializationString(MaskingSpec) << ">" << " NumPrefetch: " << NumPrefetch << ", " diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 9300c1df595..12030b4d93e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -190,22 +190,89 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static constexpr auto I6 = Number<6>{}; static constexpr auto I7 = Number<7>{}; - // K1Value should be Number<...> static constexpr auto AK0 = Number{}; static constexpr auto AK1 = Number{}; static constexpr auto BK0 = 
Number{}; static constexpr auto BK1 = Number{}; - static constexpr auto L0 = Number{}; - static constexpr auto L1 = Number{}; - static constexpr auto Gemm0MWaves = MPerBlock / (MPerWmma * MRepeat); - static constexpr auto Gemm0LWaves = L0PerBlock * L1Value / (LPerWmma * LRepeat); + static constexpr auto AL0 = Number{}; + static constexpr auto AL1 = Number{}; + static constexpr auto BL0 = Number{}; + static constexpr auto BL1 = Number{}; using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = remove_cvref_t())>; + template + __host__ __device__ static constexpr auto + MakeA0BlockDescriptor_K0_M0_M1_M2_K1(const A0BlockDesc_AK0_M_AK1&) + { + constexpr index_t A_K0 = A0BlockDesc_AK0_M_AK1{}.GetLength(I0); + constexpr index_t A_K1 = A0BlockDesc_AK0_M_AK1{}.GetLength(I2); + constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + + return transform_tensor_descriptor( + A0BlockDesc_AK0_M_AK1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + template + __host__ __device__ static constexpr auto + MakeB0BlockDescriptor_K0_L0_L1_L2_K1(const B0BlockDesc_BK0_L_BK1&) + { + constexpr index_t B_K0 = B0BlockDesc_BK0_L_BK1{}.GetLength(I0); + constexpr index_t B_K1 = B0BlockDesc_BK0_L_BK1{}.GetLength(I2); + constexpr index_t LWaves = LPerBlock / (LRepeat * LPerWmma); + return transform_tensor_descriptor( + B0BlockDesc_BK0_L_BK1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + template + __host__ __device__ static constexpr auto + MakeA1BlockDescriptor_L0_M0_M1_M2_L1(const 
A1BlockDesc_AL0_M_AL1&) + { + constexpr index_t A_L0 = A1BlockDesc_AL0_M_AL1{}.GetLength(I0); + constexpr index_t A_L1 = A1BlockDesc_AL0_M_AL1{}.GetLength(I2); + + return transform_tensor_descriptor( + A1BlockDesc_AL0_M_AL1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, I1, I1)), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + template + __host__ __device__ static constexpr auto MakeB1BlockDescriptor_L0_N0_N1_N2_L1(const B1BlockDesc_BL0_N_BL1&) + { + constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); + constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); + constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); + return transform_tensor_descriptor( + B1BlockDesc_BL0_N_BL1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() { // A matrix in LDS memory, dst of blockwise copy @@ -226,8 +293,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle { // B1 matrix in LDS memory, dst of blockwise copy return make_naive_tensor_descriptor( - make_tuple(L0, Number{}, L1), - make_tuple(Number{} * L1, L1, I1)); + make_tuple(BL0, Number{}, BL1), + make_tuple(Number{} * BL1, BL1, I1)); } __host__ __device__ static constexpr auto @@ -374,7 +441,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static constexpr auto b1_block_desc_bl0_n_bl1 = GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); - static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), L1); + static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), 
BL1); static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); @@ -451,8 +518,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - // constexpr auto max_lds_align = K1Value; - constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); constexpr auto b0_block_desc_k0perblock_lperblock_k1 = GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); // A matrix blockwise copy @@ -491,7 +558,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle B0ElementwiseOperation, ck::tensor_operation::element_wise::PassThrough, InMemoryDataOperationEnum::Set, - Sequence, + Sequence, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, FloatB0, @@ -520,23 +587,27 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1Value, WmmaK); - auto blockwise_gemm0 = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; + auto blockwise_gemm0 = BlockwiseGemmWMMA< + BlockSize, + FloatA, + FloatB0, + FloatAcc0, + decltype(MakeA0BlockDescriptor_K0_M0_M1_M2_K1(a_block_desc_k0perblock_mperblock_k1)), + decltype(MakeB0BlockDescriptor_K0_L0_L1_L2_K1(b0_block_desc_k0perblock_lperblock_k1)), + MPerBlock, + LPerBlock, + K0PerBlock * K1Value, + MPerWmma, + LPerWmma, + MRepeat, + LRepeat, + KPack, + true>{}; // C' = B' x A' + // Prepare Register for A*B0 matrix auto acc0_thread_buf = blockwise_gemm0.GetCThreadBuffer(); - // Acc matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type constexpr auto 
acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); @@ -550,7 +621,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto acc0_thread_desc_l0perblock_mperblock_l1 = transform_tensor_descriptor( acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, - make_tuple(make_merge_transform_v3_division_mod(make_tuple(lrepeat, lrepeat, lsubgroup)), + make_tuple(make_merge_transform_v3_division_mod(make_tuple(lrepeat, lwave, lsubgroup)), make_merge_transform_v3_division_mod(make_tuple(mrepeat, mwave, mthreadpersubgroup)), make_pass_through_transform(laccvgprs)), make_tuple(Sequence<3, 4, 5>{}, Sequence<0, 1, 2>{}, Sequence<6>{}), @@ -564,9 +635,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle b0_block_desc_k0perblock_lperblock_k1.GetElementSpaceSize()); // Shift Per SUB_K - constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); constexpr auto b0_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - const auto a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); + const auto a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); const auto b0_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); @@ -587,7 +658,16 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto t_lwave = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I4); constexpr auto t_lsubgroup = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I5); constexpr auto t_laccvgprs = 
thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I6); - + if(get_thread_local_1d_id()==0){ + printf("t_mrepeat %d, t_mwave %d, t_mthreadpersubgroup %d, t_lrepeat %d, t_lwave %d, t_lsubgroup %d, t_laccvgprs %d \n", + t_mrepeat.value, + t_mwave.value, + t_mthreadpersubgroup.value, + t_lrepeat.value, + t_lwave.value, + t_lsubgroup.value, + t_laccvgprs.value); + } // get acc0 thread map constexpr auto m0_l_m1_to_m_l_adaptor = make_single_stage_tensor_adaptor( make_tuple(make_unmerge_transform(make_tuple(t_mrepeat * t_mwave, t_mthreadpersubgroup)), @@ -628,11 +708,11 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // B1 matrix in LDS memory, dst of blockwise copy constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); - constexpr auto b1_block_slice_copy_step = make_multi_index(L0PerBlock, 0, 0); + constexpr auto b1_block_slice_copy_step = make_multi_index(BL0, 0, 0); // A1 matrix in VGPR constexpr auto A1ThreadSlice_L0PerBlock_MPerBlock_L1 = make_tuple( - Number{}, + Number{}, Number{}, Number{}); // Data duplicated dimension @@ -665,10 +745,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // B1 matrix blockwise copy auto b1_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1, + Sequence, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, FloatB1, @@ -700,22 +780,25 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle b1_block_desc_l0perblock_nperblock_l1.GetElementSpaceSize()); auto blockwise_gemm1 = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; + BlockwiseGemmWMMA{make_tuple(0, 0, 0, 0, 0)}; auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); const index_t num_gemm1_l_block_outer_loop = b0_grid_desc_k0_l_k1.GetLength(I1) / LPerBlock; - constexpr index_t num_gemm1_l_block_inner_loop = LPerBlock / (L0PerBlock * L1Value); + 
constexpr index_t num_gemm1_l_block_inner_loop = LPerBlock / (BL0 * BL1); // Initialize C StaticBuffer c_thread_buf; @@ -809,15 +892,19 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle } - block_sync_lds(); + block_sync_lds(); // gemm0 end - + // gemm0 incorrect // Tiled softmax start // softmax SoftmaxBuf& max = blockwise_softmax.max_value_buf; SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; - + // printf("GPU Gemm 0, Tid %03d, GPU acc0 = %lf\n", get_thread_local_1d_id(), acc0_thread_buf[I0]); + // static_for<0, acc0_thread_buf.Size(), 1>{}([&](auto i) { + // printf("GPU Gemm0, Tid %03d, GPU acc%d = %lf\n", get_thread_local_1d_id(), i.value, acc0_thread_buf[i]); + // }); blockwise_softmax.Run(acc0_thread_buf, workspace_buf); + // printf("GPU SoftMax, Tid %03d, GPU acc0 = %lf\n", get_thread_local_1d_id(), acc0_thread_buf[I0]); // TODO: may convert to log domain running_max_new = mathext::max(max, running_max); @@ -862,6 +949,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle block_sync_lds(); + // printf("GPU permute lanex, Tid %03d, GPU 0 = %04x\n", get_thread_local_1d_id(), *(reinterpret_cast(&a1_thread_buf[I0]))); + blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); block_sync_lds(); @@ -934,11 +1023,11 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // write out to C, implement shuffle { constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); // This API Provide All dimension (size) you need constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = - blockwise_gemm0.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + 
blockwise_gemm1.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); @@ -973,7 +1062,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm0.CalculateCThreadOriginDataIndex(I0, I0); + const auto c_thread_mtx_on_block = blockwise_gemm1.CalculateCThreadOriginDataIndex(I0, I0); const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index fda0464caa5..f2c6495994b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -140,6 +140,39 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma using GridwiseGemmPipe = remove_cvref_t())>; + template + __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1(const ABlockDesc_AK0_M_AK1&) + { + constexpr index_t A_K0 = ABlockDesc_AK0_M_AK1{}.GetLength(I0); + constexpr index_t A_K1 = ABlockDesc_AK0_M_AK1{}.GetLength(I2); + constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + + return transform_tensor_descriptor( + ABlockDesc_AK0_M_AK1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + + template + __host__ __device__ static constexpr auto 
MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); + constexpr index_t B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); + constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); + return transform_tensor_descriptor( + BBlockDesc_BK0_N_BK1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() { constexpr auto max_lds_align = K1; @@ -414,17 +447,20 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle{}; + BlockwiseGemmWMMA{}; // Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 360336751b9..4f8cbd9855d 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1382,6 +1382,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // copy data from src_buf into dst_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + // idx_md err. 
as dst access 2 strided elements while src visit 1 per loop constexpr index_t src_offset = src_desc.CalculateOffset( src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); @@ -1396,13 +1397,13 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow if(get_thread_local_1d_id() % 32 > 16){ // apply type convert dst_buf(Number{}) = type_convert(v); - dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), + dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), type_convert(v), LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); } else{ // apply type convert - dst_buf(Number{}) = type_convert(v); + dst_buf(Number{}) = type_convert(v); dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), type_convert(v), LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 26ac87ea7f9..80c72f5d2e6 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -517,12 +517,12 @@ struct WmmaGemm __host__ __device__ static auto CalculateAThreadOriginDataIndex() { - return GetSwizzledLaneIdLow(); + return TransposeC ? GetLaneIdUnderSubGroup() : GetSwizzledLaneIdLow(); } __host__ __device__ static auto CalculateBThreadOriginDataIndex() { - return GetLaneIdUnderSubGroup(); + return TransposeC ? 
GetSwizzledLaneIdLow() : GetLaneIdUnderSubGroup(); } __device__ static CIndex GetBeginOfThreadBlk() diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/library/include/ck/library/utility/host_tensor_generator.hpp index 4259862e65e..9421ebed268 100644 --- a/library/include/ck/library/utility/host_tensor_generator.hpp +++ b/library/include/ck/library/utility/host_tensor_generator.hpp @@ -55,6 +55,18 @@ struct GeneratorTensor_1 } }; +template +struct GeneratorTensor_dec1 +{ + T value = 0.1; + + template + T operator()(Is...) + { + return value; + } +}; + template struct GeneratorTensor_2 { From 4ddda63ba87c5c7c9bf728083d49bd788d0381e3 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 16 Feb 2023 02:35:31 +0000 Subject: [PATCH 035/118] sanity check pass --- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 14 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 8 +- .../gpu/block/blockwise_gemm_wmma.hpp | 14 -- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 140 ++++++++---------- .../threadwise_tensor_slice_transfer.hpp | 19 ++- include/ck/utility/data_type.hpp | 26 ++++ 6 files changed, 109 insertions(+), 112 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index 759507185b6..b738af6b276 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -108,26 +108,26 @@ using DeviceGemmInstance = 1, // MRepeat 8, // LRepeat 4, // NRepeat - S<4, 64, 1>, // ABlockTransfer + S<4, 64, 1>, // ABlockTransfer MK -> K0 M K1 S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<4, 64, 1>, // B0BlockTransfer + S<4, 64, 1>, // B0BlockTransfer LK -> K0 L K1 S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<4, 64, 1>, // B1BlockTransfer - S<1, 0, 2>, - S<1, 
0, 2>, - 2, - 8, + S<4, 8, 8>, // B1BlockTransfer LN -> L0 N L1 + S<0, 2, 1>, + S<0, 2, 1>, + 1, 8, + 1, false, 1, // CShuffleMWmmaPerWavePerShuffle 2, // CShuffleNWmmaPerWavePerShuffle diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 4be9d908a88..cb7b2a54cdb 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -127,7 +127,7 @@ int run(int argc, char* argv[]) b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; - case 6: // Rand: b0 ; unit: a b1 fail + case 6: // Rand: b0 ; unit: a b1 pass a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); @@ -240,12 +240,6 @@ int run(int argc, char* argv[]) a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); ref_gemm0_invoker.Run(ref_gemm0_argument); - // for(int i =0; i< 128; i++){ - // for(int j =0; j< 128; j++){ - // printf("%0.2lf ", acc0_g_m_n.mData[i*128 +j]); - // } - // printf("\n"); - // } // masking const auto mask = DeviceGemmInstance::C0MatrixMask(N); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index b3f65aeabab..7f084aef682 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -171,8 +171,6 @@ struct BlockwiseGemmWMMA static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); - - // printf("tid %03d, Mat-B offset %d\n", get_thread_local_1d_id()%32, 
CalculateBThreadOriginDataIndex().At(Number<3>{})); } // transposed WMMA output C' = B' * A' @@ -301,9 +299,6 @@ struct BlockwiseGemmWMMA a_thread_desc_, make_tuple(I0, m0, I0, I0, I0), a_thread_buf); - // static_for<0, a_thread_buf.size(), 1>{}([&](auto i) { - // a_thread_buf(i) = 1; - // }); static_for<0, NRepeat, 1>{}([&](auto n0) { // read B @@ -323,9 +318,6 @@ struct BlockwiseGemmWMMA b_thread_vec.template AsType()(i) = b_thread_buf[Number{}]; - - // a_thread_vec.template AsType()(i) = 1; - // b_thread_vec.template AsType()(i) = 1; }); using wmma_input_type_a = typename vector_type::type; @@ -333,12 +325,6 @@ struct BlockwiseGemmWMMA constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - - - // printf("GPU Gemm0 input, Tid %03d, A%2d = %04x, B%2d = %0x4\n", - // get_thread_local_1d_id(), - // i.value, *(reinterpret_cast(&a_thread_vec.template AsType()(i))), - // i.value, *(reinterpret_cast(&b_thread_vec.template AsType()(i)))); wmma_gemm.template Run( a_thread_vec.template AsType()(Number<0>{}), diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 12030b4d93e..ef88fb82e69 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -658,16 +658,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto t_lwave = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I4); constexpr auto t_lsubgroup = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I5); constexpr auto t_laccvgprs = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I6); - if(get_thread_local_1d_id()==0){ - printf("t_mrepeat %d, t_mwave %d, t_mthreadpersubgroup %d, 
t_lrepeat %d, t_lwave %d, t_lsubgroup %d, t_laccvgprs %d \n", - t_mrepeat.value, - t_mwave.value, - t_mthreadpersubgroup.value, - t_lrepeat.value, - t_lwave.value, - t_lsubgroup.value, - t_laccvgprs.value); - } // get acc0 thread map constexpr auto m0_l_m1_to_m_l_adaptor = make_single_stage_tensor_adaptor( make_tuple(make_unmerge_transform(make_tuple(t_mrepeat * t_mwave, t_mthreadpersubgroup)), @@ -744,28 +734,28 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // B1 matrix blockwise copy auto b1_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - B1BlockTransferThreadClusterLengths_L0_N_L1, - B1BlockTransferThreadClusterArrangeOrder, - FloatB1, - FloatB1, - decltype(b1_grid_desc_l0_n_l1), - decltype(b1_block_desc_l0perblock_nperblock_l1), - B1BlockTransferSrcAccessOrder, - Sequence<1, 0, 2>, - B1BlockTransferSrcVectorDim, - 2, - B1BlockTransferSrcScalarPerVector, - B1BlockTransferDstScalarPerVector_L1, - 1, - 1, - B1ThreadTransferSrcResetCoordinateAfterRun, - true, // DstResetCoord - NumGemmKPrefetchStage>( + ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, +/* typename SrcElementwiseOperation, */ B1ElementwiseOperation, +/* typename DstElementwiseOperation, */ tensor_operation::element_wise::PassThrough, +/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, +/* typename BlockSliceLengths, */ Sequence, +/* typename ThreadClusterLengths, */ B1BlockTransferThreadClusterLengths_L0_N_L1, +/* typename ThreadClusterArrangeOrder, */ B1BlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ FloatB1, +/* typename DstData, */ FloatB1, +/* typename SrcDesc, */ decltype(b1_grid_desc_l0_n_l1), +/* typename DstDesc, */ decltype(b1_block_desc_l0perblock_nperblock_l1), +/* typename SrcDimAccessOrder, */ B1BlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<1, 0, 2>, +/* index_t SrcVectorDim, */ B1BlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ 
B1BlockTransferSrcScalarPerVector, +/* index_t DstScalarPerVector, */ B1BlockTransferDstScalarPerVector_L1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool ThreadTransferSrcResetCoordinateAfterRun, */ B1ThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, // DstResetCoord + NumGemmKPrefetchStage>( b1_grid_desc_l0_n_l1, make_multi_index(0, n_block_data_idx_on_grid, 0), b1_element_op, @@ -793,7 +783,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle NPerWmma, MRepeat, NRepeat, - KPack>{make_tuple(0, 0, 0, 0, 0)}; + KPack, + true>{make_tuple(0, 0, 0, 0, 0)}; auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); @@ -899,18 +890,14 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // softmax SoftmaxBuf& max = blockwise_softmax.max_value_buf; SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; - // printf("GPU Gemm 0, Tid %03d, GPU acc0 = %lf\n", get_thread_local_1d_id(), acc0_thread_buf[I0]); - // static_for<0, acc0_thread_buf.Size(), 1>{}([&](auto i) { - // printf("GPU Gemm0, Tid %03d, GPU acc%d = %lf\n", get_thread_local_1d_id(), i.value, acc0_thread_buf[i]); - // }); - blockwise_softmax.Run(acc0_thread_buf, workspace_buf); - // printf("GPU SoftMax, Tid %03d, GPU acc0 = %lf\n", get_thread_local_1d_id(), acc0_thread_buf[I0]); + blockwise_softmax.Run(acc0_thread_buf, workspace_buf); + // TODO: may convert to log domain running_max_new = mathext::max(max, running_max); running_sum_new = mathext::exp(running_max - running_max_new) * running_sum + mathext::exp(max - running_max_new) * sum; - + // gemm1 { // TODO: explore using dynamic buffer for a1 thread buffer @@ -949,8 +936,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle block_sync_lds(); - // printf("GPU permute lanex, Tid %03d, GPU 0 = %04x\n", get_thread_local_1d_id(), *(reinterpret_cast(&a1_thread_buf[I0]))); - blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); block_sync_lds(); @@ 
-973,7 +958,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle a1_thread_buf); block_sync_lds(); - + blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); } } // end gemm1 @@ -997,8 +982,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static_for<0, c_thread_buf_slice_m, 1>{}([&](auto iM) { static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) { auto I = Number{}; - FloatAcc1 acc1 = acc1_thread_buf[I]; // P*V - FloatAcc1 c = c_thread_buf[I]; // O + FloatAcc1 acc1 = acc1_thread_buf[I]; // P*V + FloatAcc1 c = c_thread_buf[I]; // O FloatAcc1 c_new = (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + math::exp(max[iM] - running_max_new[iM]) * acc1) / @@ -1022,18 +1007,18 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // write out to C, implement shuffle { - constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = - blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + constexpr auto c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = + blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); // This API Provide All dimension (size) you need - constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = - blockwise_gemm1.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp = + blockwise_gemm1.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); - constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); - constexpr auto MSubGroup = 
c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); - constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I4); - constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I5); - constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I6); + constexpr auto MWave = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I1); + constexpr auto MThreadPerSubGroup = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I2); + constexpr auto NWave = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I4); + constexpr auto NSubGroup = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I5); + constexpr auto NAccVgprs = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I6); // LDS descriptor, shuffle and write out in MRepeat x NRepeat times constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = @@ -1043,22 +1028,23 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static_cast(p_shared), c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); - constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( + constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = transform_tensor_descriptor( c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, make_tuple( make_freeze_transform(I0), make_unmerge_transform(make_tuple( Number{}, // MRepeat per shuffle repeat MWave, // MWave - MSubGroup, // MSubGroup * MAccVgprs = MPerWmma - MAccVgprs)), + 
MThreadPerSubGroup // MThreadPerSubGroup = MPerWmma + )), make_freeze_transform(I0), make_unmerge_transform(make_tuple( Number{}, // NRepeat per shuffle repeat NWave, // NWave - NThreadPerSubGroup))), // NThreadPerSubGroup = NPerWmma + NSubGroup, + NAccVgprs))), // NSubGroup * NAccVgprs = NPerWmma make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<>{}, Sequence<0, 1, 2, 6>{}, Sequence<>{}, Sequence<3, 4, 5>{})); + make_tuple(Sequence<>{}, Sequence<0, 1, 2>{}, Sequence<>{}, Sequence<3, 4, 5, 6>{})); // calculate origin of thread output tensor on global memory // blockwise GEMM c matrix starting index @@ -1067,30 +1053,30 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; - const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + const auto m_thread_data_on_block_to_mrepeat_mwave_mthreadpersubgroup_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), - make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), make_tuple(Sequence<0>{})); - const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor = + const auto n_thread_data_on_block_to_nrepeat_nwave_nsubgroup_naccvgprs_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), - make_tuple(Sequence<0, 1, 2>{}), + make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NSubGroup, NAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), make_tuple(Sequence<0>{})); - const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( + const auto m_thread_data_on_block_idx = 
m_thread_data_on_block_to_mrepeat_mwave_mthreadpersubgroup_adaptor.CalculateBottomIndex( make_multi_index(m_thread_data_on_block)); - const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( + const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nsubgroup_naccvgprs_adaptor.CalculateBottomIndex( make_multi_index(n_thread_data_on_block)); // shuffle: threadwise copy C from VGPR to LDS auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3, + NAccVgprs>, Sequence<0, 1, 2, 3, 4, 5, 6>, 6, - 1, // vector write pixel + 8, // vector write pixel InMemoryDataOperationEnum::Set, 1, true>{ - c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, make_multi_index(0, m_thread_data_on_block_idx[I1], m_thread_data_on_block_idx[I2], 0, n_thread_data_on_block_idx[I1], n_thread_data_on_block_idx[I2], - m_thread_data_on_block_idx[I3]), + n_thread_data_on_block_idx[I3]), ck::tensor_operation::element_wise::PassThrough{}}; // shuffle: blockwise copy C from LDS to global @@ -1144,7 +1130,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // space filling curve for local reg & global memory // space filling curve for threadwise C in VGPR constexpr auto sfc_c_vgpr = - SpaceFillingCurve, + SpaceFillingCurve, Sequence<0, 1, 2, 3, 4, 5, 6>, Sequence>{}; + NAccVgprs>>{}; // space filling curve for shuffled blockwise C in global mem constexpr auto sfc_c_global = @@ -1172,10 +1158,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle block_sync_lds(); // each thread write its data from VGPR to LDS - c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, sfc_c_vgpr.GetIndexTupleOfNumber(access_id), 
c_thread_buf, - c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, c_shuffle_block_buf); // make sure it's safe to read from LDS diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 4f8cbd9855d..de67901c068 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1394,19 +1394,24 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // apply element-wise operation element_op_(v, src_buf[Number{}]); - if(get_thread_local_1d_id() % 32 > 16){ + if(get_thread_local_1d_id() % 32 < 16){ // apply type convert dst_buf(Number{}) = type_convert(v); - dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), - type_convert(v), - LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); } else{ // apply type convert dst_buf(Number{}) = type_convert(v); - dst_buf(Number{}) = __builtin_amdgcn_permlanex16(type_convert(dst_buf(Number{})), - type_convert(v), - LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); + } + SrcData d = 0; + int temp = 0; + temp = __builtin_amdgcn_permlanex16(temp, type_convert(v), + LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); + d = type_convert(temp); + if(get_thread_local_1d_id() % 32 < 16){ + dst_buf(Number{}) = type_convert(d); + } + else{ + dst_buf(Number{}) = type_convert(d); } }); }); diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 40ee8b617e2..504bd49c5a9 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -964,6 +964,32 @@ inline __host__ __device__ constexpr float type_convert(bhalf_t return u.fp32; } +template <> +inline __host__ __device__ constexpr int type_convert(float x) +{ + union + { + float fp32; 
+ int int32; + } u = {x}; + // u.fp32 = x; + + return u.int32; +} + +template <> +inline __host__ __device__ constexpr float type_convert(int x) +{ + union + { + int int32; + float fp32; + } u = {x}; + // u.fp32 = x; + + return u.fp32; +} + // convert fp32 to bfp16 template <> inline __host__ __device__ constexpr bhalf_t type_convert(float x) From 27dc055bf58af5bf037e5ed61bcadbe156ddfdad Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 16 Feb 2023 07:34:13 +0000 Subject: [PATCH 036/118] fix a host tensor bug and clean up flash-attn code --- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 60 ++++++++++++------- include/ck/host_utility/kernel_launch.hpp | 6 +- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 8 +++ ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 5 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 15 +++-- .../threadwise_tensor_slice_transfer.hpp | 39 ++++++------ include/ck/utility/data_type.hpp | 2 - .../ck/library/utility/host_tensor.hpp | 2 +- 8 files changed, 83 insertions(+), 54 deletions(-) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index 2a2e8899d10..1207339221e 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -43,9 +43,10 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough; using BElementOp = ck::tensor_operation::element_wise::PassThrough; using CDEElementOp = ck::tensor_operation::element_wise::Add; -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; -static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto ASpec = 
ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto BSpec = ck::tensor_operation::device::TensorSpecialization::Default; static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; using DeviceOpInstanceKKNN = @@ -64,18 +65,18 @@ using DeviceOpInstanceKKNN = BElementOp, CDEElementOp, GemmSpec, - ABSpec, - ABSpec, + ASpec, + BSpec, DESpec, 256, 128, - 256, - 8, + 128, + 4, 8, 16, 16, 4, - 4, + 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -252,21 +253,6 @@ int main(int argc, char* argv[]) ck::index_t K0 = 2048; - // A[G0, G1, M0, M1, K0] - std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; - std::vector a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1}; - // B[G0, G1, N0, N1, K0] - std::vector b_gs_ns_ks_lengths{G0, G1, N0, N1, K0}; - std::vector b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1}; - - // D[G0, G1, M0, N0, M1, N1] - std::vector d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; - std::vector d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1}; - // E[G0, G1, M0, N0, M1, N1] - std::vector e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; - std::vector e_gs_ms_ns_strides{ - G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1}; - if(argc == 1) { // use default case @@ -277,13 +263,43 @@ int main(int argc, char* argv[]) init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); } + else if(argc == 11) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + G0 = std::stoi(argv[4]); + G1 = std::stoi(argv[5]); + M0 = std::stoi(argv[6]); + M1 = std::stoi(argv[7]); + N0 = std::stoi(argv[8]); + N1 = std::stoi(argv[9]); + K0 = std::stoi(argv[10]); + } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4-10: G0, G1, M0, M1, N0, N1, K0\n"); exit(0); } 
+ + // A[G0, G1, M0, M1, K0] + std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1}; + // B[G0, G1, N0, N1, K0] + std::vector b_gs_ns_ks_lengths{G0, G1, N0, N1, K0}; + std::vector b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1}; + + // D[G0, G1, M0, N0, M1, N1] + std::vector d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1}; + // E[G0, G1, M0, N0, M1, N1] + std::vector e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector e_gs_ms_ns_strides{ + G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1}; + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index 8015eaf5df4..9b89676259c 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -29,12 +29,12 @@ float launch_and_time_kernel(const StreamConfig& stream_config, block_dim.y, block_dim.z); - const int nrepeat = 1; + const int nrepeat = 100; - // printf("Warm up 1 time\n"); + printf("Warm up 1 time\n"); // warm up - // kernel<<>>(args...); + kernel<<>>(args...); printf("Start running %d times...\n", nrepeat); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 8fc6358e095..da71a3b067c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -771,6 +771,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle { if 
constexpr(!(is_same_v || is_same_v)) { + printf("DeviceOp: Arch check failure\n"); return false; } } @@ -785,6 +786,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle arg.e_grid_desc_m_n_, arg.block_2_ctile_map_)) { + printf("GridwiseOp: Validity check failure\n"); return false; } @@ -799,6 +801,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle if(!(arg.a_mz_stride_ == 1 && arg.a_grid_desc_k0_m_k1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) { + printf("DeviceOp: Vector Access A-m check failure\n"); return false; } } @@ -807,6 +810,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle if(!(arg.a_kz_stride_ == 1 && arg.a_grid_desc_k0_m_k1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) { + printf("DeviceOp: Vector Access A-k check failure\n"); return false; } } @@ -817,6 +821,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle if(!(arg.b_nz_stride_ == 1 && arg.b_grid_desc_k0_n_k1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) { + printf("DeviceOp: Vector Access B-n check failure\n"); return false; } } @@ -825,6 +830,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle if(!(arg.b_kz_stride_ == 1 && arg.b_grid_desc_k0_n_k1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) { + printf("DeviceOp: Vector Access B-k check failure\n"); return false; } } @@ -838,6 +844,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle CDEShuffleBlockTransferScalarPerVector_NPerBlock == 0)) { + printf("DeviceOp: Vector Access D-n check failure\n"); valid_d_access = false; } }); @@ -854,6 +861,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle 0) || CDEShuffleBlockTransferScalarPerVector_NPerBlock == 1)) { + printf("DeviceOp: Vector Access E-n check failure\n"); return false; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 
ef88fb82e69..39863027da9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -352,6 +352,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); const auto N = b1_grid_desc_l0_n_l1.GetLength(I1); + printf("M = %d, L = %d, K = %d, N = %d\n", M, L, K, N); + const auto KPerBlock = K0PerBlock * K1Value; if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) { @@ -730,7 +732,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // dst Rowlane // 0x76543210 0xfedcba98 // src Rowlane - 0x76543210, 0xfedcba98>{tensor_operation::element_wise::PassThrough{}}; + 0x76543210, 0xfedcba98, + false>{tensor_operation::element_wise::PassThrough{}}; // B1 matrix blockwise copy auto b1_blockwise_copy = diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index c5ea67117e9..8fb4ad123b6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -148,14 +148,12 @@ __global__ void const Block2CTileMap block_2_etile_map) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) - //printf("entry kernel launch"); __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - //printf("before compute_ptr_offset call"); const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); const long_index_t b_batch_offset 
= __builtin_amdgcn_readfirstlane( @@ -170,13 +168,9 @@ __global__ void DsPointer p_ds_grid_grp; - //printf("before allocate pointer d"); - static_for<0, NumDTensor, 1>{}( [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); - //printf("before entry"); - GridwiseOp::template Run(p_a_grid + a_batch_offset, p_b_grid + b_batch_offset, p_ds_grid_grp, @@ -469,16 +463,23 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle if(!valid) { + printf("GridwiseOp: D descriptor dimension check failure\n"); return false; } if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) && K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + { + printf("GridwiseOp: ABE descriptor dimension cross check failure\n"); return false; + } if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + { + printf("GridwiseOp: Problemsize descriptor dimension check failure\n"); return false; + } // check gridwise gemm pipeline const auto num_k_loop = K0 / K0PerBlock; @@ -570,7 +571,6 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle const CDEElementwiseOperation& cde_element_op, const Block2CTileMap& block_2_ctile_map) { - //printf("safe entry"); // clang-format off /*******************************************************************************/ // Memory buffer zone. 
@@ -716,7 +716,6 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle c_thread_buf, K0BlockMainLoop); /*******************************************************************************/ - //printf("safe 1"); // write out to C, implement shuffle { constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index de67901c068..dd4d368bd49 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1311,10 +1311,11 @@ template ::type = false> struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow @@ -1389,29 +1390,33 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow constexpr index_t dst_offset = dst_desc.CalculateOffset( dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); - SrcData v; + SrcData v_this_row, v_theother_row; + // int type temp value due to intrinsic requirement + int temp = 0; // apply element-wise operation - element_op_(v, src_buf[Number{}]); + element_op_(v_this_row, src_buf[Number{}]); + + // apply intra-row swizzle permute + if constexpr(IntraRowSwizzlePerm){ + // origin: 0xfedcba98, 0x76543210 + temp = __builtin_amdgcn_permlane16(temp, type_convert(v_this_row), 0xeca86420, 0xfdb97531, 1, 0); + v_this_row = type_convert(temp); + } + + // apply inter-row permute. 
+ temp = __builtin_amdgcn_permlanex16(temp, type_convert(v_this_row), LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); + v_theother_row = type_convert(temp); if(get_thread_local_1d_id() % 32 < 16){ // apply type convert - dst_buf(Number{}) = type_convert(v); + dst_buf(Number{}) = type_convert(v_this_row); + dst_buf(Number{}) = type_convert(v_theother_row); } else{ // apply type convert - dst_buf(Number{}) = type_convert(v); - } - SrcData d = 0; - int temp = 0; - temp = __builtin_amdgcn_permlanex16(temp, type_convert(v), - LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); - d = type_convert(temp); - if(get_thread_local_1d_id() % 32 < 16){ - dst_buf(Number{}) = type_convert(d); - } - else{ - dst_buf(Number{}) = type_convert(d); + dst_buf(Number{}) = type_convert(v_this_row); + dst_buf(Number{}) = type_convert(v_theother_row); } }); }); diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 504bd49c5a9..07f1a6bf3c1 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -972,7 +972,6 @@ inline __host__ __device__ constexpr int type_convert(float x) float fp32; int int32; } u = {x}; - // u.fp32 = x; return u.int32; } @@ -985,7 +984,6 @@ inline __host__ __device__ constexpr float type_convert(int x) int int32; float fp32; } u = {x}; - // u.fp32 = x; return u.fp32; } diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp index a8c7fd03953..29d94b0036c 100644 --- a/library/include/ck/library/utility/host_tensor.hpp +++ b/library/include/ck/library/utility/host_tensor.hpp @@ -396,7 +396,7 @@ struct Tensor } case 6: { auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) { - (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4, i5); + (*this)(i0, i1, i2, i3, i4, i5) = g(i0, i1, i2, i3, i4, i5); }; make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], From 8dbb73b172abf0f8229c348acaf49fec72b9da6c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: 
Thu, 16 Feb 2023 07:53:09 +0000 Subject: [PATCH 037/118] format --- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 98 +++++++++---------- .../CMakeLists.txt | 10 +- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 22 ++--- ...atched_gemm_scale_softmax_gemm_permute.inc | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 34 +++---- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 27 ++--- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 45 +++++---- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 49 +++++----- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 4 +- .../gpu/grid/gridwise_gemm_wmma.hpp | 16 +-- .../threadwise_tensor_slice_transfer.hpp | 37 ++++--- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 2 +- include/ck/utility/amd_wmma.hpp | 7 +- script/clang-format-overwrite.sh | 4 +- 14 files changed, 186 insertions(+), 171 deletions(-) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index 1207339221e..0b0c130874a 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -45,56 +45,55 @@ using CDEElementOp = ck::tensor_operation::element_wise::Add; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; -static constexpr auto ASpec = ck::tensor_operation::device::TensorSpecialization::Default; -static constexpr auto BSpec = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto ASpec = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto BSpec = ck::tensor_operation::device::TensorSpecialization::Default; static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; using DeviceOpInstanceKKNN = - ck::tensor_operation::device::DeviceBatchedContractionMultipleD_Wmma_CShuffle< - NumDimG, - 
NumDimM, - NumDimN, - NumDimK, - ADataType, - BDataType, - DsDataType, - EDataType, - AccDataType, - CShuffleDataType, - AElementOp, - BElementOp, - CDEElementOp, - GemmSpec, - ASpec, - BSpec, - DESpec, - 256, - 128, - 128, - 4, - 8, - 16, - 16, - 4, - 2, - S<4, 64, 1>, - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - true, - S<4, 64, 1>, - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - true, - 1, - 1, - S<1, 32, 1, 8>, - 8>; + ck::tensor_operation::device::DeviceBatchedContractionMultipleD_Wmma_CShuffle, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + 1, + 1, + S<1, 32, 1, 8>, + 8>; using DeviceOpInstance = DeviceOpInstanceKKNN; @@ -327,7 +326,8 @@ int main(int argc, char* argv[]) DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize()); DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize()); - DeviceMem e_device_buf(sizeof(EDataType) * e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * + e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); b_device_buf.ToDevice(b_gs_ns_ks.mData.data()); @@ -379,7 +379,7 @@ int main(int argc, char* argv[]) ck::index_t K = ck::accumulate_n( a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{}); - std::cout<<"GMNK="< a_g_m_k({BatchCount, M, K}); Tensor b0_g_k_n({BatchCount, K, N}); Tensor b1_g_n_o({BatchCount, N, O}); - Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp 
b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 78e648d8e6e..de29d1e18b7 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -33,10 +33,10 @@ template {}; + static constexpr auto wmma_gemm = + WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -149,13 +150,8 @@ struct BlockwiseGemmWMMA const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk3D(); - return make_tuple(Number{}, - blk_idx[I0], - waveId_m, - Number{}, - waveId_n, - blk_idx[I1], - blk_idx[I2]); + return make_tuple( + Number{}, blk_idx[I0], waveId_m, Number{}, waveId_n, blk_idx[I1], blk_idx[I2]); } using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); @@ -169,7 +165,8 @@ struct BlockwiseGemmWMMA static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); - static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && NPerBlock % (NPerWMMA * NRepeat) == 0, + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && + NPerBlock % (NPerWMMA * NRepeat) == 0, "wrong!"); } @@ -180,20 +177,15 @@ struct BlockwiseGemmWMMA constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - // constexpr auto NSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - // constexpr auto MThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + // constexpr auto NSubGroup = + // c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; constexpr auto MThreadPerSubGroup + // = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; + constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; return 
make_naive_tensor_descriptor_packed( // |MRepeat |MWave |MSubGroup |NRepeat |NWave // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, - I1, - I1, - Number{}, - I1, - I1, - NAccVgprs)); + make_tuple(Number{}, I1, I1, Number{}, I1, I1, NAccVgprs)); } // Thread level, register decriptor. Vector-write diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index da71a3b067c..1eff05096e9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -393,10 +393,10 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle } // Gridwise descriptor, mapping to whole given provblem. - using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); - using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); - using DsGridDesc_M_N = remove_cvref_t; - using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); using DsGridDesc_G_M_N = remove_cvref_t; using EGridDesc_G_M_N = decltype(MakeEGridDescriptor_G_M_N({}, {})); @@ -604,10 +604,12 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); b_grid_desc_n_k_ = DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); - - ds_grid_desc_m_n_ = DeviceOp::MakeDsGridDescriptor_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides); - - e_grid_desc_m_n_ = DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + ds_grid_desc_m_n_ = + 
DeviceOp::MakeDsGridDescriptor_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides); + + e_grid_desc_m_n_ = + DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(a_grid_desc_m_k_); b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_grid_desc_n_k_); @@ -619,8 +621,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle ds_grid_desc_m_n_); e_grid_desc_mblock_mperblock_nblock_nperblock = - GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - e_grid_desc_m_n_); + GridwiseOp::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n_); // for sanity check of vector memory access a_mz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM - 1]; @@ -696,9 +697,11 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle { const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0); - const index_t grid_size = arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; - const auto K = arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); auto launch_kernel = [&](auto has_main_k_block_loop) { constexpr bool has_main_loop = has_main_k_block_loop.value; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index d609ba4290d..eac76633d69 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -54,10 +54,10 @@ template {}; static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; - + 
using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< Sequence, Sequence, @@ -261,7 +261,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle K1, // NPerBlock, L0PerBlock, - L1, + L1, MPerWMMA, LPerWMMA, NPerWMMA, @@ -339,10 +339,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle p_c_grid_{p_c_grid}, a_grid_desc_ak0_m_ak1_{ DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b0_grid_desc_bk0_l_bk1_{ - DeviceOp::MakeB0GridDescriptor_BK0_L_BK1(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, - b1_grid_desc_bl0_n_bl1_{ - DeviceOp::MakeB1GridDescriptor_BL0_N_BL1(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + b0_grid_desc_bk0_l_bk1_{DeviceOp::MakeB0GridDescriptor_BK0_L_BK1( + b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc_bl0_n_bl1_{DeviceOp::MakeB1GridDescriptor_BL0_N_BL1( + b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, c_grid_desc_m_n_{ Transform::MakeCGridDescriptor_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, a_grid_desc_g_m_k_{ @@ -408,7 +408,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1_; B1GridDesc_BL0_N_BL1 b1_grid_desc_bl0_n_bl1_; CGridDesc_M_N c_grid_desc_m_n_; - + AGridDesc_G_M_K a_grid_desc_g_m_k_; B0GridDesc_G_L_K b0_grid_desc_g_l_k_; B1GridDesc_G_N_L b1_grid_desc_g_n_l_; @@ -450,9 +450,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - const index_t grid_size = arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; - const auto K = arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); auto launch_kernel = [&](auto has_main_k_block_loop) { const auto 
kernel = kernel_batched_gemm_softmax_gemm_wmma_cshuffle< @@ -552,11 +554,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); - const index_t c_n = arg.c_grid_desc_m_n_.GetLength(I1); - const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); - const index_t b1_n = arg.b1_grid_desc_bl0_n_bl1_.GetLength(I1); + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded + const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); + const index_t c_n = arg.c_grid_desc_m_n_.GetLength(I1); + const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); + const index_t b1_n = arg.b1_grid_desc_bl0_n_bl1_.GetLength(I1); if(!(c_g == arg.batch_count_ && c_m == a_m && c_n == b1_n)) { @@ -592,8 +594,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; const auto b1_stride_lowest = B1BlockTransferSrcVectorDim == 2 ? 
arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; - const auto c_stride_lowest = - arg.c_mz_nz_strides_[1]; + const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || c_stride_lowest == 1)) @@ -610,8 +611,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle return IsSupportedArgument(*dynamic_cast(p_arg)); } - static auto - MakeArgument( + static auto MakeArgument( const ADataType* p_a, const B0DataType* p_b0, const B1DataType* p_b1, @@ -634,7 +634,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B0ElementwiseOperation b0_element_op, AccElementwiseOperation acc_element_op, B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) + CElementwiseOperation c_element_op) { return Argument{p_a, p_b0, @@ -664,8 +664,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } // polymorphic - std::unique_ptr - MakeArgumentPointer( + std::unique_ptr MakeArgumentPointer( const void* p_a, const void* p_b0, const void* p_b1, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 39863027da9..1c6891f1f60 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -135,10 +135,10 @@ template {}), - make_unmerge_transform( - make_tuple(Number{}, I1, I1)), + make_unmerge_transform(make_tuple(Number{}, I1, I1)), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } template - __host__ __device__ static constexpr auto MakeB1BlockDescriptor_L0_N0_N1_N2_L1(const B1BlockDesc_BL0_N_BL1&) + __host__ __device__ static constexpr auto + MakeB1BlockDescriptor_L0_N0_N1_N2_L1(const 
B1BlockDesc_BL0_N_BL1&) { - constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); - constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); + constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); + constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); return transform_tensor_descriptor( B1BlockDesc_BL0_N_BL1{}, @@ -317,17 +317,18 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment - const index_t gemm0_bytes_end = (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + - SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); + const index_t gemm0_bytes_end = + (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + + SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); const index_t gemm1_bytes_end = - (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * - sizeof(FloatB1); - + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * + sizeof(FloatB1); + const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + SharedMemTrait::reduction_space_size_aligned) * sizeof(FloatAcc0); - + const index_t c_block_bytes_end = SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); @@ -360,8 +361,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle return false; } - if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && - N % NPerBlock == 0)) + if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && N % NPerBlock == 0)) { return false; } @@ -432,7 +432,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{}))>; using DefaultBlock2CTileMap = remove_cvref_t; - + struct SharedMemTrait { // LDS allocation for A and B: be careful 
of alignment @@ -453,7 +453,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle b1_block_desc_bl0_n_bl1.GetElementSpaceSize(), max_lds_align); static constexpr auto a_block_space_offset = 0; - static constexpr auto b0_block_space_offset = a_block_space_size_aligned.value; + static constexpr auto b0_block_space_offset = a_block_space_size_aligned.value; static constexpr auto b1_block_space_offset = 0; // LDS allocation for reduction @@ -466,10 +466,13 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); static constexpr auto c_block_space_size = - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize(); + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat + .GetElementSpaceSize(); }; - template + template __device__ static void Run(const FloatA* __restrict__ p_a_grid, const FloatB0* __restrict__ p_b0_grid, const FloatB1* __restrict__ p_b1_grid, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 8fb4ad123b6..127137faf11 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -165,7 +165,7 @@ __global__ void static constexpr index_t NumDTensor = DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); - + DsPointer p_ds_grid_grp; static_for<0, NumDTensor, 1>{}( @@ -530,7 +530,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle template __host__ __device__ static constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N_& ds_grid_desc_m_n) - { + { return generate_tuple( [&](auto i) { return 
MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index f2c6495994b..9ec90494d78 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -141,10 +141,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma GridwiseGemmPipeline_Selector())>; template - __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1(const ABlockDesc_AK0_M_AK1&) + __host__ __device__ static constexpr auto + MakeABlockDescriptor_K0_M0_M1_M2_K1(const ABlockDesc_AK0_M_AK1&) { - constexpr index_t A_K0 = ABlockDesc_AK0_M_AK1{}.GetLength(I0); - constexpr index_t A_K1 = ABlockDesc_AK0_M_AK1{}.GetLength(I2); + constexpr index_t A_K0 = ABlockDesc_AK0_M_AK1{}.GetLength(I0); + constexpr index_t A_K1 = ABlockDesc_AK0_M_AK1{}.GetLength(I2); constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); return transform_tensor_descriptor( @@ -157,11 +158,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } - template - __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + template + __host__ __device__ static constexpr auto + MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) { - constexpr index_t B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); - constexpr index_t B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); + constexpr index_t B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); + constexpr index_t B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); return transform_tensor_descriptor( BBlockDesc_BK0_N_BK1{}, diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 
dd4d368bd49..9b641fc57c3 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1311,11 +1311,11 @@ template ::type = false> struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow @@ -1383,7 +1383,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // copy data from src_buf into dst_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - // idx_md err. as dst access 2 strided elements while src visit 1 per loop + // idx_md err. as dst access 2 strided elements while src visit 1 per loop constexpr index_t src_offset = src_desc.CalculateOffset( src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); @@ -1398,24 +1398,37 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow element_op_(v_this_row, src_buf[Number{}]); // apply intra-row swizzle permute - if constexpr(IntraRowSwizzlePerm){ - // origin: 0xfedcba98, 0x76543210 - temp = __builtin_amdgcn_permlane16(temp, type_convert(v_this_row), 0xeca86420, 0xfdb97531, 1, 0); + if constexpr(IntraRowSwizzlePerm) + { + // origin: + // 0xfedcba98, + // 0x76543210 + temp = __builtin_amdgcn_permlane16( + temp, type_convert(v_this_row), 0xeca86420, 0xfdb97531, 1, 0); v_this_row = type_convert(temp); } // apply inter-row permute. 
- temp = __builtin_amdgcn_permlanex16(temp, type_convert(v_this_row), LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); + temp = __builtin_amdgcn_permlanex16(temp, + type_convert(v_this_row), + LowEightRowlaneIdx, + HighEightRowLaneIdx, + 1, + 0); v_theother_row = type_convert(temp); - if(get_thread_local_1d_id() % 32 < 16){ + if(get_thread_local_1d_id() % 32 < 16) + { // apply type convert dst_buf(Number{}) = type_convert(v_this_row); - dst_buf(Number{}) = type_convert(v_theother_row); + dst_buf(Number{}) = + type_convert(v_theother_row); } - else{ + else + { // apply type convert - dst_buf(Number{}) = type_convert(v_this_row); + dst_buf(Number{}) = + type_convert(v_this_row); dst_buf(Number{}) = type_convert(v_theother_row); } }); diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 80c72f5d2e6..f934073ddaa 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -444,7 +444,7 @@ struct WmmaGemm make_pass_through_transform(MWave), make_pass_through_transform(Number{}), make_pass_through_transform(NBlockxRepeat), - make_pass_through_transform(NWave), + make_pass_through_transform(NWave), make_unmerge_transform(make_tuple(Number{}, Number{}))), make_tuple(Sequence<0>{}, diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index c059c1ffec4..7b0064c0b83 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -24,10 +24,9 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> // * Inline assembly need to elimate the duplicated data load, compiler won't help you // delete them. 
// amd_assembly_wmma_f32_16x16x16_f16_w32( - // reg_a, reg_b, reg_c.template AsType()(Number<0>{})); - reg_c.template AsType()(Number<0>{}) = - __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( reg_a, reg_b, reg_c.template - AsType()[Number<0>{}]); + // reg_a, reg_b, reg_c.template AsType()(Number<0>{})); + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); } }; diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index f9d11fcd8cb..3a09d6038a4 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,2 @@ -#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From f45099cdf8580ae62450f5fc21dc3dcdb3c53616 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 16 Feb 2023 08:19:46 +0000 Subject: [PATCH 038/118] cancel unnecessary change --- .../grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp | 2 -- script/clang-format-overwrite.sh | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 1c6891f1f60..ebfa0765a0a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -353,8 +353,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); const auto N = b1_grid_desc_l0_n_l1.GetLength(I1); - printf("M = %d, L = %d, K = %d, N = %d\n", M, L, K, N); - const auto KPerBlock = K0PerBlock * K1Value; if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) { diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index 3a09d6038a4..f9d11fcd8cb 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,2 @@ -find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From 9620dbc9ac96c790fc49222f4fbee9d59de884f5 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 16 Feb 2023 08:28:15 +0000 Subject: [PATCH 039/118] cancel unnecessary change --- .../ck/library/utility/host_tensor_generator.hpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/library/include/ck/library/utility/host_tensor_generator.hpp index 9421ebed268..4259862e65e 100644 --- a/library/include/ck/library/utility/host_tensor_generator.hpp +++ b/library/include/ck/library/utility/host_tensor_generator.hpp @@ -55,18 +55,6 @@ struct GeneratorTensor_1 } }; -template -struct GeneratorTensor_dec1 -{ - T value = 0.1; - - template - T operator()(Is...) 
- { - return value; - } -}; - template struct GeneratorTensor_2 { From c749c262fd03106d486fb197422814daeca44760 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 16 Feb 2023 08:35:37 +0000 Subject: [PATCH 040/118] cancel unnecessary change --- include/ck/host_utility/kernel_launch.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index 19b38a068ee..24f2121674c 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -30,8 +30,6 @@ float launch_and_time_kernel(const StreamConfig& stream_config, block_dim.y, block_dim.z); - const int nrepeat = 100; - printf("Warm up 1 time\n"); #endif // warm up From c811a0e99aebc19fcd0011cde834815286e5a264 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 16 Feb 2023 12:13:26 +0000 Subject: [PATCH 041/118] temp save, add asm backend flag to amd_wmma --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 4 ++-- .../gpu/block/blockwise_gemm_wmma.hpp | 6 ++++-- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 4 +++- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 11 +++++----- include/ck/utility/amd_wmma.hpp | 20 ++++++++++--------- 5 files changed, 26 insertions(+), 19 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 8161b1088ad..d59d1bc7025 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -53,13 +53,13 @@ using DeviceConvFwdInstance = GemmSpec, // GemmSpecialization 256, // BlockSize 128, // MPerBlock - 128, // NPerBlock + 256, // NPerBlock 4, // K0PerBlock 8, // K1 16, // MPerWMMA 16, // NPerWMMA 4, // MRepeat - 2, // NRepeat + 4, // NRepeat S<4, 64, 1>, // 
ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index de29d1e18b7..5668e7e0b17 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -375,7 +375,9 @@ template + index_t KPack, + bool TransposeC = false, + bool AssemblyBackend = true> /* A: K0PerBlock x MPerBlock x K1 * B: K0PerBlock x NPerBlock x K1 * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs @@ -406,7 +408,7 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); static constexpr auto wmma_gemm = - WmmaGemm{}; + WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 127137faf11..da2a5d36f32 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -683,7 +683,9 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle NPerWmma, MRepeat, NRepeat, - KPack>{}; + KPack, + false, + true>{}; // Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index f934073ddaa..2a2cb6f05ed 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -103,12 +103,12 @@ struct wmma_type + template 
__device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const { if constexpr(wave_size == 32) { - intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); + intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); } else if constexpr(wave_size == 64) { @@ -358,7 +358,8 @@ template + bool TransposeC = false, + bool AssemblyBackend = false> struct WmmaGemm { static constexpr auto I0 = Number<0>{}; @@ -491,11 +492,11 @@ struct WmmaGemm "(int8, int32) or (int4, int32)!"); if constexpr(!TransposeC) { - wmma_instr.template run(p_a_wave, p_b_wave, p_c_thread); + wmma_instr.template run(p_a_wave, p_b_wave, p_c_thread); } else { - wmma_instr.template run(p_b_wave, p_a_wave, p_c_thread); + wmma_instr.template run(p_b_wave, p_a_wave, p_c_thread); } } diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index 7b0064c0b83..6178e5d19d5 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -12,21 +12,23 @@ namespace ck { /********************************WAVE32 MODE***********************************************/ // src: fp16, dst: fp32 -template +template struct intrin_wmma_f32_16x16x16_f16_w32; -template <> -struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> +template +struct intrin_wmma_f32_16x16x16_f16_w32<16, 16, AssemblyBackend> { template __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) { - // * Inline assembly need to elimate the duplicated data load, compiler won't help you - // delete them. 
- // amd_assembly_wmma_f32_16x16x16_f16_w32( - // reg_a, reg_b, reg_c.template AsType()(Number<0>{})); - reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + if constexpr(AssemblyBackend){ + amd_assembly_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()(Number<0>{})); + } + else{ + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } } }; From d4adc71aa80b23c6a0511f85dd327f08ade5f0b7 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 24 Feb 2023 03:29:54 +0000 Subject: [PATCH 042/118] Mat-A LDS Bypass sanity pass --- example/01_gemm/gemm_wmma_fp16.cpp | 46 +- example/01_gemm/run_gemm_example.inc | 12 + include/ck/host_utility/kernel_launch.hpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 199 +++++-- .../gpu/device/impl/device_gemm_wmma.hpp | 192 +++--- .../grid/gridwise_gemm_pipeline_selector.hpp | 4 +- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 127 +++- .../gpu/grid/gridwise_gemm_wmma.hpp | 556 +++++++++++++----- .../threadwise_tensor_slice_transfer.hpp | 39 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 15 +- include/ck/utility/amd_wmma.hpp | 13 +- include/ck/utility/data_type.hpp | 24 + 12 files changed, 887 insertions(+), 342 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 48bcca257a3..797cff5346d 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -19,15 +19,49 @@ using AElementOp = PassThrough; using BElementOp = PassThrough; using CElementOp = PassThrough; -static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding; // clang-format off using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle -// ######| ALayout| 
BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer|MRepeat|NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| -// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN|MWmmaPerWave|NWmmaPerWave| _MBlock_MWaveMPerWmma| ScalarPerVector| -// ######| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| -// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 256, 8, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, 1>; + < ALayout, + BLayout, + CLayout, + ADataType, + BDataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CElementOp, + GemmDefault, + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 8, // K1 + 16, // MPerWmma + 16, // NPerWmma + 1, // M Repeat + 8, // N-Repeat + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + 1, // C shuffle (M Repeat) Per store + 4, // C shuffle 
(N Repeat) Per store + S<1, 64, 1, 4>, + 8>; // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host:: diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 4e2cedb52ad..e9b6e9830ce 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -35,6 +35,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); break; + case 2: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + break; + case 3: + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + case 4: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); + break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index 24f2121674c..f5d534c75a3 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -35,7 +35,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, // warm up kernel<<>>(args...); - const int nrepeat = 10; + const int nrepeat = 100; #if DEBUG_LOG printf("Start running %d times...\n", nrepeat); #endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 5668e7e0b17..fe448e5bcef 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -62,12 +62,33 @@ struct BlockwiseGemmWMMA static constexpr index_t A_K1 = 
ABlockDesc{}.GetLength(I4); static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I4); + static constexpr auto A_temp0 = Number{}; + static constexpr auto A_temp1 = Number{}; + static constexpr auto A_temp2 = Number{}; + static constexpr auto A_temp3 = Number{}; + static constexpr auto A_temp4 = Number{}; + + // FIX it, workaround + using ABlockDesc_temp = decltype( + make_naive_tensor_descriptor(make_tuple(A_temp0, A_temp1, A_temp2, A_temp3, A_temp4), + make_tuple(A_temp1* A_temp2* A_temp3* A_temp4, + A_temp2* A_temp3* A_temp4, + A_temp3* A_temp4, + A_temp4, + I1))); static constexpr auto wmma_gemm = WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); + static constexpr bool AEnableLds = NWaves == 1 ? false : true; + static constexpr bool BEnableLds = MWaves == 1 ? false : true; + + // Read from Lds, duplicate Twice, Read from VGPR, no duplication. + static constexpr index_t A_Data_Duplicated_Rate = AEnableLds ? 2 : 1; + static constexpr index_t B_Data_Duplicated_Rate = BEnableLds ? 2 : 1; + StaticBufferTupleOfVector @@ -269,7 +302,7 @@ struct BlockwiseGemmWMMA // Describe how data allocated in thread copy src buffer // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma - static constexpr ABlockDesc a_block_desc_k0_m0_m1_m2_k1; + static constexpr ABlockDesc_temp a_block_desc_k0_m0_m1_m2_k1; static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1; template @@ -285,21 +318,28 @@ struct BlockwiseGemmWMMA static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
static_for<0, MRepeat, 1>{}([&](auto m0) { // read A - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0), - a_thread_buf); + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple( + Number{}, m0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0), + a_thread_buf); static_for<0, NRepeat, 1>{}([&](auto n0) { // read B - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0), - b_thread_buf); + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, + n0, + I0, + I0, + I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0), + b_thread_buf); vector_type a_thread_vec; vector_type b_thread_vec; @@ -324,6 +364,7 @@ struct BlockwiseGemmWMMA c_thread_buf.GetVectorTypeReference(Number{})); }); }); + }); } @@ -340,28 +381,78 @@ struct BlockwiseGemmWMMA static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - A_K1, - A_K1>; + template + struct AThreadCopySelector; - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - B_K1, - B_K1>; - - AThreadCopy a_thread_copy_; - BThreadCopy b_thread_copy_; + template <> + struct AThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4>, + 4, + A_K1, + A_K1>; + }; + + template <> + struct AThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< + FloatA, + FloatA, + decltype(a_block_desc_k0_m0_m1_m2_k1), + decltype(a_thread_desc_), + tensor_operation::element_wise::PassThrough, + Sequence<1, 1, 1, 1, A_K1>, + Sequence<0, 1, 2, 3, 4>, + 4, + A_K1, + 0x76543210, + 0xfedcba98, + true>; + }; + 
+ template + struct BThreadCopySelector; + + template <> + struct BThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4>, + 4, + B_K1, + B_K1>; + }; + + template <> + struct BThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< + FloatB, + FloatB, + decltype(b_block_desc_k0_n0_n1_n2_k1), + decltype(b_thread_desc_), + tensor_operation::element_wise::PassThrough, + Sequence<1, 1, 1, 1, B_K1>, + Sequence<0, 1, 2, 3, 4>, + 4, + B_K1, + 0x76543210, + 0xfedcba98, + false>; + }; + + typename AThreadCopySelector::type a_thread_copy_; + typename BThreadCopySelector::type b_thread_copy_; }; // block wise level pipe designed for inline asm @@ -376,7 +467,7 @@ template /* A: K0PerBlock x MPerBlock x K1 * B: K0PerBlock x NPerBlock x K1 @@ -407,8 +498,14 @@ struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); - static constexpr auto wmma_gemm = - WmmaGemm{}; + static constexpr auto wmma_gemm = WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index dbcceac68f2..d0211fe5a08 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -15,6 +15,7 @@ #include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" namespace ck { namespace tensor_operation { @@ -35,10 +36,10 @@ template {}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; + static constexpr 
auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; - static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) - { - assert(K % K1 == 0); + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; - const index_t K0 = K / K1; + static constexpr auto AEnableLds = NWaves == 1 ? false : true; + static constexpr auto BEnableLds = MWaves == 1 ? false : true; + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + // Describe how data read from Global memory + static auto MakeAGridDescriptor(index_t MRaw, index_t KRaw, index_t StrideA) + { const auto a_grid_desc_m_k = [&]() { if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + const auto a_grid_desc_mraw_kraw = + make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), make_tuple(StrideA, I1)); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); } #ifdef ENABLE_COLMAJOR else if constexpr(is_same::value) @@ -97,104 +110,88 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{}, Sequence<0>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } else { + constexpr auto A_KRow = WmmaK / K1; + const auto A_KWmma = K / WmmaK; + + const auto M0 = M / MPerBlock; + return transform_tensor_descriptor( a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_pass_through_transform(M)), + make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(M0 * MRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); } } - static auto MakeBGridDescriptor_K0_N_K1(index_t K, 
index_t N, index_t StrideB) + static auto MakeBGridDescriptor_K0_N_K1(index_t KRaw, index_t NRaw, index_t StrideB) { - assert(K % K1 == 0); - - const index_t K0 = K / K1; - - const auto b_grid_desc_k_n = [&]() { + const auto b_grid_desc_nraw_kraw = [&]() { if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); } else if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); } }(); - if constexpr(GemmSpec == GemmSpecialization::MNPadding) - { - const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); - return transform_tensor_descriptor( - b_grid_desc_k_n, - make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_right_pad_transform(N, PadN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - else - { - return transform_tensor_descriptor( - b_grid_desc_k_n, - make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + assert(K % K1 == 0); + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } - static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) { - const auto c_grid_desc_m_n = 
[&]() { + const auto c_grid_desc_mraw_nraw = [&]() { if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); } else if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); } }(); - if constexpr(GemmSpec == GemmSpecialization::MNPadding) - { - const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; - const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; - - return transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } - else - { - - return transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - } + return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw); } // Gridwise descriptor, mapping to whole given provblem. 
- using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using AGridDesc = decltype(MakeAGridDescriptor(1, 1, 1)); using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); @@ -207,7 +204,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, + remove_reference_t, remove_reference_t, remove_reference_t< typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, @@ -378,7 +384,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, + remove_reference_t, remove_reference_t, remove_reference_t< typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, @@ -411,7 +417,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm || is_same_v)) { + printf("DeviceOp err: AccDataType"); return false; } } else { + printf("DeviceOp err: Arch"); return false; } - return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + return GridwiseGemm::CheckValidity(arg.a_grid_desc_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_, arg.block_2_ctile_map_); @@ -547,10 +555,10 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index 98331d85449..a9bea5886fd 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -15,6 +15,8 @@ enum struct PipelineVersion }; template constexpr auto GridwiseGemmPipeline_Selector() @@ -23,7 +25,7 @@ constexpr auto GridwiseGemmPipeline_Selector() { if constexpr(LoopSched == LoopScheduler::Default) { - return GridwiseGemmPipeline_v1{}; + return GridwiseGemmPipeline_v1{}; } else if constexpr(LoopSched == LoopScheduler::Interwave) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index e9097552c2b..46e0493e503 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -8,12 +8,12 @@ namespace ck { -template +template struct GridwiseGemmPipeline_v1; // 1-stage prefetch template <> -struct GridwiseGemmPipeline_v1<1> +struct GridwiseGemmPipeline_v1<1, true, true> { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -107,7 +107,7 @@ struct GridwiseGemmPipeline_v1<1> // 2-stage prefetch template <> -struct GridwiseGemmPipeline_v1<2> +struct GridwiseGemmPipeline_v1<2, true, true> { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -253,6 +253,123 @@ struct GridwiseGemmPipeline_v1<2> } }; +template <> +struct GridwiseGemmPipeline_v1<1, false, true> +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { +#if 0 + constexpr auto a_block_origin_idx = generate_sequence_v2( + []() constexpr { + return Number<0>{}; + }, + Number{}); +#endif + constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + auto a_block_buf_switch = 
a_block_buf; + + // preload data into LDS + a_blockwise_copy.Run( + a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.Run( + a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf_switch); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + a_block_buf = a_block_buf_switch; + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + } + } +}; + +// placeholder +template <> +struct GridwiseGemmPipeline_v1<1, true, false> +{ +}; + +template <> +struct GridwiseGemmPipeline_v1<1, false, false> +{ +}; + template struct GridwiseGemmPipelineInterwave_v1; @@ -348,7 +465,7 @@ struct GridwiseGemmPipelineInterwave_v1<1> // Note: 2 stage prefetch not optimized for inter-wave loop scheduler template <> -struct GridwiseGemmPipelineInterwave_v1<2> : public GridwiseGemmPipeline_v1<2> +struct GridwiseGemmPipelineInterwave_v1<2> : public GridwiseGemmPipeline_v1<2, true, true> { }; @@ -358,7 +475,7 @@ constexpr auto GridwiseGemmPipeline_v1_Selector() { if constexpr(LoopSched == LoopScheduler::Default) { - return GridwiseGemmPipeline_v1{}; + return GridwiseGemmPipeline_v1{}; } else if constexpr(LoopSched == LoopScheduler::Interwave) { diff 
--git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 9ec90494d78..a652ce8bcee 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -21,7 +21,7 @@ template (p_a_grid, p_b_grid, p_c_grid, p_shared, - a_grid_desc_k0_m_k1, + a_grid_desc, b_grid_desc_k0_n_k1, c_grid_desc_mblock_mperblock_nblock_nperblock, a_element_op, @@ -67,7 +63,7 @@ __global__ void ignore = p_a_grid; ignore = p_b_grid; ignore = p_c_grid; - ignore = a_grid_desc_k0_m_k1; + ignore = a_grid_desc; ignore = b_grid_desc_k0_n_k1; ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; ignore = a_element_op; @@ -84,7 +80,7 @@ template {}; static constexpr auto I7 = Number<7>{}; - // K1 should be Number<...> + static constexpr auto B_K0 = BGridDesc_K0_N_K1{}.GetLength(I0); + static constexpr auto B_K1 = BGridDesc_K0_N_K1{}.GetLength(I2); + // FIX ME: To be deprecated static constexpr auto K1 = Number{}; + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + using ThisThreadBlock = ThisThreadBlock; - using GridwiseGemmPipe = remove_cvref_t())>; + using GridwiseGemmPipe = + remove_cvref_t())>; - template - __host__ __device__ static constexpr auto - MakeABlockDescriptor_K0_M0_M1_M2_K1(const ABlockDesc_AK0_M_AK1&) + // Describe how data store to (LDS/VGPR) buffer from Global memory + __host__ __device__ static constexpr auto MakeABlockDescriptor() { - constexpr index_t A_K0 = ABlockDesc_AK0_M_AK1{}.GetLength(I0); - constexpr index_t A_K1 = ABlockDesc_AK0_M_AK1{}.GetLength(I2); - constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + constexpr auto a_block_desc = [&]() { + if constexpr(AEnableLds) + { + // K0->M->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + constexpr auto max_lds_align = K1; - return 
transform_tensor_descriptor( - ABlockDesc_AK0_M_AK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, I1, K1), + make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + } + }(); + + return a_block_desc; } - template - __host__ __device__ static constexpr auto - MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() { - constexpr index_t B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); - constexpr index_t B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); - constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); - return transform_tensor_descriptor( - BBlockDesc_BK0_N_BK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + constexpr auto a_block_copy_step = [&]() { + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = KPerBlock / K1; + + return make_multi_index(K0PerBlock, 0, 0); + } + else + { + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + } + }(); + + return a_block_copy_step; } - 
__host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + __host__ __device__ static constexpr auto MakeBBlockSliceCopyStep() { - constexpr auto max_lds_align = K1; + constexpr auto b_block_copy_step = [&]() { + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock / K1; - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { - if constexpr(ABlockLdsExtraM) + return make_multi_index(K0PerBlock, 0, 0); + } + else { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + } + }(); + + return b_block_copy_step; + } + + // Describe how data read from (LDS/VGPR) buffer + template + __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&) + { + + constexpr auto a_wave_desc = [&]() { + if constexpr(AEnableLds) + { + // AK0_M_AK1 -> AK0_MRepeat_Mwaves_MPerWmma_AK1 + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } else { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); + // KWmma_MRepeat_MWave_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + make_pass_through_transform(Number{}), + 
make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0, 3>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); } }(); - return a_block_desc_k0perblock_mperblock_k1; + return a_wave_desc; + } + + template + __host__ __device__ static constexpr auto + MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + { + return transform_tensor_descriptor( + BBlockDesc_BK0_N_BK1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() { constexpr auto max_lds_align = K1; - + constexpr auto K0PerBlock = KPerBlock / K1; // B matrix in LDS memory, dst of blockwise copy constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { if constexpr(BBlockLdsExtraN) @@ -223,44 +309,20 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // *Caution Here repeat is shuffle repeat GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() { - constexpr index_t MWave = MPerBlock / (MRepeat * MPerWmma); - constexpr index_t NWave = NPerBlock / (NRepeat * NPerWmma); - constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = make_naive_tensor_descriptor_packed( make_tuple(I1, - Number{}, + Number{}, I1, - Number{})); + Number{})); return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; } - __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() - { - // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_desc_k0perblock_mperblock_k1 = - GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); 
- - constexpr auto b_block_desc_k0perblock_nperblock_k1 = - GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - - constexpr auto max_lds_align = K1; - - constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); - - constexpr auto b_block_space_size_aligned = math::integer_least_multiple( - b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); - - return (a_block_space_size_aligned * sizeof(FloatA) + - b_block_space_size_aligned * sizeof(FloatB)); - } - // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + CheckValidity(const AGridDesc& a_grid_desc, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, const CGridDesc_M_N& c_grid_desc_m_n, const Block2CTileMap& block_2_ctile_map) @@ -272,23 +334,68 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma (NPerBlock % (NRepeat * NPerWmma)) == 0, "Invalid tuning param!"); - const auto M = a_grid_desc_k0_m_k1.GetLength(I1); - const auto N = b_grid_desc_k0_n_k1.GetLength(I1); - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + const auto GetAProblemsizeMK = [&]() { + if constexpr(AEnableLds) + { + return make_tuple(a_grid_desc.GetLength(I1), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * + a_grid_desc.GetLength(I4), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I5)); + } + }; + + const auto GetBProblemsizeNK = [&]() { + if constexpr(BEnableLds) + { + return make_tuple(b_grid_desc_k0_n_k1.GetLength(I1), + b_grid_desc_k0_n_k1.GetLength(I0) * + b_grid_desc_k0_n_k1.GetLength(I2)); + } + else + { + return make_tuple( + b_grid_desc_k0_n_k1.GetLength(I1) * b_grid_desc_k0_n_k1.GetLength(I2) * + b_grid_desc_k0_n_k1.GetLength(I4), + b_grid_desc_k0_n_k1.GetLength(I0) 
* b_grid_desc_k0_n_k1.GetLength(I3) * + b_grid_desc_k0_n_k1.GetLength(I5)); + } + }; + + const auto M = GetAProblemsizeMK()[I0]; + const auto N = GetBProblemsizeNK()[I0]; + const auto K = GetAProblemsizeMK()[I1]; if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && - K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && - K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + K == GetBProblemsizeNK()[I1])) + { + printf("A: MxK = %d x %d, B: NxK = %d x %d, C: MxN = %d x %d\n", + GetAProblemsizeMK()[I0], + GetAProblemsizeMK()[I1], + GetBProblemsizeNK()[I0], + GetBProblemsizeNK()[I1], + c_grid_desc_m_n.GetLength(I0), + c_grid_desc_m_n.GetLength(I1)); + printf("GridwiseOp err: ProblemSize check"); return false; + } - if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + { + printf("GridwiseOp err: ProblemSize division"); return false; + } // check gridwise gemm pipeline - const auto num_k_loop = K0 / K0PerBlock; + const auto num_k_loop = K / KPerBlock; if(!GridwiseGemmPipe::IsSupported(num_k_loop)) { + printf("GridwiseOp err: Pipeline not support this k_loop"); return false; } @@ -303,7 +410,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / (K0PerBlock * K1); + const index_t num_loop = K / KPerBlock; return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -340,12 +447,45 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma using DefaultBlock2CTileMap = remove_cvref_t; + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + + static constexpr auto max_lds_align = K1; + + static constexpr auto a_block_space_size_aligned = + AEnableLds ? 
math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(), + max_lds_align) * + sizeof(FloatA) + : 0; + static constexpr auto b_block_space_size_aligned = + BEnableLds ? math::integer_least_multiple( + GetBBlockDescriptor_K0PerBlock_NPerBlock_K1().GetElementSpaceSize(), + max_lds_align) * + sizeof(FloatB) + : 0; + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b_block_space_offset = a_block_space_size_aligned; + + // LDS allocation for C shuffle in LDS + static constexpr auto c_shuffle_block_space_size = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + .GetElementSpaceSize() * + sizeof(FloatCShuffle); + + static constexpr auto c_shuffle_block_space_offset = 0; + + static constexpr auto lds_size = math::max( + c_shuffle_block_space_size, (a_block_space_size_aligned + b_block_space_size_aligned)); + }; + template __device__ static void Run(const FloatA* __restrict__ p_a_grid, const FloatB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, void* __restrict__ p_shared, - const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const AGridDesc& a_grid_desc, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& c_grid_desc_mblock_mperblock_nblock_nperblock, @@ -358,7 +498,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /*******************************************************************************/ // Memory buffer zone. 
const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + p_a_grid, a_grid_desc.GetElementSpaceSize()); const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( @@ -378,14 +518,32 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); /*******************************************************************************/ -// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - constexpr auto max_lds_align = K1; - constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - // A matrix blockwise copy - auto a_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, +// BlockLevel, A/B Matrix ThreadMapping in WMMA Source buffer, As Destinaion of BlockWise_Copy + const auto K = [&](){ + if constexpr(AEnableLds){ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); + } + else{ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); + } + }(); + + // printf("---------------K = %d\n", K); + + constexpr auto a_block_desc = MakeABlockDescriptor(); + constexpr auto b_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + auto a_block_trait = [&](){ + // A matrix blockwise copy + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), + a_block_desc.GetElementSpaceSize()); + + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, /* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, @@ -406,46 +564,121 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /* index_t DstScalarStrideInVector, 
*/ 1, /* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, /* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( - a_grid_desc_k0_m_k1, + a_grid_desc, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, - a_block_desc_k0perblock_mperblock_k1, + a_block_desc, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); - // B matrix blockwise copy - auto b_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - FloatB, - FloatB, - decltype(b_grid_desc_k0_n_k1), - decltype(b_block_desc_k0perblock_nperblock_k1), - BBlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>( - b_grid_desc_k0_n_k1, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_element_op, - b_block_desc_k0perblock_nperblock_k1, - make_multi_index(0, 0, 0), - ck::tensor_operation::element_wise::PassThrough{}); + return make_tuple(a_block_buf, a_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + auto a_block_buf = make_static_buffer( + a_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto a_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + ABlockTransferSrcScalarPerVector, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_grid_desc, + make_multi_index(0, + m_block_data_idx_on_grid/(MWaves * MPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(a_block_buf, a_blockwise_copy); + } + }; + + auto 
b_block_trait = [&](){ + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a_block_space_size_aligned, + b_block_desc.GetElementSpaceSize()); + + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatB, + FloatB, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + return make_tuple(b_block_buf, b_blockwise_copy); + } + else + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto b_block_buf = make_static_buffer( + b_block_desc.GetElementSpaceSize()); + auto b_blockwise_copy = + ThreadwiseTensorSliceTransfer_v4{}, + Number{}, + Number{}>, + Sequence<0, 1, 2>, + 2, + BBlockTransferSrcScalarPerVector, + 1>( + make_multi_index(0, get_thread_local_1d_id()/32 * 16 + get_thread_local_1d_id() % 16, 0)); + + return make_tuple(b_block_buf, b_blockwise_copy); + } + }; + + auto a_block_buf = a_block_trait()[I0]; + auto a_blockwise_copy = a_block_trait()[I1]; + auto b_block_buf = b_block_trait()[I0]; + auto b_blockwise_copy = b_block_trait()[I1]; /*******************************************************************************/ // GEMM - constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = @@ -453,11 +686,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma FloatA, FloatB, FloatAcc, - decltype(MakeABlockDescriptor_K0_M0_M1_M2_K1(a_block_desc_k0perblock_mperblock_k1)), - 
decltype(MakeBBlockDescriptor_K0_N0_N1_N2_K1(b_block_desc_k0perblock_nperblock_k1)), + decltype(MakeAWaveDescriptor(a_block_desc)), + decltype(MakeBBlockDescriptor_K0_N0_N1_N2_K1(b_block_desc)), MPerBlock, NPerBlock, - K0PerBlock * K1, + KPerBlock, MPerWmma, NPerWmma, MRepeat, @@ -467,26 +700,22 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); -/*******************************************************************************/ - constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); - // LDS allocation for A and B: be careful of alignment - auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); - +/*******************************************************************************/ // Shift Per SUB_K - constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep(); + // printf("a_block_slice_copy_step FirstKdim = %d\n", a_block_slice_copy_step[I0]); + constexpr auto b_block_slice_copy_step = MakeBBlockSliceCopyStep(); // gridwise GEMM pipeline - const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); - GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, - a_block_desc_k0perblock_mperblock_k1, + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); + GridwiseGemmPipe::template Run(a_grid_desc, + a_block_desc, a_blockwise_copy, a_grid_buf, a_block_buf, a_block_slice_copy_step, b_grid_desc_k0_n_k1, - b_block_desc_k0perblock_nperblock_k1, + b_block_desc, b_blockwise_copy, 
b_grid_buf, b_block_buf, @@ -497,6 +726,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /*******************************************************************************/ // write out to C, implement shuffle { +#if 0 + static_for<0, c_thread_buf.Size(), 1>{}([&](auto i) { + printf("tid: %03d, c_thread_buf[%02d] val: %08x\n", get_thread_local_1d_id(), i.value, + *(reinterpret_cast(&(c_thread_buf[i])))); + // c_thread_buf(i) = 32; + }); +#endif constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); @@ -515,8 +751,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); auto c_shuffle_block_buf = make_dynamic_buffer( - static_cast(p_shared), - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); + static_cast(p_shared), SharedMemTrait::c_shuffle_block_space_size); constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, @@ -666,6 +901,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma if constexpr(access_id < num_access - 1) { constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + // move on C c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 9b641fc57c3..c7a950d7400 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1324,15 +1324,14 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow using Index = 
MultiIndex; - __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow( - const ElementwiseOperation& element_op) - : element_op_{element_op} + __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow(const Index& src_idx) { static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), "wrong! Desc need to known at compile-time"); static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, "wrong! Not divisible"); + ignore = src_idx; } template {}([&](auto i) { - // idx_md err. as dst access 2 strided elements while src visit 1 per loop + // src_desc error, non constexpr? constexpr index_t src_offset = src_desc.CalculateOffset( src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); @@ -1396,16 +1395,22 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // apply element-wise operation element_op_(v_this_row, src_buf[Number{}]); - - // apply intra-row swizzle permute + // if (get_thread_local_1d_id() < 16) + // printf("tid: %03d, RawData: %04x\n", get_thread_local_1d_id(), + // *(reinterpret_cast(&v_this_row)) ); apply intra-row swizzle permute if constexpr(IntraRowSwizzlePerm) { - // origin: - // 0xfedcba98, - // 0x76543210 - temp = __builtin_amdgcn_permlane16( - temp, type_convert(v_this_row), 0xeca86420, 0xfdb97531, 1, 0); - v_this_row = type_convert(temp); + temp = __builtin_amdgcn_permlane16( // 0x76543210, 0xfedcba98 + temp, + type_convert(v_this_row), + 0xb3a29180, + 0xf7e6d5c4, + 1, + 0); + v_this_row = type_convert(temp); + // if (get_thread_local_1d_id() < 16) + // printf("tid: %03d, SwiData: %04x\n", get_thread_local_1d_id(), + // *(reinterpret_cast(&v_this_row)) ); } // apply inter-row permute. 
@@ -1415,8 +1420,9 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow HighEightRowLaneIdx, 1, 0); - v_theother_row = type_convert(temp); - + v_theother_row = type_convert(temp); + // printf("tid: %03d, PermData: %04x\n", get_thread_local_1d_id(), + // *(reinterpret_cast(&v_theother_row)) ); if(get_thread_local_1d_id() % 32 < 16) { // apply type convert @@ -1434,8 +1440,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow }); }); } - - ElementwiseOperation element_op_; + ElementwiseOperation element_op_{}; }; } // namespace ck diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 2a2cb6f05ed..83ecc61b39f 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -103,7 +103,12 @@ struct wmma_type + template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const { if constexpr(wave_size == 32) @@ -358,7 +363,7 @@ template struct WmmaGemm { @@ -492,11 +497,13 @@ struct WmmaGemm "(int8, int32) or (int4, int32)!"); if constexpr(!TransposeC) { - wmma_instr.template run(p_a_wave, p_b_wave, p_c_thread); + wmma_instr.template run( + p_a_wave, p_b_wave, p_c_thread); } else { - wmma_instr.template run(p_b_wave, p_a_wave, p_c_thread); + wmma_instr.template run( + p_b_wave, p_a_wave, p_c_thread); } } diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index 6178e5d19d5..98dc81fc032 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -21,13 +21,16 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16, AssemblyBackend> template __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) { - if constexpr(AssemblyBackend){ + if constexpr(AssemblyBackend) + { amd_assembly_wmma_f32_16x16x16_f16_w32( - reg_a, reg_b, reg_c.template AsType()(Number<0>{})); + reg_a, reg_b, reg_c.template AsType()(Number<0>{})); } - else{ - 
reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( - reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + else + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); } } }; diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 07f1a6bf3c1..a3547951bf0 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -988,6 +988,30 @@ inline __host__ __device__ constexpr float type_convert(int x) return u.fp32; } +template <> +inline __host__ __device__ constexpr int type_convert(half_t x) +{ + union + { + half_t fp16; + int int32; + } u = {x}; + + return u.int32; +} + +template <> +inline __host__ __device__ constexpr half_t type_convert(int x) +{ + union + { + int int32; + half_t fp16; + } u = {x}; + + return u.fp16; +} + // convert fp32 to bfp16 template <> inline __host__ __device__ constexpr bhalf_t type_convert(float x) From 6a9d7b64ef4205f72efb797e98db81e75874ce23 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 27 Feb 2023 07:35:49 +0000 Subject: [PATCH 043/118] temp save --- example/01_gemm/gemm_wmma_fp16.cpp | 12 +- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 6 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 60 ++-- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 313 +++++++++++++----- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 2 +- .../gpu/grid/gridwise_gemm_wmma.hpp | 8 +- .../threadwise_tensor_slice_transfer.hpp | 36 +- .../transform_contraction_to_gemm.hpp | 20 ++ 8 files changed, 312 insertions(+), 145 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 797cff5346d..3945a085dca 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -37,13 +37,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle GemmDefault, 256, // BlockSize 128, // MPerBlock 
- 128, // NPerBlock - 64, // KPerBlock + 16, // NPerBlock + 32, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma 1, // M Repeat - 8, // N-Repeat + 1, // N-Repeat S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -51,7 +51,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, 8, true, - S<4, 64, 1>, + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, @@ -59,8 +59,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, true, 1, // C shuffle (M Repeat) Per store - 4, // C shuffle (N Repeat) Per store - S<1, 64, 1, 4>, + 1, // C shuffle (N Repeat) Per store + S<1, 128, 1, 2>, 8>; // clang-format on diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index 14ffb040350..4c7a934a754 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -94,12 +94,14 @@ using DeviceGemmInstance = TensorSpecB1, TensorSpecC, 256, + // Gemm 0 128, // MPerBlock 128, // LPerBlock - 4, // K0PerBlock + 32, // KPerBlock 8, // K1 + // Gemm 1 64, // NPerBlock - 4, // L0PerBlock + 32, // LPerBlock 8, // L1 16, // MPerWMMA 16, // LPerWMMA diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index eac76633d69..113abefcb3c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -53,10 +53,10 @@ template {}; @@ -137,6 +135,15 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto 
I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + + static constexpr auto AEnableLds = LWaves == 1 ? false : true; + // static constexpr auto B0EnableLds = MWaves == 1 ? false : true; + // static constexpr auto B1EnableLds = MWaves == 1 ? false : true; + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< Sequence, Sequence, @@ -146,12 +153,22 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B1Spec, CSpec>; - static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptor(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) { - return Transform::MakeAGridDescriptor_AK0_M_AK1( - Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), - Number{}); + if constexpr(AEnableLds) + { + return Transform::MakeAGridDescriptor_AK0_M_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}); + } + else + { + return Transform::MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AKRow_MPerWmma_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + WmmaK, Number{}, Number{}, Number{}, Number{}) + + } } static auto MakeB0GridDescriptor_BK0_L_BK1(const std::vector& b0_gs_ls_ks_lengths_vec, @@ -170,7 +187,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle Number{}); } - using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1({}, {})); + using AGridDesc = decltype(MakeAGridDescriptor({}, {})); using B0GridDesc_BK0_L_BK1 = decltype(MakeB0GridDescriptor_BK0_L_BK1({}, {})); using B1GridDesc_BL0_N_BL1 = decltype(MakeB1GridDescriptor_BL0_N_BL1({}, {})); 
using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); @@ -250,17 +267,17 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle CElementwiseOperation, InMemoryDataOperationEnum::Set, // InMemory Data Descriptor - AGridDesc_AK0_M_AK1, + AGridDesc, B0GridDesc_BK0_L_BK1, B1GridDesc_BL0_N_BL1, CGridDesc_M_N, // Tiling Family MPerBlock, LPerBlock, - K0PerBlock, // K0 * K1 = Gemm0 GEMM_K Dim - K1, // + KPerBlock, + K1, NPerBlock, - L0PerBlock, + LPerBlock, L1, MPerWMMA, LPerWMMA, @@ -277,6 +294,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, true, + AEnableLds, ABlockLdsAddExtraM, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, @@ -285,6 +303,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B0BlockTransferSrcScalarPerVector, B0BlockTransferDstScalarPerVector_K1, true, + B0EnableLds, B0BlockLdsAddExtraL, B1BlockTransferThreadClusterLengths_L0_N_L1, B1BlockTransferThreadClusterArrangeOrder, @@ -293,6 +312,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B1BlockTransferSrcScalarPerVector, B1BlockTransferDstScalarPerVector_L1, false, + B1EnableLds, B1BlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, @@ -338,7 +358,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle p_b1_grid_{p_b1_grid}, p_c_grid_{p_c_grid}, a_grid_desc_ak0_m_ak1_{ - DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, b0_grid_desc_bk0_l_bk1_{DeviceOp::MakeB0GridDescriptor_BK0_L_BK1( b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, b1_grid_desc_bl0_n_bl1_{DeviceOp::MakeB1GridDescriptor_BL0_N_BL1( @@ -404,7 +424,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle CDataType* p_c_grid_; // Tensor Descriptors - AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + AGridDesc a_grid_desc_ak0_m_ak1_; 
B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1_; B1GridDesc_BL0_N_BL1 b1_grid_desc_bl0_n_bl1_; CGridDesc_M_N c_grid_desc_m_n_; @@ -463,7 +483,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B0DataType, B1DataType, CDataType, - DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::AGridDesc, DeviceOp::B0GridDesc_BK0_L_BK1, DeviceOp::B1GridDesc_BL0_N_BL1, typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, @@ -741,11 +761,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle << BlockSize << ", " << MPerBlock << ", " << LPerBlock << ", " - << K0PerBlock << ", " + << KPerBlock << ", " << K1 << ", " << MPerBlock << ", " << NPerBlock << ", " - << L0PerBlock << ", " + << LPerBlock << ", " << L1 << getGemmSpecializationString(GemmSpec) << ", " << "ASpec" << getTensorSpecializationString(ASpec) << ", " diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index ebfa0765a0a..b6eb7612169 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -134,10 +134,10 @@ template {}; static constexpr auto I7 = Number<7>{}; - static constexpr auto AK0 = Number{}; static constexpr auto AK1 = Number{}; - static constexpr auto BK0 = Number{}; + static constexpr auto BK0 = Number{}; static constexpr auto BK1 = Number{}; + static constexpr auto L0PerBlock = LPerBlock / L1Value; static constexpr auto AL0 = Number{}; static constexpr auto AL1 = Number{}; static constexpr auto BL0 = Number{}; static constexpr auto BL1 = Number{}; + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = remove_cvref_t())>; + 
GridwiseGemmPipeline_Selector())>; - template - __host__ __device__ static constexpr auto - MakeA0BlockDescriptor_K0_M0_M1_M2_K1(const A0BlockDesc_AK0_M_AK1&) + __host__ __device__ static constexpr auto MakeABlockDescriptor() { - constexpr index_t A_K0 = A0BlockDesc_AK0_M_AK1{}.GetLength(I0); - constexpr index_t A_K1 = A0BlockDesc_AK0_M_AK1{}.GetLength(I2); - constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); + constexpr auto a_block_desc = [&]() { + if constexpr(AEnableLds) + { + // K0->M->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / AK1; + constexpr auto max_lds_align = AK1; - return transform_tensor_descriptor( - A0BlockDesc_AK0_M_AK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, AK1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, I1, K1), + make_tuple(Number{} * AK1, AK1, AK1, AK1, AK1, I1)); + } + }(); + + return a_block_desc; + } + + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() + { + constexpr auto a_block_copy_step = [&]() { + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = KPerBlock / AK1; + + return make_multi_index(K0PerBlock, 0, 0); + } + else + { + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + } + }(); + + return a_block_copy_step; + } + + // Describe how data read from 
(LDS/VGPR) buffer + template + __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&) + { + + constexpr auto a_wave_desc = [&]() { + if constexpr(AEnableLds) + { + // AK0_M_AK1 -> AK0_MRepeat_Mwaves_MPerWmma_AK1 + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + else + { + // KWmma_MRepeat_MWave_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + make_pass_through_transform(Number{}), + make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0, 3>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + } + }(); + + return a_wave_desc; } template @@ -273,14 +362,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } - __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() - { - // A matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(AK0, Number{}, AK1), - make_tuple(Number{} * AK1, AK1, I1)); - } - __host__ __device__ static constexpr auto GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1() { // B matrix in LDS memory, dst of blockwise copy @@ -318,19 +399,16 @@ struct 
GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle { // LDS allocation for A and B: be careful of alignment const index_t gemm0_bytes_end = - (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + - SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); + (SharedMemTrait::a_block_space_size_aligned + + SharedMemTrait::b0_block_space_size_aligned); const index_t gemm1_bytes_end = - (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * - sizeof(FloatB1); + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned); - const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + - SharedMemTrait::reduction_space_size_aligned) * - sizeof(FloatAcc0); + const index_t softmax_bytes_end = SharedMemTrait::reduction_space_offset + + SharedMemTrait::reduction_space_size_aligned - const index_t c_block_bytes_end = - SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); + const index_t c_block_bytes_end = SharedMemTrait::c_block_space_size; return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); } @@ -434,38 +512,30 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle struct SharedMemTrait { // LDS allocation for A and B: be careful of alignment - static constexpr auto a_block_desc_ak0_m_ak1 = - GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); - static constexpr auto b0_block_desc_bk0_l_bk1 = - GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); - static constexpr auto b1_block_desc_bl0_n_bl1 = - GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); - static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), BL1); - static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - static constexpr auto b0_block_space_size_aligned = math::integer_least_multiple( - b0_block_desc_bk0_l_bk1.GetElementSpaceSize(), max_lds_align); - static constexpr auto 
b1_block_space_size_aligned = math::integer_least_multiple( - b1_block_desc_bl0_n_bl1.GetElementSpaceSize(), max_lds_align); + static constexpr auto a_block_space_size_aligned = AEnableLds ? math::integer_least_multiple( + MakeABlockDescriptor().GetElementSpaceSize() * sizeof(FloatA), max_lds_align) : 0; + static constexpr auto b0_block_space_size_aligned = B0EnableLds ? math::integer_least_multiple( + GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1().GetElementSpaceSize() * sizeof(FloatB0), max_lds_align) : 0; + static constexpr auto b1_block_space_size_aligned = B1EnableLds ? math::integer_least_multiple( + GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1().GetElementSpaceSize() * sizeof(FloatB1), max_lds_align) : 0; static constexpr auto a_block_space_offset = 0; static constexpr auto b0_block_space_offset = a_block_space_size_aligned.value; static constexpr auto b1_block_space_offset = 0; // LDS allocation for reduction + // Feature to add, IntraThread Reduction static constexpr index_t reduction_space_size_aligned = - math::integer_least_multiple(BlockSize, max_lds_align); + math::integer_least_multiple(BlockSize, max_lds_align) * sizeof(FloatAcc0); static constexpr auto reduction_space_offset = 0; // LDS allocation for C shuffle in LDS - static constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = - GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); static constexpr auto c_block_space_size = - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat - .GetElementSpaceSize(); + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + .GetElementSpaceSize() * sizeof(FloatCShuffle); }; template , -/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, -/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatA, -/* typename DstData, */ FloatA, -/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), 
-/* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), -/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, -/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, -/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, -/* index_t DstVectorDim, */ 2, -/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, -/* index_t DstScalarPerVector, */ ABlockTransferDstScalarPerVector_K1, -/* index_t SrcScalarStrideInVector, */ 1, -/* index_t DstScalarStrideInVector, */ 1, -/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, -/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( + + auto a_block_trait = [&](){ + // A matrix blockwise copy + if constexpr(AEnableLds) + { + constexpr auto AK0PerBlock = KPerBlock/ AK1; + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a_block_space_offset, + SharedMemTrait::a_block_space_size_aligned); + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, +/* typename SrcElementwiseOperation, */ AElementwiseOperation, +/* typename DstElementwiseOperation, */ ck::tensor_operation::element_wise::PassThrough, +/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, +/* typename BlockSliceLengths, */ Sequence, +/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, +/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ FloatA, +/* typename DstData, */ FloatA, +/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), +/* typename DstDesc, */ decltype(a_block_desc), +/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, +/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, +/* index_t DstScalarPerVector, */ 
ABlockTransferDstScalarPerVector_K1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( a_grid_desc_k0_m_k1, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, - a_block_desc_k0perblock_mperblock_k1, + a_block_desc, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); + return make_tuple(a_block_buf, a_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + auto a_block_buf = make_static_buffer( + a_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto a_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + ABlockTransferSrcScalarPerVector, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_grid_desc, + make_multi_index(0, + m_block_data_idx_on_grid/(MWaves * MPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(a_block_buf, a_blockwise_copy); + } + }; + // B matrix blockwise copy auto b0_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1(static_cast(p_shared) + SharedMemTrait::a_block_space_offset, - a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); + auto b0_block_buf = make_dynamic_buffer(static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, b0_block_desc_k0perblock_lperblock_k1.GetElementSpaceSize()); // Shift Per SUB_K - constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep(); constexpr auto b0_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - const auto 
a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); + + const auto a_block_reset_copy_step = [&](){ + if constexpr(AEnableLds){ + return make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); + else{ + return make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0, 0, 0, 0); + } + }(); + const auto b0_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); - const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); /*******************************************************************************/ // softmax /*******************************************************************************/ @@ -734,7 +867,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // 0x76543210 0xfedcba98 // src Rowlane 0x76543210, 0xfedcba98, - false>{tensor_operation::element_wise::PassThrough{}}; + false>{}; // B1 matrix blockwise copy auto b1_blockwise_copy = @@ -815,7 +948,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle } // gemm0 start, A-B swaped GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, - a_block_desc_k0perblock_mperblock_k1, + a_block_desc, a_blockwise_copy, a_grid_buf, a_block_buf, @@ -828,7 +961,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle b0_block_slice_copy_step, blockwise_gemm0, acc0_thread_buf, - K0BlockMainLoop); + KBlockMainLoop); // do MNK padding or upper triangular masking if constexpr(MaskOutUpperTriangle || PadN) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 46e0493e503..c3907266a91 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -343,7 +343,7 @@ struct GridwiseGemmPipeline_v1<1, false, true> b_blockwise_copy.RunWrite(b_block_desc, 
b_block_buf); - a_block_buf = a_block_buf_switch; + // a_block_buf = a_block_buf_switch; ++i; } while(i < (num_loop - 1)); } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index a652ce8bcee..1b99d535ded 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -130,8 +130,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma static constexpr auto I6 = Number<6>{}; static constexpr auto I7 = Number<7>{}; - static constexpr auto B_K0 = BGridDesc_K0_N_K1{}.GetLength(I0); - static constexpr auto B_K1 = BGridDesc_K0_N_K1{}.GetLength(I2); // FIX ME: To be deprecated static constexpr auto K1 = Number{}; @@ -273,6 +271,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) { + constexpr auto B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); + return transform_tensor_descriptor( BBlockDesc_BK0_N_BK1{}, make_tuple(make_pass_through_transform(Number{}), @@ -528,8 +529,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma } }(); - // printf("---------------K = %d\n", K); - constexpr auto a_block_desc = MakeABlockDescriptor(); constexpr auto b_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); @@ -703,7 +702,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /*******************************************************************************/ // Shift Per SUB_K constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep(); - // printf("a_block_slice_copy_step FirstKdim = %d\n", a_block_slice_copy_step[I0]); constexpr auto b_block_slice_copy_step = MakeBBlockSliceCopyStep(); // gridwise GEMM pipeline diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 
c7a950d7400..4e99fb89b6d 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1395,34 +1395,28 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // apply element-wise operation element_op_(v_this_row, src_buf[Number{}]); - // if (get_thread_local_1d_id() < 16) - // printf("tid: %03d, RawData: %04x\n", get_thread_local_1d_id(), - // *(reinterpret_cast(&v_this_row)) ); apply intra-row swizzle permute + if constexpr(IntraRowSwizzlePerm) { - temp = __builtin_amdgcn_permlane16( // 0x76543210, 0xfedcba98 - temp, - type_convert(v_this_row), - 0xb3a29180, - 0xf7e6d5c4, - 1, - 0); + // temp = __builtin_amdgcn_permlane16( + // temp, + // type_convert(v_this_row), + // 0xb3a29180, + // 0xf7e6d5c4, + // 1, + // 0); v_this_row = type_convert(temp); - // if (get_thread_local_1d_id() < 16) - // printf("tid: %03d, SwiData: %04x\n", get_thread_local_1d_id(), - // *(reinterpret_cast(&v_this_row)) ); } // apply inter-row permute. 
- temp = __builtin_amdgcn_permlanex16(temp, - type_convert(v_this_row), - LowEightRowlaneIdx, - HighEightRowLaneIdx, - 1, - 0); + // temp = __builtin_amdgcn_permlanex16(temp, + // type_convert(v_this_row), + // LowEightRowlaneIdx, + // HighEightRowLaneIdx, + // 1, + // 0); v_theother_row = type_convert(temp); - // printf("tid: %03d, PermData: %04x\n", get_thread_local_1d_id(), - // *(reinterpret_cast(&v_theother_row)) ); + if(get_thread_local_1d_id() % 32 < 16) { // apply type convert diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp index 5fc11d9158a..7df174a7ecc 100644 --- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -179,6 +179,26 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AKRow_MPerWmma_AK1( + const AGridDesc_M_K& a_grid_desc_m_k, const Number& WmmaK, const Number& MRepeat, + const Number& MWaves, const Number& MPerWmma, const Number& AK1) + { + const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlcok; + const auto K = a_grid_desc_m_k.GetLength(I1); + const auto AKWmma = K / WmmaK; + constexpr auto AKRow = WmmaK / K1; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AKWmma, Number{}, AK1)), + make_unmerge_transform( + make_tuple(M0 * MRepeat, MWaves, MPerWmma))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } + // // B (alias of B0) // From 84b4ada52a3be86f611b420024815e0846d2c2e3 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 27 Feb 2023 07:50:53 +0000 Subject: [PATCH 044/118] gemm sanity fix --- 
.../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 2 +- .../threadwise_tensor_slice_transfer.hpp | 26 +++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index c3907266a91..46e0493e503 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -343,7 +343,7 @@ struct GridwiseGemmPipeline_v1<1, false, true> b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - // a_block_buf = a_block_buf_switch; + a_block_buf = a_block_buf_switch; ++i; } while(i < (num_loop - 1)); } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 4e99fb89b6d..539d362595f 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1398,23 +1398,23 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow if constexpr(IntraRowSwizzlePerm) { - // temp = __builtin_amdgcn_permlane16( - // temp, - // type_convert(v_this_row), - // 0xb3a29180, - // 0xf7e6d5c4, - // 1, - // 0); + temp = __builtin_amdgcn_permlane16( + temp, + type_convert(v_this_row), + 0xb3a29180, + 0xf7e6d5c4, + 1, + 0); v_this_row = type_convert(temp); } // apply inter-row permute. 
- // temp = __builtin_amdgcn_permlanex16(temp, - // type_convert(v_this_row), - // LowEightRowlaneIdx, - // HighEightRowLaneIdx, - // 1, - // 0); + temp = __builtin_amdgcn_permlanex16(temp, + type_convert(v_this_row), + LowEightRowlaneIdx, + HighEightRowLaneIdx, + 1, + 0); v_theother_row = type_convert(temp); if(get_thread_local_1d_id() % 32 < 16) From 7e003d313cda7431a393efa38b04ec128ac72732 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 28 Feb 2023 09:24:14 +0000 Subject: [PATCH 045/118] Porting new blockwise gemm to flash attention --- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 14 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 12 +- .../gpu/block/blockwise_gemm_wmma.hpp | 6 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 86 ++++---- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 195 +++++++++++------- .../transform_contraction_to_gemm.hpp | 32 +-- 6 files changed, 209 insertions(+), 136 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index 4c7a934a754..c2f2e000cd7 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -100,12 +100,12 @@ using DeviceGemmInstance = 32, // KPerBlock 8, // K1 // Gemm 1 - 64, // NPerBlock - 32, // LPerBlock - 8, // L1 - 16, // MPerWMMA - 16, // LPerWMMA - 16, // NPerWMMA + 64, // NPerBlock + 32, // LTilePerBlock + 8, // L1 + 16, // MPerWMMA + 16, // LPerWMMA + 16, // NPerWMMA // Per repeat = wave_m = wave_num, wave_n = 1 1, // MRepeat 8, // LRepeat @@ -124,7 +124,7 @@ using DeviceGemmInstance = 8, 8, true, - S<4, 8, 8>, // B1BlockTransfer LN -> L0 N L1 + S<4, 8, 8>, // B1BlockTransfer NL -> L0 N L1 S<0, 2, 1>, S<0, 2, 1>, 1, diff --git 
a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index e0f8f61435c..099ea7354b4 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -122,20 +122,20 @@ int run(int argc, char* argv[]) b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); break; - case 5: // Rand: b1 ; unit: a b0 fail + case 5: // Rand: b1 b0; unit: a a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; - case 6: // Rand: b0 ; unit: a b1 pass - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + case 6: // Rand: a b0 ; unit: b1 pass + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); break; - case 7: // Rand: a ; unit: b0 b1 pass + case 7: // Rand: a b1 ; unit: b0 pass a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index fe448e5bcef..3ca81a52726 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -7,6 +7,7 @@ #include 
"ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp" #include "ck/tensor_description/tensor_adaptor.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #define CK_MNK_LOOP @@ -340,6 +341,7 @@ struct BlockwiseGemmWMMA b_thread_desc_, make_tuple(I0, n0, I0, I0, I0), b_thread_buf); + vector_type a_thread_vec; vector_type b_thread_vec; @@ -413,7 +415,7 @@ struct BlockwiseGemmWMMA A_K1, 0x76543210, 0xfedcba98, - true>; + TransposeC ? false : true>; }; template @@ -448,7 +450,7 @@ struct BlockwiseGemmWMMA B_K1, 0x76543210, 0xfedcba98, - false>; + TransposeC ? true : false>; }; typename AThreadCopySelector::type a_thread_copy_; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 113abefcb3c..0de115a70d9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -56,11 +56,11 @@ template {}; static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto WmmaK = 16; static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); - static constexpr auto WmmaK = 16; - static constexpr auto AEnableLds = LWaves == 1 ? false : true; - // static constexpr auto B0EnableLds = MWaves == 1 ? false : true; - // static constexpr auto B1EnableLds = MWaves == 1 ? false : true; + static constexpr auto AEnableLds = LWaves == 1 ? false : true; + static constexpr auto B0EnableLds = MWaves == 1 ? 
false : true; + static constexpr auto B1EnableLds = MWaves == 1 ? false : true; using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< Sequence, @@ -165,14 +168,17 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle else { return Transform::MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AKRow_MPerWmma_AK1( - Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), - WmmaK, Number{}, Number{}, Number{}, Number{}) - + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); } } - static auto MakeB0GridDescriptor_BK0_L_BK1(const std::vector& b0_gs_ls_ks_lengths_vec, - const std::vector& b0_gs_ls_ks_strides_vec) + static auto MakeB0GridDescriptor(const std::vector& b0_gs_ls_ks_lengths_vec, + const std::vector& b0_gs_ls_ks_strides_vec) { return Transform::MakeB0GridDescriptor_BK0_N_BK1( Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, b0_gs_ls_ks_strides_vec), @@ -188,7 +194,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } using AGridDesc = decltype(MakeAGridDescriptor({}, {})); - using B0GridDesc_BK0_L_BK1 = decltype(MakeB0GridDescriptor_BK0_L_BK1({}, {})); + using B0GridDesc_BK0_L_BK1 = decltype(MakeB0GridDescriptor({}, {})); using B1GridDesc_BL0_N_BL1 = decltype(MakeB1GridDescriptor_BL0_N_BL1({}, {})); using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); @@ -277,11 +283,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle KPerBlock, K1, NPerBlock, - LPerBlock, + LTilePerBlock, L1, - MPerWMMA, - LPerWMMA, - NPerWMMA, + MPerWmma, + LPerWmma, + NPerWmma, MRepeat, LRepeat, NRepeat, @@ -357,10 +363,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle p_b0_grid_{p_b0_grid}, p_b1_grid_{p_b1_grid}, p_c_grid_{p_c_grid}, - a_grid_desc_ak0_m_ak1_{ - 
DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b0_grid_desc_bk0_l_bk1_{DeviceOp::MakeB0GridDescriptor_BK0_L_BK1( - b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + a_grid_desc{DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b0_grid_desc_bk0_l_bk1_{ + DeviceOp::MakeB0GridDescriptor(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, b1_grid_desc_bl0_n_bl1_{DeviceOp::MakeB1GridDescriptor_BL0_N_BL1( b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, c_grid_desc_m_n_{ @@ -405,7 +410,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle ignore = acc1_biases_gs_ms_ns_lengths; ignore = acc1_biases_gs_ms_ns_strides; - if(GridwiseOp::CheckValidity(a_grid_desc_ak0_m_ak1_, + if(GridwiseOp::CheckValidity(a_grid_desc, b0_grid_desc_bk0_l_bk1_, b1_grid_desc_bl0_n_bl1_, c_grid_desc_m_n_, @@ -424,7 +429,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle CDataType* p_c_grid_; // Tensor Descriptors - AGridDesc a_grid_desc_ak0_m_ak1_; + AGridDesc a_grid_desc; B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1_; B1GridDesc_BL0_N_BL1 b1_grid_desc_bl0_n_bl1_; CGridDesc_M_N c_grid_desc_m_n_; @@ -473,8 +478,17 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle const index_t grid_size = arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; - const auto K = - arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + const auto K = [&]() { + if constexpr(AEnableLds) + { + return arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I2); + } + else + { + return arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I3) * + arg.a_grid_desc.GetLength(I5); + } + }(); auto launch_kernel = [&](auto has_main_k_block_loop) { const auto kernel = kernel_batched_gemm_softmax_gemm_wmma_cshuffle< @@ -506,7 +520,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle arg.p_b0_grid_, arg.p_b1_grid_, arg.p_c_grid_, - arg.a_grid_desc_ak0_m_ak1_, + arg.a_grid_desc, 
arg.b0_grid_desc_bk0_l_bk1_, arg.b1_grid_desc_bl0_n_bl1_, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, @@ -551,20 +565,23 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle { if constexpr(!(is_same_v || is_same_v)) { + printf("DeviceOp: Acc0 Type err"); return false; } if constexpr(!(is_same_v || is_same_v)) { + printf("DeviceOp: Acc1 Type err"); return false; } } else { + printf("DeviceOp: Arch err"); return false; } - if(!GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + if(!GridwiseOp::CheckValidity(arg.a_grid_desc, arg.b0_grid_desc_bk0_l_bk1_, arg.b1_grid_desc_bl0_n_bl1_, arg.c_grid_desc_m_n_, @@ -574,14 +591,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); - const index_t c_n = arg.c_grid_desc_m_n_.GetLength(I1); - const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); - const index_t b1_n = arg.b1_grid_desc_bl0_n_bl1_.GetLength(I1); + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - if(!(c_g == arg.batch_count_ && c_m == a_m && c_n == b1_n)) + if(!(c_g == arg.batch_count_)) { + printf("DeviceOp: BatchCount err"); return false; } @@ -604,6 +618,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) { + printf("DeviceOp: Data Transfer Vector scalar err"); return false; } @@ -619,6 +634,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || c_stride_lowest == 1)) { + printf("DeviceOp: Data Vectorize transfer err"); return false; } @@ -765,7 +781,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle << K1 << ", " << MPerBlock << ", " << NPerBlock << ", " - << LPerBlock << ", " + << 
LTilePerBlock << ", " << L1 << getGemmSpecializationString(GemmSpec) << ", " << "ASpec" << getTensorSpecializationString(ASpec) << ", " diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index b6eb7612169..75631e27bb1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -23,7 +23,7 @@ template {}; static constexpr auto AK1 = Number{}; - static constexpr auto BK0 = Number{}; + static constexpr auto BK0 = Number{}; static constexpr auto BK1 = Number{}; - static constexpr auto L0PerBlock = LPerBlock / L1Value; - static constexpr auto AL0 = Number{}; - static constexpr auto AL1 = Number{}; - static constexpr auto BL0 = Number{}; - static constexpr auto BL1 = Number{}; + static constexpr auto L0PerBlock = LTilePerBlock / L1Value; + static constexpr auto AL0 = Number{}; + static constexpr auto AL1 = Number{}; + static constexpr auto BL0 = Number{}; + static constexpr auto BL1 = Number{}; static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); @@ -209,8 +209,12 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle using ThisThreadBlock = ThisThreadBlock; - using GridwiseGemmPipe = remove_cvref_t())>; + using GridwiseGemmPipe = + remove_cvref_t())>; __host__ __device__ static constexpr auto MakeABlockDescriptor() { @@ -238,7 +242,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto KWmmaPerblock = KPerBlock / WmmaK; // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, K1), + make_tuple(Number{}, Number{}, I1, I1, I1, AK1), make_tuple(Number{} * AK1, AK1, AK1, AK1, AK1, I1)); } }(); @@ -349,9 +353,9 @@ struct 
GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle __host__ __device__ static constexpr auto MakeB1BlockDescriptor_L0_N0_N1_N2_L1(const B1BlockDesc_BL0_N_BL1&) { - constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); - constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); - constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); + constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); + constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); + return transform_tensor_descriptor( B1BlockDesc_BL0_N_BL1{}, make_tuple(make_pass_through_transform(Number{}), @@ -399,16 +403,19 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle { // LDS allocation for A and B: be careful of alignment const index_t gemm0_bytes_end = - (SharedMemTrait::a_block_space_size_aligned + - SharedMemTrait::b0_block_space_size_aligned); + (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + + SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); const index_t gemm1_bytes_end = - (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned); + (SharedMemTrait::b1_block_space_offset + + SharedMemTrait::b1_block_space_size_aligned * sizeof(FloatB1)); - const index_t softmax_bytes_end = SharedMemTrait::reduction_space_offset + - SharedMemTrait::reduction_space_size_aligned + const index_t softmax_bytes_end = + SharedMemTrait::reduction_space_offset + + SharedMemTrait::reduction_space_size_aligned * sizeof(FloatAcc0); - const index_t c_block_bytes_end = SharedMemTrait::c_block_space_size; + const index_t c_block_bytes_end = + SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); } @@ -416,7 +423,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc_AK0_M_AK1& 
a_grid_desc_ak0_m_ak1, + CheckValidity(const AGridDesc& a_grid_desc, const B0GridDesc_BK0_L_BK1& b0_grid_desc_bk0_l_bk1, const B1GridDesc_BL0_N_BL1& b1_grid_desc_l0_n_l1, const CGridDesc_M_N& c_grid_desc_m_n, @@ -426,19 +433,48 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle (LPerBlock % (LPerWmma * LRepeat)) == 0, "Invalid tuning param!"); - const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto GetAProblemsizeMK = [&]() { + if constexpr(AEnableLds) + { + return make_tuple(a_grid_desc.GetLength(I1), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * + a_grid_desc.GetLength(I4), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I5)); + } + }; + + const auto M = GetAProblemsizeMK()[I0]; const auto L = b0_grid_desc_bk0_l_bk1.GetLength(I1); - const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + const auto K = GetAProblemsizeMK()[I1]; const auto N = b1_grid_desc_l0_n_l1.GetLength(I1); - const auto KPerBlock = K0PerBlock * K1Value; if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) { + printf("GridwiseOp: M/N Length err, A_M/N = %d, %d | C_M/N = %d, %d\n", + M, + N, + c_grid_desc_m_n.GetLength(I0), + c_grid_desc_m_n.GetLength(I1)); return false; } if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && N % NPerBlock == 0)) { + printf("GridwiseOp: M/L/K/N Division err, M/L/K/N = %d, %d, %d, %d | M/L/K/NPerBlock = " + "%d, %d, %d, %d\n", + M, + L, + K, + N, + MPerBlock, + LPerBlock, + KPerBlock, + NPerBlock); return false; } @@ -446,18 +482,23 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle const auto num_gemm0_k_loop = K / KPerBlock; if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) { + printf("GridwiseOp: outer loop unsupport\n"); return false; } // check gemm1 gridwise gemm pipeline - if(!(LPerBlock % (L0PerBlock * L1Value) == 0)) + 
if(!(LPerBlock % LTilePerBlock == 0)) { + printf("GridwiseOp: inner loop division, L/LTilePerblock: %d, %d\n", + LPerBlock, + LTilePerBlock); return false; } - const auto num_gemm1_k_inner_loop = LPerBlock / (L0PerBlock * L1Value); + const auto num_gemm1_k_inner_loop = LPerBlock / LTilePerBlock; if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) { + printf("GridwiseOp: inner loop unsupport\n"); return false; } @@ -472,7 +513,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / (K0PerBlock * K1Value); + const index_t num_loop = K / KPerBlock; return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -514,28 +555,38 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // LDS allocation for A and B: be careful of alignment static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), BL1); - static constexpr auto a_block_space_size_aligned = AEnableLds ? math::integer_least_multiple( - MakeABlockDescriptor().GetElementSpaceSize() * sizeof(FloatA), max_lds_align) : 0; - static constexpr auto b0_block_space_size_aligned = B0EnableLds ? math::integer_least_multiple( - GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1().GetElementSpaceSize() * sizeof(FloatB0), max_lds_align) : 0; - static constexpr auto b1_block_space_size_aligned = B1EnableLds ? math::integer_least_multiple( - GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1().GetElementSpaceSize() * sizeof(FloatB1), max_lds_align) : 0; + static constexpr auto a_block_space_size_aligned = + AEnableLds ? math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(), + max_lds_align) + : 0; + static constexpr auto b0_block_space_size_aligned = + B0EnableLds + ? math::integer_least_multiple( + GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1().GetElementSpaceSize(), + max_lds_align) + : 0; + static constexpr auto b1_block_space_size_aligned = + B1EnableLds + ? 
math::integer_least_multiple( + GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1().GetElementSpaceSize(), + max_lds_align) + : 0; static constexpr auto a_block_space_offset = 0; - static constexpr auto b0_block_space_offset = a_block_space_size_aligned.value; + static constexpr auto b0_block_space_offset = a_block_space_size_aligned; static constexpr auto b1_block_space_offset = 0; // LDS allocation for reduction // Feature to add, IntraThread Reduction static constexpr index_t reduction_space_size_aligned = - math::integer_least_multiple(BlockSize, max_lds_align) * sizeof(FloatAcc0); + math::integer_least_multiple(BlockSize, max_lds_align); static constexpr auto reduction_space_offset = 0; // LDS allocation for C shuffle in LDS static constexpr auto c_block_space_size = GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() - .GetElementSpaceSize() * sizeof(FloatCShuffle); + .GetElementSpaceSize(); }; template ( - p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + p_a_grid, a_grid_desc.GetElementSpaceSize()); const auto b0_grid_buf = make_dynamic_buffer( p_b0_grid, b0_grid_desc_k0_l_k1.GetElementSpaceSize()); const auto b1_grid_buf = make_dynamic_buffer( @@ -601,7 +652,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto a_block_desc = MakeABlockDescriptor(); constexpr auto b0_block_desc_k0perblock_lperblock_k1 = GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); - + auto a_block_trait = [&](){ // A matrix blockwise copy if constexpr(AEnableLds) @@ -610,17 +661,18 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle auto a_block_buf = make_dynamic_buffer( static_cast(p_shared) + SharedMemTrait::a_block_space_offset, SharedMemTrait::a_block_space_size_aligned); + auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, /* typename SrcElementwiseOperation, */ AElementwiseOperation, /* typename DstElementwiseOperation, */ ck::tensor_operation::element_wise::PassThrough, /* InMemoryDataOperationEnum 
DstInMemOp, */ InMemoryDataOperationEnum::Set, -/* typename BlockSliceLengths, */ Sequence, +/* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, /* typename SrcData, */ FloatA, /* typename DstData, */ FloatA, -/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), +/* typename SrcDesc, */ decltype(a_grid_desc), /* typename DstDesc, */ decltype(a_block_desc), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, /* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, @@ -632,7 +684,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /* index_t DstScalarStrideInVector, */ 1, /* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, /* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( - a_grid_desc_k0_m_k1, + a_grid_desc, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, a_block_desc, @@ -713,7 +765,6 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // Gemm0 - constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1Value, WmmaK); auto blockwise_gemm0 = BlockwiseGemmWMMA< @@ -725,7 +776,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle decltype(MakeB0BlockDescriptor_K0_L0_L1_L2_K1(b0_block_desc_k0perblock_lperblock_k1)), MPerBlock, LPerBlock, - K0PerBlock * K1Value, + KPerBlock, MPerWmma, LPerWmma, MRepeat, @@ -759,18 +810,20 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // LDS allocation for A and B: be careful of alignment - auto b0_block_buf = make_dynamic_buffer(static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, - b0_block_desc_k0perblock_lperblock_k1.GetElementSpaceSize()); + auto b0_block_buf = make_dynamic_buffer( + 
static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, + SharedMemTrait::b0_block_space_size_aligned); // Shift Per SUB_K constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep(); - constexpr auto b0_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto b0_block_slice_copy_step = make_multi_index(BK0, 0, 0); const auto a_block_reset_copy_step = [&](){ if constexpr(AEnableLds){ - return make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); + return make_multi_index(-a_grid_desc.GetLength(I0), 0, 0); + } else{ - return make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0, 0, 0, 0); + return make_multi_index(-a_grid_desc.GetLength(I0), 0, 0, 0, 0, 0); } }(); @@ -836,24 +889,23 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); constexpr auto b1_block_slice_copy_step = make_multi_index(BL0, 0, 0); + // Acc0 thread buffer -> A1 thread buffer -> blockwise gemm // A1 matrix in VGPR constexpr auto A1ThreadSlice_L0PerBlock_MPerBlock_L1 = make_tuple( Number{}, Number{}, - Number{}); // Data duplicated dimension + Number{}); constexpr auto A1ThreadSliceL0PerBlock = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I0]; constexpr auto A1ThreadSliceMPerBlock = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I1]; constexpr auto A1ThreadSliceL1 = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I2]; - // A1 has duplicated data - constexpr auto A1ThreadDuplicatedDim = I2 * A1ThreadSliceL1; constexpr auto a1_thread_desc_l0perblock_mperblock_l1 = make_naive_tensor_descriptor( - make_tuple(A1ThreadSliceL0PerBlock, A1ThreadSliceMPerBlock, A1ThreadDuplicatedDim), - make_tuple(A1ThreadSliceMPerBlock * A1ThreadDuplicatedDim, A1ThreadDuplicatedDim, I1)); + make_tuple(A1ThreadSliceL0PerBlock, A1ThreadSliceMPerBlock, A1ThreadSliceL1), + make_tuple(A1ThreadSliceMPerBlock * A1ThreadSliceL1, A1ThreadSliceL1, I1)); // A1 matrix blockwise copy - auto a1_blockwise_copy = 
ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< + auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< FloatAcc0, FloatA, decltype(acc0_thread_desc_l0perblock_mperblock_l1), @@ -862,13 +914,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle Sequence, Sequence<0, 1, 2>, 2, - laccvgprs, - // dst Rowlane - // 0x76543210 0xfedcba98 - // src Rowlane - 0x76543210, 0xfedcba98, - false>{}; - + laccvgprs>{tensor_operation::element_wise::PassThrough{}}; + // B1 matrix blockwise copy auto b1_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, @@ -904,7 +951,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle a1_thread_desc_l0perblock_mperblock_l1.GetElementSpaceSize()); auto b1_block_buf = make_dynamic_buffer( static_cast(p_shared)+ SharedMemTrait::b1_block_space_offset, - b1_block_desc_l0perblock_nperblock_l1.GetElementSpaceSize()); + SharedMemTrait::b1_block_space_size_aligned); auto blockwise_gemm1 = BlockwiseGemmWMMA c_thread_buf; c_thread_buf.Clear(); /*******************************************************************************/ + // + // Kernel Main Stage + // // Flash Attention // Dao, Tri, et al. "Flashattention: Fast and memory-efficient exact attention with io-awareness." arXiv preprint arXiv:2205.14135 (2022). 
index_t gemm1_l_block_outer_index = 0; @@ -947,7 +997,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle continue; } // gemm0 start, A-B swaped - GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + GridwiseGemmPipe::template Run(a_grid_desc, a_block_desc, a_blockwise_copy, a_grid_buf, @@ -1019,10 +1069,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle [&](auto i) { acc_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); }); } - block_sync_lds(); - // gemm0 end - // gemm0 incorrect // Tiled softmax start // softmax SoftmaxBuf& max = blockwise_softmax.max_value_buf; @@ -1130,7 +1177,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle }); }); - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_reset_copy_step); // rewind K b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc_k0_l_k1, b0_block_reset_copy_step); // rewind K and step N diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp index 7df174a7ecc..8474d8a617f 100644 --- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -179,24 +179,32 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } - template + template __host__ __device__ static constexpr auto MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AKRow_MPerWmma_AK1( - const AGridDesc_M_K& a_grid_desc_m_k, const Number& WmmaK, const Number& MRepeat, - const Number& MWaves, const Number& MPerWmma, const Number& AK1) + const AGridDesc_M_K& a_grid_desc_m_k, + const WmmaK&, + const MRepeat&, + const MWaves&, + const MPerWmma&, + const AK1&) { - const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlcok; + const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlock; const auto K = 
a_grid_desc_m_k.GetLength(I1); - const auto AKWmma = K / WmmaK; - constexpr auto AKRow = WmmaK / K1; + const auto AKWmma = K / WmmaK{}; + constexpr auto AKRow = WmmaK{} / AK1{}; return transform_tensor_descriptor( - a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AKWmma, Number{}, AK1)), - make_unmerge_transform( - make_tuple(M0 * MRepeat, MWaves, MPerWmma))), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AKWmma, AKRow, AK1{})), + make_unmerge_transform(make_tuple(M0 * MRepeat{}, MWaves{}, MPerWmma{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); } // From a045e0beb5fbfe74a5375dab612e53201b920b43 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 28 Feb 2023 10:06:07 +0000 Subject: [PATCH 046/118] Example branch provide to compiler team --- CMakeLists.txt | 1 - example/01_gemm/gemm_wmma_fp16.cpp | 2 +- .../CMakeLists.txt | 8 - ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 162 --- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 771 ----------- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 1194 ----------------- 6 files changed, 1 insertion(+), 2137 deletions(-) delete mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp delete mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp delete mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f861e302039..a950f41e3da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -240,7 +240,6 @@ include_directories(BEFORE SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") if(BUILD_DEV) - add_compile_options(-Werror) add_compile_options(-Weverything) endif() message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") diff --git 
a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 797cff5346d..20243bd0e08 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -38,7 +38,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 256, // BlockSize 128, // MPerBlock 128, // NPerBlock - 64, // KPerBlock + 32, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index c7a9e537a8d..8d9aaec85a5 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -5,9 +5,6 @@ add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) -if(GPU_TARGETS MATCHES "gfx1100") - add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) -endif() add_custom_target(example_gemm_scale_softmax_gemm) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16) @@ -17,8 +14,3 @@ add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_soft add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm 
example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) - -if(GPU_TARGETS MATCHES "gfx1100") - add_custom_target(example_gemm_scale_softmax_gemm_wmma) - add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16) -endif() diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp deleted file mode 100644 index 14ffb040350..00000000000 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -/* -Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g_k_l) * B1_g_l_n - |-----------------| - Gemm0 - |-------------------------------------| - Gemm1 -*/ - -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/literals.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" - -template -using S = ck::Sequence; - -using F16 = ck::half_t; -using F32 = float; - -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - -using ADataType = F16; -using B0DataType = F16; -using B1DataType = 
F16; -using Acc0DataType = F32; -using Acc1DataType = F32; -using CShuffleDataType = F32; -using CDataType = F16; -using Acc0BiasDataType = ck::Tuple<>; -using Acc1BiasDataType = ck::Tuple<>; - -static constexpr ck::index_t NumDimG = 2; -static constexpr ck::index_t NumDimM = 1; -static constexpr ck::index_t NumDimN = 1; -static constexpr ck::index_t NumDimK = 1; -static constexpr ck::index_t NumDimO = 1; - -using AElementOp = PassThrough; -using B0ElementOp = PassThrough; -using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; -using B1ElementOp = PassThrough; -using CElementOp = PassThrough; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; -static constexpr auto MaskingSpec = - ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; - -static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; -static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; -static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; -static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; - -using DeviceGemmInstance = - ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< - NumDimG, - NumDimM, - NumDimN, - NumDimK, - NumDimO, - ADataType, - B0DataType, - B1DataType, - Acc0BiasDataType, - Acc0DataType, - Acc1BiasDataType, - Acc1DataType, - CShuffleDataType, - CDataType, - AElementOp, - B0ElementOp, - Acc0ElementOp, - B1ElementOp, - CElementOp, - GemmSpec, - TensorSpecA, - TensorSpecB0, - TensorSpecB1, - TensorSpecC, - 256, - 128, // MPerBlock - 128, // LPerBlock - 4, // K0PerBlock - 8, // K1 - 64, // NPerBlock - 4, // L0PerBlock - 8, // L1 - 16, // MPerWMMA - 16, // LPerWMMA - 16, // NPerWMMA - // Per repeat = wave_m = wave_num, wave_n = 1 - 1, // MRepeat - 8, // LRepeat - 4, // NRepeat - S<4, 64, 1>, // ABlockTransfer MK -> K0 M K1 - 
S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - true, - S<4, 64, 1>, // B0BlockTransfer LK -> K0 L K1 - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - true, - S<4, 8, 8>, // B1BlockTransfer LN -> L0 N L1 - S<0, 2, 1>, - S<0, 2, 1>, - 1, - 8, - 1, - false, - 1, // CShuffleMWmmaPerWavePerShuffle - 2, // CShuffleNWmmaPerWavePerShuffle - S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock - 4, // CShuffleBlockTransferScalarPerVector_NPerBlock - MaskingSpec>; // MaskingSpecialization - -// Ref Gemm0: fp16 in, fp32 out -using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; - -// Ref Softmax: fp32 in, fp16 out -using ReferenceSoftmaxInstance = - ck::tensor_operation::host::ReferenceSoftmax; - -// Ref Gemm1: fp16 in, fp16 out -using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; - -#include "run_batched_gemm_scale_softmax_gemm_permute.inc" - -int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp deleted file mode 100644 index eac76633d69..00000000000 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ /dev/null @@ -1,771 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include -#include - -#include "ck/utility/common_header.hpp" -#include "ck/tensor_description/tensor_descriptor.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" -#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" -#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp" -#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp" -#include "ck/host_utility/device_prop.hpp" -#include "ck/host_utility/kernel_launch.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { - -// Computes C = A * B0 * B1 -// MN = MK * KL * LN -// ^^^^^^ (Acc0) -// ^^^^^^^^^^^ (Acc1) -template -struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle - : public DeviceBatchedGemmSoftmaxGemmPermute -{ - static_assert(NumDimG > 0 && NumDimM > 0 && NumDimL > 0 && NumDimK > 0 && NumDimN > 0, - "Number of dimension must be greater than 0"); - - static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); - static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size(); - - // TODO ANT: implement bias combination - static_assert(NumAcc0Bias == 0 && NumAcc0Bias == 0, "Bias addition is unimplemented"); - - static constexpr index_t NumDimGemm0M = NumDimM; - static constexpr index_t NumDimGemm0N = NumDimL; - static constexpr index_t NumDimGemm0K = NumDimK; - static constexpr index_t NumDimGemm1M = NumDimM; - static constexpr index_t NumDimGemm1N = NumDimN; - static constexpr index_t NumDimGemm1K = NumDimL; - - static constexpr index_t KPerBlock = K0PerBlock * K1; - - using DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle; - - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; 
- static constexpr auto I3 = Number<3>{}; - - using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< - Sequence, - Sequence, - GemmSpec, - ASpec, - B0Spec, - B1Spec, - CSpec>; - - static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) - { - return Transform::MakeAGridDescriptor_AK0_M_AK1( - Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), - Number{}); - } - - static auto MakeB0GridDescriptor_BK0_L_BK1(const std::vector& b0_gs_ls_ks_lengths_vec, - const std::vector& b0_gs_ls_ks_strides_vec) - { - return Transform::MakeB0GridDescriptor_BK0_N_BK1( - Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, b0_gs_ls_ks_strides_vec), - Number{}); - } - - static auto MakeB1GridDescriptor_BL0_N_BL1(const std::vector& b1_gs_ns_ls_lengths_vec, - const std::vector& b1_gs_ns_ls_strides_vec) - { - return Transform::MakeB1GridDescriptor_BK0_N_BK1( - Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, b1_gs_ns_ls_strides_vec), - Number{}); - } - - using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1({}, {})); - using B0GridDesc_BK0_L_BK1 = decltype(MakeB0GridDescriptor_BK0_L_BK1({}, {})); - using B1GridDesc_BL0_N_BL1 = decltype(MakeB1GridDescriptor_BL0_N_BL1({}, {})); - using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); - using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); - using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); - using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); - using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); - - constexpr static auto make_MaskOutPredicate() - { - if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) - { - return MaskDisabledPredicate{}; - } - else if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) - { - return 
MaskOutUpperTrianglePredicate{}; - } - } - using C0MatrixMask = C0MatrixMask_impl; - - struct ComputeBasePtrOfStridedBatch - { - ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, - const B0GridDesc_G_L_K& b0_grid_desc_g_l_k, - const B1GridDesc_G_N_L& b1_grid_desc_g_n_l, - const CGridDesc_G_M_N& c_grid_desc_g_m_n) - : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), - b0_grid_desc_g_l_k_(b0_grid_desc_g_l_k), - b1_grid_desc_g_n_l_(b1_grid_desc_g_n_l), - c_grid_desc_g_m_n_(c_grid_desc_g_m_n) - { - } - - __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const - { - return a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); - } - - __host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const - { - return b0_grid_desc_g_l_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); - } - - __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const - { - return b1_grid_desc_g_n_l_.CalculateOffset(make_multi_index(g_idx, 0, 0)); - } - - __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const - { - return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0)); - } - - private: - AGridDesc_G_M_K a_grid_desc_g_m_k_; - B0GridDesc_G_L_K b0_grid_desc_g_l_k_; - B1GridDesc_G_N_L b1_grid_desc_g_n_l_; - CGridDesc_G_M_N c_grid_desc_g_m_n_; - }; - - // GridwiseOp - using GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle< - // DataType Family - ADataType, - B0DataType, - Acc0DataType, - B1DataType, - Acc1DataType, - CShuffleDataType, - CDataType, - // ElementwiseOp Family - AElementwiseOperation, - B0ElementwiseOperation, - AccElementwiseOperation, - B1ElementwiseOperation, - CElementwiseOperation, - InMemoryDataOperationEnum::Set, - // InMemory Data Descriptor - AGridDesc_AK0_M_AK1, - B0GridDesc_BK0_L_BK1, - B1GridDesc_BL0_N_BL1, - CGridDesc_M_N, - // Tiling Family - MPerBlock, - LPerBlock, - K0PerBlock, // K0 * K1 = Gemm0 GEMM_K Dim - K1, // - NPerBlock, - L0PerBlock, - L1, - 
MPerWMMA, - LPerWMMA, - NPerWMMA, - MRepeat, - LRepeat, - NRepeat, - // ThreadCluster Family - BlockSize, - ABlockTransferThreadClusterLengths_K0_M_K1, - ABlockTransferThreadClusterArrangeOrder, - ABlockTransferSrcAccessOrder, - ABlockTransferSrcVectorDim, - ABlockTransferSrcScalarPerVector, - ABlockTransferDstScalarPerVector_K1, - true, - ABlockLdsAddExtraM, - B0BlockTransferThreadClusterLengths_K0_L_K1, - B0BlockTransferThreadClusterArrangeOrder, - B0BlockTransferSrcAccessOrder, - B0BlockTransferSrcVectorDim, - B0BlockTransferSrcScalarPerVector, - B0BlockTransferDstScalarPerVector_K1, - true, - B0BlockLdsAddExtraL, - B1BlockTransferThreadClusterLengths_L0_N_L1, - B1BlockTransferThreadClusterArrangeOrder, - B1BlockTransferSrcAccessOrder, - B1BlockTransferSrcVectorDim, - B1BlockTransferSrcScalarPerVector, - B1BlockTransferDstScalarPerVector_L1, - false, - B1BlockLdsAddExtraN, - CShuffleMRepeatPerShuffle, - CShuffleNRepeatPerShuffle, - CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, - CShuffleBlockTransferScalarPerVector_NPerBlock, - Transform::matrix_padder.PadN, - MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle, - NumPrefetch, - LoopSched, - PipelineVer>; - - // Argument - struct Argument : public BaseArgument - { - Argument( - const ADataType* p_a_grid, - const B0DataType* p_b0_grid, - const B1DataType* p_b1_grid, - CDataType* p_c_grid, - const std::array p_acc0_biases, - const std::array p_acc1_biases, - const std::vector& a_gs_ms_ks_lengths, - const std::vector& a_gs_ms_ks_strides, - const std::vector& b0_gs_ls_ks_lengths, - const std::vector& b0_gs_ls_ks_strides, - const std::vector& b1_gs_ns_ls_lengths, - const std::vector& b1_gs_ns_ls_strides, - const std::vector& c_gs_ms_ns_lengths, - const std::vector& c_gs_ms_ns_strides, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, - const 
std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, - const index_t M01, - const index_t N01, - AElementwiseOperation a_element_op, - B0ElementwiseOperation b0_element_op, - AccElementwiseOperation acc_element_op, - B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) - : p_a_grid_{p_a_grid}, - p_b0_grid_{p_b0_grid}, - p_b1_grid_{p_b1_grid}, - p_c_grid_{p_c_grid}, - a_grid_desc_ak0_m_ak1_{ - DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b0_grid_desc_bk0_l_bk1_{DeviceOp::MakeB0GridDescriptor_BK0_L_BK1( - b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, - b1_grid_desc_bl0_n_bl1_{DeviceOp::MakeB1GridDescriptor_BL0_N_BL1( - b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, - c_grid_desc_m_n_{ - Transform::MakeCGridDescriptor_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, - a_grid_desc_g_m_k_{ - Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b0_grid_desc_g_l_k_{ - Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, - b1_grid_desc_g_n_l_{ - Transform::MakeB1GridDescriptor_G_N_K(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, - c_grid_desc_g_m_n_{ - Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, - c_grid_desc_mblock_mperblock_nblock_nperblock_{}, - block_2_ctile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)}, - a_element_op_{a_element_op}, - b0_element_op_{b0_element_op}, - acc_element_op_{acc_element_op}, - b1_element_op_{b1_element_op}, - c_element_op_{c_element_op}, - c0_matrix_mask_{b0_grid_desc_g_l_k_.GetLength(I1)}, - raw_lengths_mz_lz_kz_nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], - b0_gs_ls_ks_lengths[NumDimG + NumDimL - 1], - b0_gs_ls_ks_lengths[NumDimG + NumDimL + NumDimK - 1], - b1_gs_ns_ls_lengths[NumDimG + NumDimN - 1]}, - a_mz_kz_strides_{a_gs_ms_ks_strides[NumDimG + NumDimM - 1], - a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, - b0_lz_kz_strides_{b0_gs_ls_ks_strides[NumDimG + 
NumDimL - 1], - b0_gs_ls_ks_strides[NumDimG + NumDimL + NumDimK - 1]}, - b1_nz_lz_strides_{b1_gs_ns_ls_strides[NumDimG + NumDimN - 1], - b1_gs_ns_ls_strides[NumDimG + NumDimN + NumDimL - 1]}, - c_mz_nz_strides_{c_gs_ms_ns_strides[NumDimG + NumDimM - 1], - c_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]}, - batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}, - compute_ptr_offset_of_batch_{ - a_grid_desc_g_m_k_, b0_grid_desc_g_l_k_, b1_grid_desc_g_n_l_, c_grid_desc_g_m_n_} - { - // TODO ANT: implement bias addition - ignore = p_acc0_biases; - ignore = p_acc1_biases; - ignore = acc0_biases_gs_ms_ls_lengths; - ignore = acc0_biases_gs_ms_ls_strides; - ignore = acc1_biases_gs_ms_ns_lengths; - ignore = acc1_biases_gs_ms_ns_strides; - - if(GridwiseOp::CheckValidity(a_grid_desc_ak0_m_ak1_, - b0_grid_desc_bk0_l_bk1_, - b1_grid_desc_bl0_n_bl1_, - c_grid_desc_m_n_, - block_2_ctile_map_)) - { - c_grid_desc_mblock_mperblock_nblock_nperblock_ = - GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( - c_grid_desc_m_n_); - } - } - - // Pointers - const ADataType* p_a_grid_; - const B0DataType* p_b0_grid_; - const B1DataType* p_b1_grid_; - CDataType* p_c_grid_; - - // Tensor Descriptors - AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; - B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1_; - B1GridDesc_BL0_N_BL1 b1_grid_desc_bl0_n_bl1_; - CGridDesc_M_N c_grid_desc_m_n_; - - AGridDesc_G_M_K a_grid_desc_g_m_k_; - B0GridDesc_G_L_K b0_grid_desc_g_l_k_; - B1GridDesc_G_N_L b1_grid_desc_g_n_l_; - CGridDesc_G_M_N c_grid_desc_g_m_n_; - - typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock - c_grid_desc_mblock_mperblock_nblock_nperblock_; - - // Block to Tile mapping - typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map_; - - // ElementwiseOp - AElementwiseOperation a_element_op_; - B0ElementwiseOperation b0_element_op_; - AccElementwiseOperation acc_element_op_; - B1ElementwiseOperation b1_element_op_; - CElementwiseOperation c_element_op_; - - // check C0 
masking and padding - C0MatrixMask c0_matrix_mask_; - - // Strides for the last M/N/K dimensions of A/B0/B1/C - // for sanity check of vector load/store - std::vector raw_lengths_mz_lz_kz_nz_; - std::vector a_mz_kz_strides_; - std::vector b0_lz_kz_strides_; - std::vector b1_nz_lz_strides_; - std::vector c_mz_nz_strides_; - - index_t batch_count_; - // Batch Offset - ComputeBasePtrOfStridedBatch compute_ptr_offset_of_batch_; - }; - - // Invoker - struct Invoker : public BaseInvoker - { - using Argument = DeviceOp::Argument; - - float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) - { - const index_t grid_size = - arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; - - const auto K = - arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); - - auto launch_kernel = [&](auto has_main_k_block_loop) { - const auto kernel = kernel_batched_gemm_softmax_gemm_wmma_cshuffle< - GridwiseOp, - ADataType, - B0DataType, - B1DataType, - CDataType, - DeviceOp::AGridDesc_AK0_M_AK1, - DeviceOp::B0GridDesc_BK0_L_BK1, - DeviceOp::B1GridDesc_BL0_N_BL1, - typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, - AElementwiseOperation, - B0ElementwiseOperation, - AccElementwiseOperation, - B1ElementwiseOperation, - CElementwiseOperation, - ComputeBasePtrOfStridedBatch, - C0MatrixMask, - typename GridwiseOp::DefaultBlock2CTileMap, - has_main_k_block_loop>; - - return launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b0_grid_, - arg.p_b1_grid_, - arg.p_c_grid_, - arg.a_grid_desc_ak0_m_ak1_, - arg.b0_grid_desc_bk0_l_bk1_, - arg.b1_grid_desc_bl0_n_bl1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b0_element_op_, - arg.acc_element_op_, - arg.b1_element_op_, - arg.c_element_op_, - arg.batch_count_, - arg.compute_ptr_offset_of_batch_, - arg.c0_matrix_mask_, - arg.block_2_ctile_map_); - }; - 
- if(GridwiseOp::CalculateHasMainKBlockLoop(K)) - { - return launch_kernel(integral_constant{}); - } - else - { - return launch_kernel(integral_constant{}); - } - } - - // polymorphic - float Run(const BaseArgument* p_arg, - const StreamConfig& stream_config = StreamConfig{}) override - { - return Run(*dynamic_cast(p_arg), stream_config); - } - }; - - static constexpr bool IsValidCompilationParameter() - { - // TODO: properly implement this check - return true; - } - - static bool IsSupportedArgument(const Argument& arg) - { - if(ck::get_device_name() == "gfx1100") - { - if constexpr(!(is_same_v || is_same_v)) - { - return false; - } - - if constexpr(!(is_same_v || is_same_v)) - { - return false; - } - } - else - { - return false; - } - - if(!GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, - arg.b0_grid_desc_bk0_l_bk1_, - arg.b1_grid_desc_bl0_n_bl1_, - arg.c_grid_desc_m_n_, - arg.block_2_ctile_map_)) - { - return false; - } - - // Check if C permute dimension matches GEMM + GEMM shape - const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded - const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); - const index_t c_n = arg.c_grid_desc_m_n_.GetLength(I1); - const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); - const index_t b1_n = arg.b1_grid_desc_bl0_n_bl1_.GetLength(I1); - - if(!(c_g == arg.batch_count_ && c_m == a_m && c_n == b1_n)) - { - return false; - } - - // Note: we need raw lengths since threadwise copy can not handle vector load when part of - // vector is out of bounds - // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O - const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; - const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1]; - const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; - const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; - - // Check scalar per vector requirement - const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? 
KzRaw : MzRaw; - const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; - const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw; - const auto c_extent_lowest = NzRaw; - - if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && - b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && - b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && - c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) - { - return false; - } - - // Check vector load/store requirement - const auto a_stride_lowest = - ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; - const auto b0_stride_lowest = - B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; - const auto b1_stride_lowest = - B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; - const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; - - if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || - c_stride_lowest == 1)) - { - return false; - } - - return true; - } - - // polymorphic - bool IsSupportedArgument(const BaseArgument* p_arg) override - { - return IsSupportedArgument(*dynamic_cast(p_arg)); - } - - static auto MakeArgument( - const ADataType* p_a, - const B0DataType* p_b0, - const B1DataType* p_b1, - CDataType* p_c, - const std::array p_acc0_biases, - const std::array p_acc1_biases, - const std::vector& a_gs_ms_ks_lengths, - const std::vector& a_gs_ms_ks_strides, - const std::vector& b0_gs_ls_ks_lengths, - const std::vector& b0_gs_ls_ks_strides, - const std::vector& b1_gs_ns_ls_lengths, - const std::vector& b1_gs_ns_ls_strides, - const std::vector& c_gs_ms_ns_lengths, - const std::vector& c_gs_ms_ns_strides, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, - const std::array, 
NumAcc1Bias> acc1_biases_gs_ms_ns_strides, - AElementwiseOperation a_element_op, - B0ElementwiseOperation b0_element_op, - AccElementwiseOperation acc_element_op, - B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) - { - return Argument{p_a, - p_b0, - p_b1, - p_c, - p_acc0_biases, - p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ls_ks_lengths, - b0_gs_ls_ks_strides, - b1_gs_ns_ls_lengths, - b1_gs_ns_ls_strides, - c_gs_ms_ns_lengths, - c_gs_ms_ns_strides, - acc0_biases_gs_ms_ls_lengths, - acc0_biases_gs_ms_ls_strides, - acc1_biases_gs_ms_ns_lengths, - acc1_biases_gs_ms_ns_strides, - 1, - 1, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op}; - } - - // polymorphic - std::unique_ptr MakeArgumentPointer( - const void* p_a, - const void* p_b0, - const void* p_b1, - void* p_c, - const std::array p_acc0_biases, - const std::array p_acc1_biases, - const std::vector& a_gs_ms_ks_lengths, - const std::vector& a_gs_ms_ks_strides, - const std::vector& b0_gs_ls_ks_lengths, - const std::vector& b0_gs_ls_ks_strides, - const std::vector& b1_gs_ns_ls_lengths, - const std::vector& b1_gs_ns_ls_strides, - const std::vector& c_gs_ms_ns_lengths, - const std::vector& c_gs_ms_ns_strides, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, - const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, - const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, - AElementwiseOperation a_element_op, - B0ElementwiseOperation b0_element_op, - AccElementwiseOperation acc_element_op, - B1ElementwiseOperation b1_element_op, - CElementwiseOperation c_element_op) override - { - return std::make_unique(static_cast(p_a), - static_cast(p_b0), - static_cast(p_b1), - static_cast(p_c), - p_acc0_biases, - p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ls_ks_lengths, - b0_gs_ls_ks_strides, - b1_gs_ns_ls_lengths, - 
b1_gs_ns_ls_strides, - c_gs_ms_ns_lengths, - c_gs_ms_ns_strides, - acc0_biases_gs_ms_ls_lengths, - acc0_biases_gs_ms_ls_strides, - acc1_biases_gs_ms_ns_lengths, - acc1_biases_gs_ms_ns_strides, - 1, - 1, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op); - } - - static auto MakeInvoker() { return Invoker{}; } - - // polymorphic - std::unique_ptr MakeInvokerPointer() override - { - return std::make_unique(Invoker{}); - } - - // polymorphic - std::string GetTypeString() const override - { - auto str = std::stringstream(); - - std::map LoopSchedToString{ - {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; - - std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, - {PipelineVersion::v2, "v2"}}; - - // clang-format off - str << "DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle" - << "<" - << BlockSize << ", " - << MPerBlock << ", " - << LPerBlock << ", " - << K0PerBlock << ", " - << K1 << ", " - << MPerBlock << ", " - << NPerBlock << ", " - << L0PerBlock << ", " - << L1 - << getGemmSpecializationString(GemmSpec) << ", " - << "ASpec" << getTensorSpecializationString(ASpec) << ", " - << "B0Spec" << getTensorSpecializationString(B0Spec) << ", " - << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " - << "CSpec" << getTensorSpecializationString(CSpec) << ", " - << getMaskingSpecializationString(MaskingSpec) - << ">" - << " NumPrefetch: " - << NumPrefetch << ", " - << "LoopScheduler: " - << LoopSchedToString[LoopSched] << ", " - << "PipelineVersion: " - << PipelineVersionToString[PipelineVer]; - // clang-format on - - return str.str(); - } -}; - -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp deleted file mode 100644 index ebfa0765a0a..00000000000 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ /dev/null @@ -1,1194 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/common_header.hpp" -#include "ck/tensor_description/multi_index_transform_helper.hpp" -#include "ck/tensor_description/tensor_descriptor.hpp" -#include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp" -#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" -#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_softmax.hpp" - -namespace ck { - -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_batched_gemm_softmax_gemm_wmma_cshuffle( - const FloatA* __restrict__ p_a_grid, - const FloatB0* __restrict__ p_b0_grid, - const FloatB1* __restrict__ p_b1_grid, - FloatC* __restrict__ p_c_grid, - const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, - const B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1, - const B1GridDesc_BL0_N_BL1 b1_grid_desc_l0_n_l1, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock - c_grid_desc_mblock_mperblock_nblock_nperblock, - const AElementwiseOperation a_element_op, - const B0ElementwiseOperation b0_element_op, - const AccElementwiseOperation acc_element_op, - const B1ElementwiseOperation b1_element_op, - const CElementwiseOperation c_element_op, - const index_t batch_count, - const ComputeBasePtrOfStridedBatch 
compute_base_ptr_of_batch, - const C0MatrixMask c0_matrix_mask, - const Block2CTileMap block_2_ctile_map) -{ -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) - __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; - - const index_t num_blocks_per_batch = - __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); - const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - - const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); - const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx))); - const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); - const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); - - GridwiseGemm::template Run(p_a_grid + a_batch_offset, - p_b0_grid + b0_batch_offset, - p_b1_grid + b1_batch_offset, - p_c_grid + c_batch_offset, - p_shared, - a_grid_desc_ak0_m_ak1, - b0_grid_desc_bk0_l_bk1, - b1_grid_desc_l0_n_l1, - c_grid_desc_mblock_mperblock_nblock_nperblock, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op, - c0_matrix_mask, - block_2_ctile_map); -#else - ignore = p_a_grid; - ignore = p_b0_grid; - ignore = p_b1_grid; - ignore = p_c_grid; - ignore = a_grid_desc_ak0_m_ak1; - ignore = b0_grid_desc_bk0_l_bk1; - ignore = b1_grid_desc_l0_n_l1; - ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; - ignore = a_element_op; - ignore = b0_element_op; - ignore = acc_element_op; - ignore = b1_element_op; - ignore = c_element_op; - ignore = batch_count; - ignore = compute_base_ptr_of_batch; - ignore = c0_matrix_mask; - ignore = block_2_ctile_map; -#endif // end of if (defined(__gfx1100__)) -} - -// Gemm0: A [M x K] x B0 [K x L] = Acc [M x L] -// 
Gemm1: Acc [M x L] x B1 [L x N] = C [M x N] -template -struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto I5 = Number<5>{}; - static constexpr auto I6 = Number<6>{}; - static constexpr auto I7 = Number<7>{}; - - static constexpr auto AK0 = Number{}; - static constexpr auto AK1 = Number{}; - static constexpr auto BK0 = Number{}; - static constexpr auto BK1 = Number{}; - - static constexpr auto AL0 = Number{}; - static constexpr auto AL1 = Number{}; - static constexpr auto BL0 = Number{}; - static constexpr auto BL1 = Number{}; - - using ThisThreadBlock = ThisThreadBlock; - - using GridwiseGemmPipe = remove_cvref_t())>; - - template - __host__ __device__ static constexpr auto - MakeA0BlockDescriptor_K0_M0_M1_M2_K1(const A0BlockDesc_AK0_M_AK1&) - { - constexpr index_t A_K0 = A0BlockDesc_AK0_M_AK1{}.GetLength(I0); - constexpr index_t A_K1 = A0BlockDesc_AK0_M_AK1{}.GetLength(I2); - constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWmma); - - return transform_tensor_descriptor( - A0BlockDesc_AK0_M_AK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - template - __host__ __device__ static constexpr auto - MakeB0BlockDescriptor_K0_L0_L1_L2_K1(const B0BlockDesc_BK0_L_BK1&) - { - constexpr index_t B_K0 = B0BlockDesc_BK0_L_BK1{}.GetLength(I0); - constexpr index_t B_K1 = B0BlockDesc_BK0_L_BK1{}.GetLength(I2); - constexpr index_t LWaves = LPerBlock / (LRepeat * LPerWmma); - return transform_tensor_descriptor( - B0BlockDesc_BK0_L_BK1{}, - make_tuple(make_pass_through_transform(Number{}), - 
make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - template - __host__ __device__ static constexpr auto - MakeA1BlockDescriptor_L0_M0_M1_M2_L1(const A1BlockDesc_AL0_M_AL1&) - { - constexpr index_t A_L0 = A1BlockDesc_AL0_M_AL1{}.GetLength(I0); - constexpr index_t A_L1 = A1BlockDesc_AL0_M_AL1{}.GetLength(I2); - - return transform_tensor_descriptor( - A1BlockDesc_AL0_M_AL1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform(make_tuple(Number{}, I1, I1)), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - template - __host__ __device__ static constexpr auto - MakeB1BlockDescriptor_L0_N0_N1_N2_L1(const B1BlockDesc_BL0_N_BL1&) - { - constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); - constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); - constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWmma); - return transform_tensor_descriptor( - B1BlockDesc_BL0_N_BL1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() - { - // A matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(AK0, Number{}, AK1), - make_tuple(Number{} * AK1, AK1, I1)); - } - - __host__ __device__ static constexpr auto GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1() - { - // B matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - 
make_tuple(BK0, Number{}, BK1), - make_tuple(Number{} * BK1, BK1, I1)); - } - - __host__ __device__ static constexpr auto GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1() - { - // B1 matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(BL0, Number{}, BL1), - make_tuple(Number{} * BL1, BL1, I1)); - } - - __host__ __device__ static constexpr auto - // *Caution Here repeat is shuffle repeat - GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() - { - constexpr index_t MWave = MPerBlock / (MRepeat * MPerWmma); - constexpr index_t NWave = NPerBlock / (NRepeat * NPerWmma); - - constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = - make_naive_tensor_descriptor_packed( - make_tuple(I1, - Number{}, - I1, - Number{})); - - return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; - } - - __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() - { - // LDS allocation for A and B: be careful of alignment - const index_t gemm0_bytes_end = - (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + - SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); - - const index_t gemm1_bytes_end = - (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * - sizeof(FloatB1); - - const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + - SharedMemTrait::reduction_space_size_aligned) * - sizeof(FloatAcc0); - - const index_t c_block_bytes_end = - SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); - - return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); - } - - // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} - template - __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, - const B0GridDesc_BK0_L_BK1& b0_grid_desc_bk0_l_bk1, - const B1GridDesc_BL0_N_BL1& 
b1_grid_desc_l0_n_l1, - const CGridDesc_M_N& c_grid_desc_m_n, - const Block2CTileMap& block_2_ctile_map) - { - static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && - (LPerBlock % (LPerWmma * LRepeat)) == 0, - "Invalid tuning param!"); - - const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); - const auto L = b0_grid_desc_bk0_l_bk1.GetLength(I1); - const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); - const auto N = b1_grid_desc_l0_n_l1.GetLength(I1); - - const auto KPerBlock = K0PerBlock * K1Value; - if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) - { - return false; - } - - if(!(M % MPerBlock == 0 && L % LPerBlock == 0 && K % KPerBlock == 0 && N % NPerBlock == 0)) - { - return false; - } - - // check gemm0 gridwise gemm pipeline - const auto num_gemm0_k_loop = K / KPerBlock; - if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) - { - return false; - } - - // check gemm1 gridwise gemm pipeline - if(!(LPerBlock % (L0PerBlock * L1Value) == 0)) - { - return false; - } - - const auto num_gemm1_k_inner_loop = LPerBlock / (L0PerBlock * L1Value); - if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) - { - return false; - } - - if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) - { - return false; - } - - // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) - return true; - } - - __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) - { - const index_t num_loop = K / (K0PerBlock * K1Value); - - return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); - } - - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto MBlock = M / MPerBlock; - const auto NBlock = N / NPerBlock; - - const auto c_grid_desc_mblock_mperblock_nblock_nperblock = 
transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), - make_unmerge_transform(make_tuple(NBlock, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); - - return c_grid_desc_mblock_mperblock_nblock_nperblock; - } - - // return block_id to C matrix tile idx (m0, n0) mapping - __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( - const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) - { - return BlockToCTileMap_M00_N0_M01Adapt( - c_grid_desc_m_n); - } - - using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; - using DefaultBlock2CTileMap = - remove_cvref_t; - - struct SharedMemTrait - { - // LDS allocation for A and B: be careful of alignment - static constexpr auto a_block_desc_ak0_m_ak1 = - GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); - static constexpr auto b0_block_desc_bk0_l_bk1 = - GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); - static constexpr auto b1_block_desc_bl0_n_bl1 = - GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); - - static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), BL1); - - static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); - static constexpr auto b0_block_space_size_aligned = math::integer_least_multiple( - b0_block_desc_bk0_l_bk1.GetElementSpaceSize(), max_lds_align); - static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( - b1_block_desc_bl0_n_bl1.GetElementSpaceSize(), max_lds_align); - - static constexpr auto a_block_space_offset = 0; - static constexpr auto b0_block_space_offset = a_block_space_size_aligned.value; - static constexpr auto b1_block_space_offset = 0; - - // LDS allocation for reduction - static constexpr index_t reduction_space_size_aligned = - math::integer_least_multiple(BlockSize, max_lds_align); - - 
static constexpr auto reduction_space_offset = 0; - - // LDS allocation for C shuffle in LDS - static constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = - GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); - static constexpr auto c_block_space_size = - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat - .GetElementSpaceSize(); - }; - - template - __device__ static void Run(const FloatA* __restrict__ p_a_grid, - const FloatB0* __restrict__ p_b0_grid, - const FloatB1* __restrict__ p_b1_grid, - FloatC* __restrict__ p_c_grid, - void* __restrict__ p_shared, - const AGridDesc_AK0_M_AK1& a_grid_desc_k0_m_k1, - const B0GridDesc_BK0_L_BK1& b0_grid_desc_k0_l_k1, - const B1GridDesc_BL0_N_BL1& b1_grid_desc_l0_n_l1, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& - c_grid_desc_mblock_mperblock_nblock_nperblock, - const AElementwiseOperation& a_element_op, - const B0ElementwiseOperation& b0_element_op, - const AccElementwiseOperation& acc_element_op, - const B1ElementwiseOperation& b1_element_op, - const CElementwiseOperation& c_element_op, - const C0MatrixMask& c0_matrix_mask, - const Block2CTileMap& block_2_ctile_map) - { - // clang-format off -/*******************************************************************************/ -// Memory buffer zone. 
- const auto a_grid_buf = make_dynamic_buffer( - p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); - const auto b0_grid_buf = make_dynamic_buffer( - p_b0_grid, b0_grid_desc_k0_l_k1.GetElementSpaceSize()); - const auto b1_grid_buf = make_dynamic_buffer( - p_b1_grid, b1_grid_desc_l0_n_l1.GetElementSpaceSize()); - auto c_grid_buf = make_dynamic_buffer( - p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); - -/*******************************************************************************/ -// BlockIdx.x -> [BlockId.m, BlockId.n] - const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); - if(!block_2_ctile_map.ValidCTileIndex( - block_work_idx, - make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), - c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) - { return; } - - // Store BlockId into SGPR - const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); - const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); - -/*******************************************************************************/ -// set up Gemm0 -/*******************************************************************************/ - -/*******************************************************************************/ -// BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - - constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); - constexpr auto b0_block_desc_k0perblock_lperblock_k1 = GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); - - // A matrix blockwise copy - auto a_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, -/* typename SrcElementwiseOperation, */ AElementwiseOperation, -/* typename DstElementwiseOperation, */ 
ck::tensor_operation::element_wise::PassThrough, -/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, -/* typename BlockSliceLengths, */ Sequence, -/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, -/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatA, -/* typename DstData, */ FloatA, -/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), -/* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), -/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, -/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, -/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, -/* index_t DstVectorDim, */ 2, -/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, -/* index_t DstScalarPerVector, */ ABlockTransferDstScalarPerVector_K1, -/* index_t SrcScalarStrideInVector, */ 1, -/* index_t DstScalarStrideInVector, */ 1, -/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, -/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( - a_grid_desc_k0_m_k1, - make_multi_index(0, m_block_data_idx_on_grid, 0), - a_element_op, - a_block_desc_k0perblock_mperblock_k1, - make_multi_index(0, 0, 0), - ck::tensor_operation::element_wise::PassThrough{}); - - // B matrix blockwise copy - auto b0_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - B0BlockTransferThreadClusterLengths_K0_L_K1, - B0BlockTransferThreadClusterArrangeOrder, - FloatB0, - FloatB0, - decltype(b0_grid_desc_k0_l_k1), - decltype(b0_block_desc_k0perblock_lperblock_k1), - B0BlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - B0BlockTransferSrcVectorDim, - 2, - B0BlockTransferSrcScalarPerVector, - B0BlockTransferDstScalarPerVector_K1, - 1, - 1, - B0ThreadTransferSrcResetCoordinateAfterRun, - true>( - b0_grid_desc_k0_l_k1, - make_multi_index(0, 0, 0), - b0_element_op, - b0_block_desc_k0perblock_lperblock_k1, - make_multi_index(0, 
0, 0), - ck::tensor_operation::element_wise::PassThrough{}); - -/*******************************************************************************/ - // Gemm0 - constexpr auto WmmaK = 16; - constexpr auto KPack = math::integer_least_multiple(K1Value, WmmaK); - - auto blockwise_gemm0 = BlockwiseGemmWMMA< - BlockSize, - FloatA, - FloatB0, - FloatAcc0, - decltype(MakeA0BlockDescriptor_K0_M0_M1_M2_K1(a_block_desc_k0perblock_mperblock_k1)), - decltype(MakeB0BlockDescriptor_K0_L0_L1_L2_K1(b0_block_desc_k0perblock_lperblock_k1)), - MPerBlock, - LPerBlock, - K0PerBlock * K1Value, - MPerWmma, - LPerWmma, - MRepeat, - LRepeat, - KPack, - true>{}; // C' = B' x A' - - - // Prepare Register for A*B0 matrix - auto acc0_thread_buf = blockwise_gemm0.GetCThreadBuffer(); - - constexpr auto acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = - blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); - - constexpr auto mrepeat = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I0); - constexpr auto mwave = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I1); - constexpr auto mthreadpersubgroup = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I2); - constexpr auto lrepeat = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I3); - constexpr auto lwave = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I4); - constexpr auto lsubgroup = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I5); - constexpr auto laccvgprs = acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I6); - - constexpr auto acc0_thread_desc_l0perblock_mperblock_l1 = transform_tensor_descriptor( - 
acc0_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, - make_tuple(make_merge_transform_v3_division_mod(make_tuple(lrepeat, lwave, lsubgroup)), - make_merge_transform_v3_division_mod(make_tuple(mrepeat, mwave, mthreadpersubgroup)), - make_pass_through_transform(laccvgprs)), - make_tuple(Sequence<3, 4, 5>{}, Sequence<0, 1, 2>{}, Sequence<6>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - -/*******************************************************************************/ - // LDS allocation for A and B: be careful of alignment - auto a_block_buf = make_dynamic_buffer(static_cast(p_shared) + SharedMemTrait::a_block_space_offset, - a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); - auto b0_block_buf = make_dynamic_buffer(static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, - b0_block_desc_k0perblock_lperblock_k1.GetElementSpaceSize()); - - // Shift Per SUB_K - constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - constexpr auto b0_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - const auto a_block_reset_copy_step = make_multi_index(-a_grid_desc_k0_m_k1.GetLength(I0), 0, 0); - const auto b0_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); - - const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); -/*******************************************************************************/ -// softmax -/*******************************************************************************/ - auto workspace_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::reduction_space_offset, - SharedMemTrait::reduction_space_size_aligned); - // get acc0 7D thread cluster - constexpr auto thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = - blockwise_gemm0.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths() / - 
blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths(); - constexpr auto t_mrepeat = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I0); - constexpr auto t_mwave = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I1); - constexpr auto t_mthreadpersubgroup = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I2); - constexpr auto t_lrepeat = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I3); - constexpr auto t_lwave = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I4); - constexpr auto t_lsubgroup = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I5); - constexpr auto t_laccvgprs = thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.At(I6); - // get acc0 thread map - constexpr auto m0_l_m1_to_m_l_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(t_mrepeat * t_mwave, t_mthreadpersubgroup)), - make_pass_through_transform(I1)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - constexpr auto threadid_to_m0_l_m1_adaptor = make_single_stage_tensor_adaptor( - make_tuple( - make_merge_transform( - make_tuple(t_mrepeat * t_mwave, t_lrepeat * t_lwave * t_lsubgroup * t_laccvgprs, t_mthreadpersubgroup))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - const auto threadid_to_l_n_thread_cluster_adaptor = - chain_tensor_adaptors(m0_l_m1_to_m_l_adaptor, threadid_to_m0_l_m1_adaptor); - - // get acc0 2D thread cluster & 2D thread slice - constexpr auto thread_cluster_desc_m_l = make_naive_tensor_descriptor_packed( - make_tuple(t_mrepeat * t_mwave * t_mthreadpersubgroup, t_lrepeat * t_lwave * t_lsubgroup * t_laccvgprs)); - - constexpr auto thread_slice_desc_m_l = 
make_naive_tensor_descriptor_packed( - make_tuple(mrepeat * mwave * mthreadpersubgroup, lrepeat * lwave * lsubgroup * laccvgprs)); - - auto blockwise_softmax = BlockwiseSoftmax{}; - - // Initialize running sum and max of exponentiating row vectors - using SoftmaxBuf = typename decltype(blockwise_softmax)::BufferType; - SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; - running_sum = 0; - running_sum_new = 0; - running_max = NumericLimits::Lowest(); - running_max_new = NumericLimits::Lowest(); -/*******************************************************************************/ -// set up Gemm1 -/*******************************************************************************/ - // B1 matrix in LDS memory, dst of blockwise copy - constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); - constexpr auto b1_block_slice_copy_step = make_multi_index(BL0, 0, 0); - - // A1 matrix in VGPR - constexpr auto A1ThreadSlice_L0PerBlock_MPerBlock_L1 = make_tuple( - Number{}, - Number{}, - Number{}); // Data duplicated dimension - - constexpr auto A1ThreadSliceL0PerBlock = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I0]; - constexpr auto A1ThreadSliceMPerBlock = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I1]; - constexpr auto A1ThreadSliceL1 = A1ThreadSlice_L0PerBlock_MPerBlock_L1[I2]; - - // A1 has duplicated data - constexpr auto A1ThreadDuplicatedDim = I2 * A1ThreadSliceL1; - constexpr auto a1_thread_desc_l0perblock_mperblock_l1 = make_naive_tensor_descriptor( - make_tuple(A1ThreadSliceL0PerBlock, A1ThreadSliceMPerBlock, A1ThreadDuplicatedDim), - make_tuple(A1ThreadSliceMPerBlock * A1ThreadDuplicatedDim, A1ThreadDuplicatedDim, I1)); - - // A1 matrix blockwise copy - auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< - FloatAcc0, - FloatA, - decltype(acc0_thread_desc_l0perblock_mperblock_l1), - decltype(a1_thread_desc_l0perblock_mperblock_l1), - tensor_operation::element_wise::PassThrough, - 
Sequence, - Sequence<0, 1, 2>, - 2, - laccvgprs, - // dst Rowlane - // 0x76543210 0xfedcba98 - // src Rowlane - 0x76543210, 0xfedcba98, - false>{tensor_operation::element_wise::PassThrough{}}; - - // B1 matrix blockwise copy - auto b1_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, -/* typename SrcElementwiseOperation, */ B1ElementwiseOperation, -/* typename DstElementwiseOperation, */ tensor_operation::element_wise::PassThrough, -/* InMemoryDataOperationEnum DstInMemOp, */ InMemoryDataOperationEnum::Set, -/* typename BlockSliceLengths, */ Sequence, -/* typename ThreadClusterLengths, */ B1BlockTransferThreadClusterLengths_L0_N_L1, -/* typename ThreadClusterArrangeOrder, */ B1BlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatB1, -/* typename DstData, */ FloatB1, -/* typename SrcDesc, */ decltype(b1_grid_desc_l0_n_l1), -/* typename DstDesc, */ decltype(b1_block_desc_l0perblock_nperblock_l1), -/* typename SrcDimAccessOrder, */ B1BlockTransferSrcAccessOrder, -/* typename DstDimAccessOrder, */ Sequence<1, 0, 2>, -/* index_t SrcVectorDim, */ B1BlockTransferSrcVectorDim, -/* index_t DstVectorDim, */ 2, -/* index_t SrcScalarPerVector, */ B1BlockTransferSrcScalarPerVector, -/* index_t DstScalarPerVector, */ B1BlockTransferDstScalarPerVector_L1, -/* index_t SrcScalarStrideInVector, */ 1, -/* index_t DstScalarStrideInVector, */ 1, -/* bool ThreadTransferSrcResetCoordinateAfterRun, */ B1ThreadTransferSrcResetCoordinateAfterRun, -/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, // DstResetCoord - NumGemmKPrefetchStage>( - b1_grid_desc_l0_n_l1, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b1_element_op, - b1_block_desc_l0perblock_nperblock_l1, - make_multi_index(0, 0, 0), - tensor_operation::element_wise::PassThrough{}); - - auto a1_thread_buf = make_static_buffer( - a1_thread_desc_l0perblock_mperblock_l1.GetElementSpaceSize()); - auto b1_block_buf = make_dynamic_buffer( - static_cast(p_shared)+ 
SharedMemTrait::b1_block_space_offset, - b1_block_desc_l0perblock_nperblock_l1.GetElementSpaceSize()); - - auto blockwise_gemm1 = - BlockwiseGemmWMMA{make_tuple(0, 0, 0, 0, 0)}; - - auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); - - const index_t num_gemm1_l_block_outer_loop = b0_grid_desc_k0_l_k1.GetLength(I1) / LPerBlock; - constexpr index_t num_gemm1_l_block_inner_loop = LPerBlock / (BL0 * BL1); - - // Initialize C - StaticBuffer c_thread_buf; - c_thread_buf.Clear(); - -/*******************************************************************************/ - // Flash Attention - // Dao, Tri, et al. "Flashattention: Fast and memory-efficient exact attention with io-awareness." arXiv preprint arXiv:2205.14135 (2022). - index_t gemm1_l_block_outer_index = 0; - // Outer loop, along GEMM_L - // Inner loop, along GEMM_K - do{ - auto l_block_data_idx_on_grid = - __builtin_amdgcn_readfirstlane(gemm1_l_block_outer_index * LPerBlock); - if(c0_matrix_mask.IsTileSkippable( - m_block_data_idx_on_grid, l_block_data_idx_on_grid, MPerBlock, LPerBlock)) - { - continue; - } - // gemm0 start, A-B swaped - GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, - a_block_desc_k0perblock_mperblock_k1, - a_blockwise_copy, - a_grid_buf, - a_block_buf, - a_block_slice_copy_step, - b0_grid_desc_k0_l_k1, - b0_block_desc_k0perblock_lperblock_k1, - b0_blockwise_copy, - b0_grid_buf, - b0_block_buf, - b0_block_slice_copy_step, - blockwise_gemm0, - acc0_thread_buf, - K0BlockMainLoop); - // do MNK padding or upper triangular masking - if constexpr(MaskOutUpperTriangle || PadN) - { - // 7d thread_desc in thread scope - constexpr auto c_thread_lengths = - blockwise_gemm0.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths(); - - // 7d block_desc in block scope - constexpr auto c_block_lengths = - blockwise_gemm0.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs().GetLengths(); - - constexpr auto MREPEAT = 
c_block_lengths[I0]; - constexpr auto MWAVE = c_block_lengths[I1]; - constexpr auto MTHREADSubGroup = c_block_lengths[I2]; - constexpr auto LREPEAT = c_block_lengths[I3]; - constexpr auto LWAVE = c_block_lengths[I4]; - constexpr auto LSUBGROUP = c_block_lengths[I5]; - constexpr auto LACCVGPRS = c_block_lengths[I6]; - - // works like multi-dimension static_for (static_ford), but provides both the linear - // index as well as n-d index - using Acc0TileIterator = SpaceFillingCurve< - decltype(c_thread_lengths), - typename arithmetic_sequence_gen<0, c_thread_lengths.Size(), 1>::type, - typename uniform_sequence_gen::type, - false>; // SnakeCurved - - auto acc0_thread_origin = blockwise_gemm0.CalculateCThreadOriginDataIndex7D( - Number<0>{}, Number<0>{}); - - constexpr auto block_idx_to_m_l_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(MREPEAT, MWAVE, MTHREADSubGroup)), - make_unmerge_transform(make_tuple(LREPEAT, LWAVE, LSUBGROUP, LACCVGPRS))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5, 6>{})); - - static_for<0, Acc0TileIterator::GetNumOfAccess(), 1>{}([&](auto i) { - auto acc0_thread_idx = Acc0TileIterator::GetIndex(i) + acc0_thread_origin; - auto m_local = block_idx_to_m_l_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; - auto l_local = block_idx_to_m_l_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; - auto m_global = m_local + m_block_data_idx_on_grid; - auto l_global = l_local + l_block_data_idx_on_grid; - if(c0_matrix_mask.IsMaskedElement(m_global, l_global)) - { - acc0_thread_buf(i) = -ck::NumericLimits::Infinity(); - } - else - { - acc_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); - } - }); - } - else - { static_for<0, acc0_thread_buf.Size(), 1>{}( - [&](auto i) { acc_element_op(acc0_thread_buf(i), acc0_thread_buf[i]); }); - } - - - block_sync_lds(); - // gemm0 end - // gemm0 incorrect - // Tiled softmax start - // softmax - SoftmaxBuf& max = 
blockwise_softmax.max_value_buf; - SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; - - blockwise_softmax.Run(acc0_thread_buf, workspace_buf); - - // TODO: may convert to log domain - running_max_new = mathext::max(max, running_max); - running_sum_new = mathext::exp(running_max - running_max_new) * running_sum + - mathext::exp(max - running_max_new) * sum; - - // gemm1 - { - // TODO: explore using dynamic buffer for a1 thread buffer - // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(), - // RunWrite(), and MoveSliceWindow(). But it is impossible to implement given that - // the A1 source buffer is static buffer holding the output of first GEMM and - // requires constexpr offset by design. Therefore, we pass tensor coordinate offset - // explicitly in Run() below. - - // Initialize acc1 - acc1_thread_buf.Clear(); - - // preload data into LDS - b1_blockwise_copy.RunRead(b1_grid_desc_l0_n_l1, b1_grid_buf); - - b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_l0_n_l1, - b1_block_slice_copy_step); - - block_sync_lds(); // wait for reduction LDS read - - b1_blockwise_copy.RunWrite(b1_block_desc_l0perblock_nperblock_l1, b1_block_buf); - - // main body - if constexpr(num_gemm1_l_block_inner_loop > 1) - { - static_for<0, num_gemm1_l_block_inner_loop - 1, 1>{}([&](auto i) { - // Data cast from FloatAcc0 to FloatA happen here - a1_blockwise_copy.Run(acc0_thread_desc_l0perblock_mperblock_l1, - make_tuple(Number{}, I0, I0), - acc0_thread_buf, - a1_thread_desc_l0perblock_mperblock_l1, - make_tuple(I0, I0, I0), - a1_thread_buf); - - b1_blockwise_copy.RunRead(b1_grid_desc_l0_n_l1, b1_grid_buf); - - block_sync_lds(); - - blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); - - block_sync_lds(); - - b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_l0_n_l1, - b1_block_slice_copy_step); - - b1_blockwise_copy.RunWrite(b1_block_desc_l0perblock_nperblock_l1, b1_block_buf); - }); - } - // tail - { - a1_blockwise_copy.Run( - 
acc0_thread_desc_l0perblock_mperblock_l1, - make_tuple( - Number<(num_gemm1_l_block_inner_loop - 1) * A1ThreadSliceL0PerBlock>{}, I0, I0), - acc0_thread_buf, - a1_thread_desc_l0perblock_mperblock_l1, - make_tuple(I0, I0, I0), - a1_thread_buf); - - block_sync_lds(); - - blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); - } - } // end gemm1 - - constexpr auto c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = - blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); - constexpr auto c_mrepeat = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I0); - constexpr auto c_mwave = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I1); - constexpr auto c_mthreadpersubgroup = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I2); - constexpr auto c_nrepeat = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I3); - constexpr auto c_nwave = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I4); - constexpr auto c_nsubgroup = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I5); - constexpr auto c_naccvgprs = c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs.GetLength(I6); - - constexpr auto c_thread_slice_desc_m_n = make_naive_tensor_descriptor_packed( - make_tuple(c_mrepeat * c_mwave * c_mthreadpersubgroup, - c_nrepeat * c_nwave * c_nsubgroup * c_naccvgprs)); - constexpr auto c_thread_buf_slice_m = c_thread_slice_desc_m_n.GetLength(I0); - constexpr auto c_thread_buf_slice_n = c_thread_slice_desc_m_n.GetLength(I1); - - static_for<0, c_thread_buf_slice_m, 1>{}([&](auto iM) { - static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) { - auto I = Number{}; - FloatAcc1 acc1 = acc1_thread_buf[I]; // P*V - 
FloatAcc1 c = c_thread_buf[I]; // O - FloatAcc1 c_new = - (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + - math::exp(max[iM] - running_max_new[iM]) * acc1) / - running_sum_new[iM]; - - c_thread_buf(I) = c_new; // O_new - }); - }); - - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, - a_block_reset_copy_step); // rewind K - b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc_k0_l_k1, - b0_block_reset_copy_step); // rewind K and step N - - // update before next j iteration - running_max = running_max_new; - running_sum = running_sum_new; - - block_sync_lds(); // wait for gemm1 LDS read - }while(++gemm1_l_block_outer_index < num_gemm1_l_block_outer_loop); -/*******************************************************************************/ - // write out to C, implement shuffle - { - constexpr auto c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = - blockwise_gemm1.GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); - - // This API Provide All dimension (size) you need - constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp = - blockwise_gemm1.GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs(); - - constexpr auto MWave = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I1); - constexpr auto MThreadPerSubGroup = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I2); - constexpr auto NWave = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I4); - constexpr auto NSubGroup = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I5); - constexpr auto NAccVgprs = c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs_tmp.GetLength(I6); - - // LDS descriptor, shuffle and write out in MRepeat x 
NRepeat times - constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = - GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); - - auto c_shuffle_block_buf = make_dynamic_buffer( - static_cast(p_shared), - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); - - constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = transform_tensor_descriptor( - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, - make_tuple( - make_freeze_transform(I0), - make_unmerge_transform(make_tuple( - Number{}, // MRepeat per shuffle repeat - MWave, // MWave - MThreadPerSubGroup // MThreadPerSubGroup = MPerWmma - )), - make_freeze_transform(I0), - make_unmerge_transform(make_tuple( - Number{}, // NRepeat per shuffle repeat - NWave, // NWave - NSubGroup, - NAccVgprs))), // NSubGroup * NAccVgprs = NPerWmma - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<>{}, Sequence<0, 1, 2>{}, Sequence<>{}, Sequence<3, 4, 5, 6>{})); - - // calculate origin of thread output tensor on global memory - // blockwise GEMM c matrix starting index - const auto c_thread_mtx_on_block = blockwise_gemm1.CalculateCThreadOriginDataIndex(I0, I0); - - const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; - const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; - - const auto m_thread_data_on_block_to_mrepeat_mwave_mthreadpersubgroup_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MThreadPerSubGroup))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - const auto n_thread_data_on_block_to_nrepeat_nwave_nsubgroup_naccvgprs_adaptor = - make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NSubGroup, NAccVgprs))), - make_tuple(Sequence<0, 1, 2, 3>{}), - make_tuple(Sequence<0>{})); - - const 
auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_mthreadpersubgroup_adaptor.CalculateBottomIndex( - make_multi_index(m_thread_data_on_block)); - - const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nsubgroup_naccvgprs_adaptor.CalculateBottomIndex( - make_multi_index(n_thread_data_on_block)); - - // shuffle: threadwise copy C from VGPR to LDS - auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3, - Sequence<0, 1, 2, 3, 4, 5, 6>, - 6, - 8, // vector write pixel - InMemoryDataOperationEnum::Set, - 1, - true>{ - c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, - make_multi_index(0, - m_thread_data_on_block_idx[I1], - m_thread_data_on_block_idx[I2], - 0, - n_thread_data_on_block_idx[I1], - n_thread_data_on_block_idx[I2], - n_thread_data_on_block_idx[I3]), - ck::tensor_operation::element_wise::PassThrough{}}; - - // shuffle: blockwise copy C from LDS to global - auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< - ThisThreadBlock, // ThreadGroup - CElementwiseOperation, // ElementwiseOperation, - CGlobalMemoryDataOperation, // DstInMemOp, - Sequence<1, - CShuffleMRepeatPerShuffle * MWave * MPerWmma, - 1, - CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths, - CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, - Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, - FloatCShuffle, // typename SrcData, - FloatC, // typename DstData, - decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), - decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), - Sequence<0, 1, 2, 3>, // typename DimAccessOrder, - 3, // index_t VectorDim, - CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, - true, // bool ThreadTransferSrcResetCoordinateAfterRun, - false> // bool ThreadTransferDstResetCoordinateAfterRun> - 
{c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, - make_multi_index(0, 0, 0, 0), - c_grid_desc_mblock_mperblock_nblock_nperblock, - make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), - c_element_op}; - - // space filling curve for local reg & global memory - // space filling curve for threadwise C in VGPR - constexpr auto sfc_c_vgpr = - SpaceFillingCurve, - Sequence<0, 1, 2, 3, 4, 5, 6>, - Sequence>{}; - - // space filling curve for shuffled blockwise C in global mem - constexpr auto sfc_c_global = - SpaceFillingCurve, - Sequence<0, 2, 1, 3>, - Sequence<1, - CShuffleMRepeatPerShuffle * MWave * MPerWmma, - 1, - CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{}; - - constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); - - static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); - - static_for<0, num_access, 1>{}([&](auto access_id) { - // make sure it's safe to write to LDS - block_sync_lds(); - - // each thread write its data from VGPR to LDS - c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, - sfc_c_vgpr.GetIndexTupleOfNumber(access_id), - c_thread_buf, - c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs, - c_shuffle_block_buf); - - // make sure it's safe to read from LDS - block_sync_lds(); - - // each block copy its data from LDS to global - c_shuffle_block_copy_lds_to_global.Run( - c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, - c_shuffle_block_buf, - c_grid_desc_mblock_mperblock_nblock_nperblock, - c_grid_buf); - - if constexpr(access_id < num_access - 1) - { - constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); - // move on C - c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( - c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); - } - }); - } - // clang-format on - } -}; - -} // namespace ck From 579f84c6a004be53d5948d680948fb95bb0571cc Mon Sep 17 00:00:00 2001 
From: aska-0096 Date: Mon, 6 Mar 2023 06:27:24 +0000 Subject: [PATCH 047/118] tempsave --- example/01_gemm/gemm_wmma_fp16.cpp | 8 +- example/01_gemm/run_gemm_example.inc | 2 +- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 2 +- include/ck/host_utility/kernel_launch.hpp | 4 +- .../gpu/block/blockwise_gemm_wmma.hpp | 5 +- .../gpu/device/impl/device_gemm_wmma.hpp | 3 + ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 481 ++++++++++++++---- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 5 + .../gpu/grid/gridwise_gemm_wmma.hpp | 4 +- 9 files changed, 395 insertions(+), 119 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 3945a085dca..f6fd5b4d19b 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -37,13 +37,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle GemmDefault, 256, // BlockSize 128, // MPerBlock - 16, // NPerBlock + 128, // NPerBlock 32, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 1, // M Repeat - 1, // N-Repeat + 2, // M Repeat + 4, // N-Repeat S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -60,7 +60,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle true, 1, // C shuffle (M Repeat) Per store 1, // C shuffle (N Repeat) Per store - S<1, 128, 1, 2>, + S<1, 64, 1, 4>, 8>; // clang-format on diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index e9b6e9830ce..30f11d9089f 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -44,7 +44,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); break; case 4: - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); break; default: diff --git 
a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index c2f2e000cd7..f79d75f99c3 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -129,7 +129,7 @@ using DeviceGemmInstance = S<0, 2, 1>, 1, 8, - 1, + 1, // be eight? false, 1, // CShuffleMWmmaPerWavePerShuffle 2, // CShuffleNWmmaPerWavePerShuffle diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index f5d534c75a3..f5ad7408b55 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -33,9 +33,9 @@ float launch_and_time_kernel(const StreamConfig& stream_config, printf("Warm up 1 time\n"); #endif // warm up - kernel<<>>(args...); + // kernel<<>>(args...); - const int nrepeat = 100; + const int nrepeat = 1; #if DEBUG_LOG printf("Start running %d times...\n", nrepeat); #endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 3ca81a52726..0668fe81932 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -27,6 +27,8 @@ template /* Option: Read from LDS, big buffer hold all threads required data * Source @@ -83,9 +85,6 @@ struct BlockwiseGemmWMMA static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); - static constexpr bool AEnableLds = NWaves == 1 ? false : true; - static constexpr bool BEnableLds = MWaves == 1 ? false : true; - // Read from Lds, duplicate Twice, Read from VGPR, no duplication. static constexpr index_t A_Data_Duplicated_Rate = AEnableLds ? 
2 : 1; static constexpr index_t B_Data_Duplicated_Rate = BEnableLds ? 2 : 1; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index d0211fe5a08..408222392f6 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -89,6 +89,9 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{MPerBlock, NPerBlock, KPerBlock}; // Describe how data read from Global memory diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index da2a5d36f32..555f0b61edb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -45,7 +45,7 @@ __global__ void const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, - const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, @@ -84,7 +84,7 @@ __global__ void p_ds_grid_grp, p_e_grid + e_batch_offset, p_shared, - a_grid_desc_k0_m_k1, + a_grid_desc, b_grid_desc_k0_n_k1, ds_grid_desc_mblock_mperblock_nblock_nperblock, e_grid_desc_mblock_mperblock_nblock_nperblock_, @@ -98,7 +98,7 @@ __global__ void ignore = p_ds_grid; ignore = p_e_grid; ignore = batch_count; - ignore = a_grid_desc_k0_m_k1; + ignore = a_grid_desc; ignore = b_grid_desc_k0_n_k1; ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_; @@ -115,7 +115,7 @@ template static constexpr auto K1 = Number{}; + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + 
static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + using ThisThreadBlock = ThisThreadBlock; - using GridwiseGemmPipe = remove_cvref_t())>; + using GridwiseGemmPipe = + remove_cvref_t())>; - __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + // Describe how data store to (LDS/VGPR) buffer from Global memory + __host__ __device__ static constexpr auto MakeABlockDescriptor() { - constexpr auto max_lds_align = K1; + constexpr auto a_block_desc = [&]() { + if constexpr(AEnableLds) + { + // K0->M->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + constexpr auto max_lds_align = K1; - // A matrix in LDS memory, dst of blockwise copy - constexpr auto a_block_desc_k0perblock_mperblock_k1 = [&]() { - if constexpr(ABlockLdsExtraM) + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + } + else { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); + make_tuple(Number{}, Number{}, I1, I1, I1, K1), + make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + } + }(); + + return a_block_desc; + } + + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() + { + constexpr auto a_block_copy_step = [&]() { + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = KPerBlock / K1; + + return make_multi_index(K0PerBlock, 0, 0); } else { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); } }(); - return a_block_desc_k0perblock_mperblock_k1; + 
return a_block_copy_step; + } + + __host__ __device__ static constexpr auto MakeBBlockSliceCopyStep() + { + constexpr auto b_block_copy_step = [&]() { + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock / K1; + + return make_multi_index(K0PerBlock, 0, 0); + } + else + { + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + } + }(); + + return b_block_copy_step; + } + + // Describe how data read from (LDS/VGPR) buffer + template + __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&) + { + + constexpr auto a_wave_desc = [&]() { + if constexpr(AEnableLds) + { + // AK0_M_AK1 -> AK0_MRepeat_Mwaves_MPerWmma_AK1 + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + else + { + // KWmma_MRepeat_MWave_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + make_pass_through_transform(Number{}), + make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0, 3>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + } + }(); + + return a_wave_desc; + } + + template + __host__ __device__ static constexpr auto + MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + { 
+ constexpr auto B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); + + return transform_tensor_descriptor( + BBlockDesc_BK0_N_BK1{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() { constexpr auto max_lds_align = K1; + constexpr auto K0PerBlock = KPerBlock / K1; // B matrix in LDS memory, dst of blockwise copy constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { @@ -416,28 +543,20 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() { // LDS allocation for A and B: be careful of alignment - constexpr auto a_block_desc_k0perblock_mperblock_k1 = - GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - - constexpr auto b_block_desc_k0perblock_nperblock_k1 = - GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - - constexpr auto max_lds_align = K1; - - constexpr auto a_block_space_size_aligned = math::integer_least_multiple( - a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); - - constexpr auto b_block_space_size_aligned = math::integer_least_multiple( - b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize(), max_lds_align); + const index_t gemm_bytes_end = + SharedMemTrait::a_block_space_size_aligned * sizeof(ADataType)+ + SharedMemTrait::b_block_space_size_aligned * sizeof(BDataType); + + const index_t c_block_bytes_end = + SharedMemTrait::c_shuffle_block_space_size * sizeof(CShuffleDataType); - return (a_block_space_size_aligned * sizeof(ADataType) + - b_block_space_size_aligned * sizeof(BDataType)); + return math::max(gemm_bytes_end, c_block_bytes_end); } // block_id to 
matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + CheckValidity(const AGridDesc& a_grid_desc, const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, const DsGridDesc_M_N& ds_grid_desc_m_n, const EGridDesc_M_N& e_grid_desc_m_n, @@ -450,9 +569,41 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle (NPerBlock % (NRepeat * NPerWmma)) == 0, "Invalid tuning param!"); - const auto M = a_grid_desc_k0_m_k1.GetLength(I1); - const auto N = b_grid_desc_k0_n_k1.GetLength(I1); - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + const auto GetAProblemsizeMK = [&]() { + if constexpr(AEnableLds) + { + return make_tuple(a_grid_desc.GetLength(I1), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * + a_grid_desc.GetLength(I4), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I5)); + } + }; + + const auto GetBProblemsizeNK = [&]() { + if constexpr(BEnableLds) + { + return make_tuple(b_grid_desc_k0_n_k1.GetLength(I1), + b_grid_desc_k0_n_k1.GetLength(I0) * + b_grid_desc_k0_n_k1.GetLength(I2)); + } + else + { + return make_tuple( + b_grid_desc_k0_n_k1.GetLength(I1) * b_grid_desc_k0_n_k1.GetLength(I2) * + b_grid_desc_k0_n_k1.GetLength(I4), + b_grid_desc_k0_n_k1.GetLength(I0) * b_grid_desc_k0_n_k1.GetLength(I3) * + b_grid_desc_k0_n_k1.GetLength(I5)); + } + }; + + const auto M = GetAProblemsizeMK()[I0]; + const auto N = GetBProblemsizeNK()[I0]; + const auto K = GetAProblemsizeMK()[I1]; bool valid = true; @@ -468,21 +619,20 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle } if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) && - K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && - K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + K == GetBProblemsizeNK()[I1])) { 
printf("GridwiseOp: ABE descriptor dimension cross check failure\n"); return false; } - if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) { printf("GridwiseOp: Problemsize descriptor dimension check failure\n"); return false; } // check gridwise gemm pipeline - const auto num_k_loop = K0 / K0PerBlock; + const auto num_k_loop = K / KPerBlock; if(!GridwiseGemmPipe::IsSupported(num_k_loop)) { @@ -546,6 +696,31 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle e_grid_desc_m_n); } + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + + static constexpr auto max_lds_align = K1; + + static constexpr auto a_block_space_size_aligned = + AEnableLds ? math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(), + max_lds_align): 0; + static constexpr auto b_block_space_size_aligned = + BEnableLds ? math::integer_least_multiple( + GetBBlockDescriptor_K0PerBlock_NPerBlock_K1().GetElementSpaceSize(), + max_lds_align): 0; + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b_block_space_offset = a_block_space_size_aligned; + + // LDS allocation for C shuffle in LDS + static constexpr auto c_shuffle_block_space_size = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + .GetElementSpaceSize(); + + static constexpr auto c_shuffle_block_space_offset = 0; + }; + using DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; using EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t( - p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + p_a_grid, a_grid_desc.GetElementSpaceSize()); const auto b_grid_buf = make_dynamic_buffer( p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); const auto ds_grid_buf = generate_tuple( @@ -603,23 +778,39 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle 
/*******************************************************************************/ // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy - const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); - constexpr auto max_lds_align = K1; - constexpr auto a_block_desc_k0perblock_mperblock_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); - constexpr auto b_block_desc_k0perblock_nperblock_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); - // A matrix blockwise copy - auto a_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, + const auto K = [&](){ + if constexpr(AEnableLds){ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); + } + else{ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); + } + }(); + + constexpr auto a_block_desc = MakeABlockDescriptor(); + constexpr auto b_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + auto a_block_trait = [&](){ + // A matrix blockwise copy + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), + a_block_desc.GetElementSpaceSize()); + + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ ADataType, -/* typename DstData, */ ADataType, -/* typename SrcDesc, */ decltype(a_grid_desc_k0_m_k1), -/* typename DstDesc, */ decltype(a_block_desc_k0perblock_mperblock_k1), +/* typename SrcData, */ FloatA, +/* typename DstData, */ FloatA, +/* typename SrcDesc, */ decltype(a_grid_desc), +/* typename DstDesc, */ decltype(a_block_desc), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, /* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, /* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, @@ -630,62 +821,138 @@ struct 
GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle /* index_t DstScalarStrideInVector, */ 1, /* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, /* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( - a_grid_desc_k0_m_k1, + a_grid_desc, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, - a_block_desc_k0perblock_mperblock_k1, + a_block_desc, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); - // B matrix blockwise copy - auto b_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - BDataType, - BDataType, - decltype(b_grid_desc_k0_n_k1), - decltype(b_block_desc_k0perblock_nperblock_k1), - BBlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true>( - b_grid_desc_k0_n_k1, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_element_op, - b_block_desc_k0perblock_nperblock_k1, - make_multi_index(0, 0, 0), - ck::tensor_operation::element_wise::PassThrough{}); + return make_tuple(a_block_buf, a_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + auto a_block_buf = make_static_buffer( + a_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto a_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + ABlockTransferSrcScalarPerVector, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_grid_desc, + make_multi_index(0, + m_block_data_idx_on_grid/(MWaves * MPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + 
get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(a_block_buf, a_blockwise_copy); + } + }; + + auto b_block_trait = [&](){ + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a_block_space_size_aligned, + b_block_desc.GetElementSpaceSize()); + + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatB, + FloatB, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + return make_tuple(b_block_buf, b_blockwise_copy); + } + else + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto b_block_buf = make_static_buffer( + b_block_desc.GetElementSpaceSize()); + auto b_blockwise_copy = + ThreadwiseTensorSliceTransfer_v4{}, + Number{}, + Number{}>, + Sequence<0, 1, 2>, + 2, + BBlockTransferSrcScalarPerVector, + 1>( + make_multi_index(0, get_thread_local_1d_id()/32 * 16 + get_thread_local_1d_id() % 16, 0)); + + return make_tuple(b_block_buf, b_blockwise_copy); + } + }; + + auto a_block_buf = a_block_trait()[I0]; + auto a_blockwise_copy = a_block_trait()[I1]; + auto b_block_buf = b_block_trait()[I0]; + auto b_blockwise_copy = b_block_trait()[I1]; /*******************************************************************************/ // GEMM - constexpr auto WmmaK = 16; constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = - BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO{}; + BlockwiseGemmWMMA{}; // 
Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); @@ -702,7 +969,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle // gridwise GEMM pipeline const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); - GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + GridwiseGemmPipe::template Run(a_grid_desc, a_block_desc_k0perblock_mperblock_k1, a_blockwise_copy, a_grid_buf, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 46e0493e503..54f01952179 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -56,6 +56,8 @@ struct GridwiseGemmPipeline_v1<1, true, true> CThreadBuffer& c_thread_buf, index_t num_loop) { + if(get_thread_local_1d_id()<32); + printf("Mat-A Lds Enabled, Mat-B Lds Enabled\n"); // preload data into LDS a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); @@ -304,6 +306,9 @@ struct GridwiseGemmPipeline_v1<1, false, true> }, Number{}); #endif + if(get_thread_local_1d_id()<32); + printf("Mat-A Lds Disabled, Mat-B Lds Enabled\n"); + constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); auto a_block_buf_switch = a_block_buf; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 1b99d535ded..d6cf9f81cf1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -694,7 +694,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma NPerWmma, MRepeat, NRepeat, - KPack>{}; + KPack, + AEnableLds, + BEnableLds>{}; // Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); From bdd0f64e0d6b0b09fb89cc35b70cdaee7d547466 Mon Sep 17 00:00:00 2001 From: 
aska-0096 Date: Mon, 6 Mar 2023 08:15:39 +0000 Subject: [PATCH 048/118] Fix a bug --- .../run_grouped_conv_bwd_weight_example.inc | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 19 +----- .../gpu/grid/gridwise_gemm_wmma.hpp | 67 ++++++++++++------- 3 files changed, 43 insertions(+), 45 deletions(-) diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc index dc45db98655..7891812375f 100644 --- a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc +++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc @@ -26,7 +26,7 @@ bool run_grouped_conv_bwd_weight(const ExecutionConfig& config, { split_k = 1; } - + const auto in_g_n_c_wis_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed< InputLayout>(conv_param); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index fe448e5bcef..fdf0552de8b 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -62,20 +62,6 @@ struct BlockwiseGemmWMMA static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I4); static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I4); - static constexpr auto A_temp0 = Number{}; - static constexpr auto A_temp1 = Number{}; - static constexpr auto A_temp2 = Number{}; - static constexpr auto A_temp3 = Number{}; - static constexpr auto A_temp4 = Number{}; - - // FIX it, workaround - using ABlockDesc_temp = decltype( - make_naive_tensor_descriptor(make_tuple(A_temp0, A_temp1, A_temp2, A_temp3, A_temp4), - make_tuple(A_temp1* A_temp2* A_temp3* A_temp4, - A_temp2* A_temp3* A_temp4, - A_temp3* A_temp4, - A_temp4, - I1))); static constexpr auto wmma_gemm = WmmaGemm{}; @@ -210,9 +196,6 @@ struct BlockwiseGemmWMMA constexpr auto 
c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - // constexpr auto NSubGroup = - // c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; constexpr auto MThreadPerSubGroup - // = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; return make_naive_tensor_descriptor_packed( @@ -302,7 +285,7 @@ struct BlockwiseGemmWMMA // Describe how data allocated in thread copy src buffer // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma - static constexpr ABlockDesc_temp a_block_desc_k0_m0_m1_m2_k1; + static constexpr ABlockDesc a_block_desc_k0_m0_m1_m2_k1; static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1; template diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index a652ce8bcee..2236616c27c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -249,20 +249,45 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + // Err: merge transform cause non-constexpr issue + + // return transform_tensor_descriptor( + // ABlockDesc_{}, + // make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + // make_pass_through_transform(Number{}), + // make_pass_through_transform(I1), + // make_pass_through_transform(I1), + // make_pass_through_transform(Number{})), + // make_tuple(Sequence<0, 3>{}, + // Sequence<1>{}, + // Sequence<2>{}, + // Sequence<4>{}, + // Sequence<5>{}), + // make_tuple( + // Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, + // Sequence<4>{})); + + // Workaround, Freeze transform return transform_tensor_descriptor( ABlockDesc_{}, - make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + 
make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), make_pass_through_transform(Number{}), make_pass_through_transform(I1), make_pass_through_transform(I1), make_pass_through_transform(Number{})), - make_tuple(Sequence<0, 3>{}, + make_tuple(Sequence<3>{}, + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<5>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); } }(); @@ -455,14 +480,12 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma static constexpr auto a_block_space_size_aligned = AEnableLds ? math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(), - max_lds_align) * - sizeof(FloatA) + max_lds_align) : 0; static constexpr auto b_block_space_size_aligned = BEnableLds ? math::integer_least_multiple( GetBBlockDescriptor_K0PerBlock_NPerBlock_K1().GetElementSpaceSize(), - max_lds_align) * - sizeof(FloatB) + max_lds_align) : 0; static constexpr auto a_block_space_offset = 0; @@ -471,13 +494,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // LDS allocation for C shuffle in LDS static constexpr auto c_shuffle_block_space_size = GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() - .GetElementSpaceSize() * - sizeof(FloatCShuffle); + .GetElementSpaceSize(); static constexpr auto c_shuffle_block_space_offset = 0; - static constexpr auto lds_size = math::max( - c_shuffle_block_space_size, (a_block_space_size_aligned + b_block_space_size_aligned)); + static constexpr auto lds_size = + math::max(c_shuffle_block_space_size * sizeof(FloatCShuffle), + a_block_space_size_aligned * sizeof(FloatA) + + b_block_space_size_aligned * sizeof(FloatB)); }; template @@ -528,8 +552,6 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma } }(); - // printf("---------------K = %d\n", K); - constexpr auto a_block_desc = MakeABlockDescriptor(); constexpr auto 
b_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); @@ -540,7 +562,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto K0PerBlock = KPerBlock/ K1; auto a_block_buf = make_dynamic_buffer( static_cast(p_shared), - a_block_desc.GetElementSpaceSize()); + SharedMemTrait::a_block_space_size_aligned); auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1( - static_cast(p_shared) + SharedMemTrait::a_block_space_size_aligned, - b_block_desc.GetElementSpaceSize()); + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + SharedMemTrait::b_block_space_size_aligned); auto b_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1{}([&](auto i) { - printf("tid: %03d, c_thread_buf[%02d] val: %08x\n", get_thread_local_1d_id(), i.value, - *(reinterpret_cast(&(c_thread_buf[i])))); - // c_thread_buf(i) = 32; - }); -#endif constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); @@ -751,7 +765,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); auto c_shuffle_block_buf = make_dynamic_buffer( - static_cast(p_shared), SharedMemTrait::c_shuffle_block_space_size); + static_cast(p_shared) + SharedMemTrait::c_shuffle_block_space_offset, + SharedMemTrait::c_shuffle_block_space_size); constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, From a38ce0244e3ac7dd6329ac5865475cc2351edf31 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 6 Mar 2023 09:40:35 +0000 Subject: [PATCH 049/118] batched gemm ported --- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 8 +- .../CMakeLists.txt | 8 + ...d_contraction_multiple_d_wmma_cshuffle.hpp | 236 ++++++++++-------- .../gpu/device/impl/device_gemm_wmma.hpp | 7 +- 
...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 19 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 94 ++++--- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 4 - .../gpu/grid/gridwise_gemm_wmma.hpp | 4 +- 8 files changed, 204 insertions(+), 176 deletions(-) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index 0b0c130874a..d04d6b12cd6 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -70,12 +70,12 @@ using DeviceOpInstanceKKNN = 256, 128, 128, - 4, + 32, 8, 16, 16, - 4, - 2, + 1, + 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -92,7 +92,7 @@ using DeviceOpInstanceKKNN = true, 1, 1, - S<1, 32, 1, 8>, + S<1, 128, 1, 2>, 8>; using DeviceOpInstance = DeviceOpInstanceKKNN; diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 8d9aaec85a5..b22a376a8cd 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -5,6 +5,9 @@ add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) +if(GPU_TARGETS MATCHES "gfx1100") + add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) +endif() 
add_custom_target(example_gemm_scale_softmax_gemm) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16) @@ -14,3 +17,8 @@ add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_soft add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) + +if(GPU_TARGETS MATCHES "gfx1100") + add_custom_target(example_gemm_scale_softmax_gemm_wmma) + add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16) +endif() \ No newline at end of file diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 1eff05096e9..76684648531 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -76,10 +76,10 @@ template {}; static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + + static constexpr auto AEnableLds = NWaves == 1 ? false : true; + static constexpr auto BEnableLds = MWaves == 1 ? 
false : true; + static constexpr auto matrix_padder = - MatrixPadder{MPerBlock, NPerBlock, K0PerBlock* K1}; + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; // Assume: A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] - static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + static auto MakeAGridDescriptor(const std::vector& a_gs_ms_ks_lengths_vec, const std::vector& a_gs_ms_ks_strides_vec) { assert(a_gs_ms_ks_lengths_vec.size() == NumDimG + NumDimM + NumDimK && @@ -158,36 +167,69 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // lengths for K0, K1, ... const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); - if constexpr(ASpec == TensorSpecialization::Packed) + const auto a_grid_desc_m_k = [&](){ + if constexpr(ASpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor( + make_tuple(M, K), + make_tuple(a_ms_ks_strides[Number{}], + a_ms_ks_strides[Number{}])); + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + else + { + // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] + const auto a_grid_desc_ms_ks = + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); + + // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] 
+ const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( + a_grid_desc_ms_ks, + make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), + make_tuple(mDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + }(); + + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + assert(K % K1 == 0); + + if constexpr(AEnableLds) { - auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); - auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); - const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor( - make_tuple(M, K), - make_tuple(a_ms_ks_strides[Number{}], - a_ms_ks_strides[Number{}])); - return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } else { - // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] - const auto a_grid_desc_ms_ks = - make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); - - // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] 
- const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( - a_grid_desc_ms_ks, - make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), - make_tuple(mDimIds, kDimIds), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + constexpr auto A_KRow = WmmaK / K1; + const auto A_KWmma = K / WmmaK; + + const auto M0 = M / MPerBlock; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(M0 * MRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); } } // Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] - static auto MakeBGridDescriptor_N_K(const std::vector& b_gs_ns_ks_lengths_vec, - const std::vector& b_gs_ns_ks_strides_vec) + static auto MakeBGridDescriptor_K0_N_K1(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) { assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK && b_gs_ns_ks_strides_vec.size() == NumDimG + NumDimN + NumDimK); @@ -214,31 +256,45 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // lengths for N0, N1, ... const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); - if constexpr(BSpec == TensorSpecialization::Packed) - { - auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); - auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); - const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor( - make_tuple(N, K), - make_tuple(b_ns_ks_strides[Number{}], - b_ns_ks_strides[Number{}])); - return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); - } - else - { - // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] 
- const auto b_grid_desc_ns_ks = - make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); - - // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] - const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( - b_grid_desc_ns_ks, - make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), - make_tuple(nDimIds, kDimIds), - make_tuple(Sequence<0>{}, Sequence<1>{})); + const auto b_grid_desc_n_k = [&](){ + if constexpr(BSpec == TensorSpecialization::Packed) + { + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor( + make_tuple(N, K), + make_tuple(b_ns_ks_strides[Number{}], + b_ns_ks_strides[Number{}])); + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + else + { + // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] + const auto b_grid_desc_ns_ks = + make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); + + // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] + const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( + b_grid_desc_ns_ks, + make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), + make_tuple(nDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + }(); - return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); - } + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + assert(K % K1 == 0); + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] 
@@ -393,8 +449,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle } // Gridwise descriptor, mapping to whole given provblem. - using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); - using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); using DsGridDesc_M_N = remove_cvref_t; using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); @@ -449,42 +503,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle EGridDesc_G_M_N e_grid_desc_g_m_n_; }; - // A desc for source in blockwise copy - template - __host__ __device__ static constexpr auto - MakeAGridDescriptor_K0_M_K1(const AGridDesc_M_K& a_grid_desc_m_k) - { - const auto M = a_grid_desc_m_k.GetLength(I0); - const auto K = a_grid_desc_m_k.GetLength(I1); - - const auto AK0 = K / K1; - - return transform_tensor_descriptor( - a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - - // B desc for source in blockwise copy - template - __host__ __device__ static constexpr auto - MakeBGridDescriptor_K0_N_K1(const BGridDesc_N_K& b_grid_desc_n_k) - { - const auto N = b_grid_desc_n_k.GetLength(I0); - const auto K = b_grid_desc_n_k.GetLength(I1); - - const auto BK0 = K / K1; - - return transform_tensor_descriptor( - b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - - using AGridDesc_K0_M_K1 = decltype(DeviceOp::MakeAGridDescriptor_K0_M_K1(AGridDesc_M_K{})); - using BGridDesc_K0_N_K1 = decltype(DeviceOp::MakeBGridDescriptor_K0_N_K1(BGridDesc_N_K{})); + using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({},{})); + using BGridDesc_K0_N_K1 = decltype(DeviceOp::MakeBGridDescriptor_K0_N_K1({},{})); // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< @@ 
-496,7 +516,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle DsDataType, EDataType, // InMemory Data Descriptor - AGridDesc_K0_M_K1, + AGridDesc, BGridDesc_K0_N_K1, DsGridDesc_M_N, EGridDesc_M_N, @@ -508,9 +528,9 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // Tiling Family MPerBlock, NPerBlock, - K0PerBlock, - MPerWMMA, - NPerWMMA, + KPerBlock, + MPerWmma, + NPerWmma, K1, MRepeat, NRepeat, @@ -523,6 +543,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, // AThreadTransferSrcResetCoordinateAfterRun, + AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, @@ -531,6 +552,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, // BThreadTransferSrcResetCoordinateAfterRun, + BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, @@ -564,16 +586,14 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle p_b_grid_{static_cast(p_b_grid)}, p_ds_grid_{}, p_e_grid_{static_cast(p_e_grid)}, - a_grid_desc_m_k_{}, - b_grid_desc_n_k_{}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, ds_grid_desc_m_n_{}, e_grid_desc_m_n_{}, ds_grid_desc_g_m_n_{ DeviceOp::MakeDsGridDescriptor_G_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides)}, e_grid_desc_g_m_n_{ DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)}, - a_grid_desc_k0_m_k1_{}, - b_grid_desc_k0_n_k1_{}, ds_grid_desc_mblock_mperblock_nblock_nperblock{}, e_grid_desc_mblock_mperblock_nblock_nperblock{}, block_2_ctile_map_{}, @@ -600,10 +620,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle p_ds_grid_(i) = static_cast(p_ds_grid[i]); }); - a_grid_desc_m_k_ = - DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); - b_grid_desc_n_k_ = - DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, 
b_gs_ns_ks_strides); + a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); ds_grid_desc_m_n_ = DeviceOp::MakeDsGridDescriptor_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides); @@ -611,8 +629,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle e_grid_desc_m_n_ = DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); - a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor_K0_M_K1(a_grid_desc_m_k_); - b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_grid_desc_n_k_); block_2_ctile_map_ = GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01); @@ -644,15 +660,13 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle EDataType* p_e_grid_; // Tensor Descriptors - AGridDesc_M_K a_grid_desc_m_k_; - BGridDesc_N_K b_grid_desc_n_k_; + AGridDesc a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; DsGridDesc_M_N ds_grid_desc_m_n_; EGridDesc_M_N e_grid_desc_m_n_; DsGridDesc_G_M_N ds_grid_desc_g_m_n_; EGridDesc_G_M_N e_grid_desc_g_m_n_; - AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; - BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock; @@ -712,7 +726,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle BDataType, typename GridwiseOp::DsGridPointer, EDataType, - DeviceOp::AGridDesc_K0_M_K1, + DeviceOp::AGridDesc, DeviceOp::BGridDesc_K0_N_K1, typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, @@ -975,10 +989,10 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle << BlockSize << ", " << MPerBlock << ", " << NPerBlock << ", " - << K0PerBlock << ", " + << KPerBlock << ", " << K1 << ", " - << MPerWMMA << ", " - << NPerWMMA << ", " + << MPerWmma << ", " + << NPerWmma << ", " 
<< MRepeat << ", " << NRepeat << ">" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 408222392f6..e1c5396593a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -89,8 +89,9 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{MPerBlock, NPerBlock, KPerBlock}; @@ -124,7 +125,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{}, Sequence<0>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 75631e27bb1..350545a8691 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -296,20 +296,27 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + // Workaround, Freeze transform return transform_tensor_descriptor( ABlockDesc_{}, - make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), make_pass_through_transform(Number{}), make_pass_through_transform(I1), make_pass_through_transform(I1), make_pass_through_transform(Number{})), - make_tuple(Sequence<0, 3>{}, + make_tuple(Sequence<3>{}, + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<5>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); } }(); @@ -782,6 +789,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle 
MRepeat, LRepeat, KPack, + AEnableLds, + B0EnableLds, true>{}; // C' = B' x A' @@ -968,6 +977,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle MRepeat, NRepeat, KPack, + false, + B1EnableLds, true>{make_tuple(0, 0, 0, 0, 0)}; auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 555f0b61edb..50b1aadfdc2 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -69,7 +69,7 @@ __global__ void const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); - __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size]; DsPointer p_ds_grid_grp; @@ -148,7 +148,7 @@ __global__ void const Block2CTileMap block_2_etile_map) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) - __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size]; const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); @@ -237,7 +237,7 @@ __global__ void const Block2CTileMap block_2_ctile_map) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) - __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size]; GridwiseOp::template Run(p_a_grid, p_b_grid, @@ -451,20 +451,27 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + // Workaround, Freeze transform return transform_tensor_descriptor( ABlockDesc_{}, - make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + 
make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), make_pass_through_transform(Number{}), make_pass_through_transform(I1), make_pass_through_transform(I1), make_pass_through_transform(Number{})), - make_tuple(Sequence<0, 3>{}, + make_tuple(Sequence<3>{}, + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<5>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); } }(); @@ -540,19 +547,6 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle Number{}); } - __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() - { - // LDS allocation for A and B: be careful of alignment - const index_t gemm_bytes_end = - SharedMemTrait::a_block_space_size_aligned * sizeof(ADataType)+ - SharedMemTrait::b_block_space_size_aligned * sizeof(BDataType); - - const index_t c_block_bytes_end = - SharedMemTrait::c_shuffle_block_space_size * sizeof(CShuffleDataType); - - return math::max(gemm_bytes_end, c_block_bytes_end); - } - // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template __host__ __device__ static constexpr bool @@ -650,7 +644,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / (K0PerBlock * K1); + const index_t num_loop = K / KPerBlock; return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -704,11 +698,13 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle static constexpr auto a_block_space_size_aligned = AEnableLds ? math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(), - max_lds_align): 0; + max_lds_align) + : 0; static constexpr auto b_block_space_size_aligned = BEnableLds ? 
math::integer_least_multiple( GetBBlockDescriptor_K0PerBlock_NPerBlock_K1().GetElementSpaceSize(), - max_lds_align): 0; + max_lds_align) + : 0; static constexpr auto a_block_space_offset = 0; static constexpr auto b_block_space_offset = a_block_space_size_aligned; @@ -719,6 +715,11 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle .GetElementSpaceSize(); static constexpr auto c_shuffle_block_space_offset = 0; + + static constexpr auto lds_size = + math::max(c_shuffle_block_space_size * sizeof(CShuffleDataType), + a_block_space_size_aligned * sizeof(ADataType) + + b_block_space_size_aligned * sizeof(BDataType)); }; using DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t( - static_cast(p_shared), + static_cast(p_shared), a_block_desc.GetElementSpaceSize()); auto a_blockwise_copy = @@ -807,8 +808,8 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle /* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatA, -/* typename DstData, */ FloatA, +/* typename SrcData, */ ADataType, +/* typename DstData, */ ADataType, /* typename SrcDesc, */ decltype(a_grid_desc), /* typename DstDesc, */ decltype(a_block_desc), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, @@ -835,13 +836,13 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle // Thread-wise copy // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - auto a_block_buf = make_static_buffer( + auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); // Limitation: NumDim of Src and Dst descriptor should be identical auto a_blockwise_copy = - ThreadwiseTensorSliceTransfer_v2{}, @@ -872,7 +873,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle { constexpr auto K0PerBlock = KPerBlock/ K1; auto 
b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::a_block_space_size_aligned, + static_cast(p_shared) + SharedMemTrait::a_block_space_size_aligned, b_block_desc.GetElementSpaceSize()); auto b_blockwise_copy = @@ -883,8 +884,8 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle Sequence, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, - FloatB, - FloatB, + BDataType, + BDataType, decltype(b_grid_desc_k0_n_k1), decltype(b_block_desc), BBlockTransferSrcAccessOrder, @@ -909,11 +910,11 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle else { constexpr auto K0PerBlock = KPerBlock/ K1; - auto b_block_buf = make_static_buffer( + auto b_block_buf = make_static_buffer( b_block_desc.GetElementSpaceSize()); auto b_blockwise_copy = - ThreadwiseTensorSliceTransfer_v4{}, @@ -952,38 +953,35 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle NPerWmma, MRepeat, NRepeat, - KPack>{}; + KPack, + AEnableLds, + BEnableLds>{}; // Prepare Register for C matrix auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); -/*******************************************************************************/ - constexpr auto a_block_space_size_aligned = math::integer_least_multiple(a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize(), max_lds_align); - // LDS allocation for A and B: be careful of alignment - auto a_block_buf = make_dynamic_buffer(static_cast(p_shared), a_block_desc_k0perblock_mperblock_k1.GetElementSpaceSize()); - auto b_block_buf = make_dynamic_buffer(static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0perblock_nperblock_k1.GetElementSpaceSize()); - +/*******************************************************************************/ // Shift Per SUB_K - constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); - constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto a_block_slice_copy_step = 
MakeABlockSliceCopyStep(); + constexpr auto b_block_slice_copy_step = MakeBBlockSliceCopyStep(); // gridwise GEMM pipeline - const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); GridwiseGemmPipe::template Run(a_grid_desc, - a_block_desc_k0perblock_mperblock_k1, + a_block_desc, a_blockwise_copy, a_grid_buf, a_block_buf, a_block_slice_copy_step, b_grid_desc_k0_n_k1, - b_block_desc_k0perblock_nperblock_k1, + b_block_desc, b_blockwise_copy, b_grid_buf, b_block_buf, b_block_slice_copy_step, blockwise_gemm, c_thread_buf, - K0BlockMainLoop); + KBlockMainLoop); /*******************************************************************************/ // write out to C, implement shuffle { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 54f01952179..bbdb264d739 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -56,8 +56,6 @@ struct GridwiseGemmPipeline_v1<1, true, true> CThreadBuffer& c_thread_buf, index_t num_loop) { - if(get_thread_local_1d_id()<32); - printf("Mat-A Lds Enabled, Mat-B Lds Enabled\n"); // preload data into LDS a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); @@ -306,8 +304,6 @@ struct GridwiseGemmPipeline_v1<1, false, true> }, Number{}); #endif - if(get_thread_local_1d_id()<32); - printf("Mat-A Lds Disabled, Mat-B Lds Enabled\n"); constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); auto a_block_buf_switch = a_block_buf; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 2d2a1581100..c65582f4eea 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -731,7 +731,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma constexpr auto b_block_slice_copy_step = MakeBBlockSliceCopyStep(); // gridwise GEMM pipeline - const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); + const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); GridwiseGemmPipe::template Run(a_grid_desc, a_block_desc, a_blockwise_copy, @@ -746,7 +746,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma b_block_slice_copy_step, blockwise_gemm, c_thread_buf, - K0BlockMainLoop); + KBlockMainLoop); /*******************************************************************************/ // write out to C, implement shuffle { From f00dab9fc6463218bdc9cafd8e636ba8c941ff7b Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 6 Mar 2023 10:54:12 +0000 Subject: [PATCH 050/118] conv A-skip lds ported --- .../gemm_bilinear_wmma_fp16.cpp | 10 +- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 10 +- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 102 +++++++---- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 158 ++++++++++-------- 4 files changed, 161 insertions(+), 119 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index ff99bf46411..72bcea7fa03 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -82,13 +82,13 @@ using DeviceOpInstance = GemmSpec, 256, 128, - 256, - 8, + 128, + 32, 8, 16, 16, - 4, - 4, + 1, + 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -105,7 +105,7 @@ using DeviceOpInstance = true, 1, 1, - S<1, 32, 1, 8>, + S<1, 128, 1, 2>, 8>; int main(int argc, char* argv[]) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index d59d1bc7025..269ebc074f5 100644 --- 
a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -53,13 +53,13 @@ using DeviceConvFwdInstance = GemmSpec, // GemmSpecialization 256, // BlockSize 128, // MPerBlock - 256, // NPerBlock - 4, // K0PerBlock + 128, // NPerBlock + 32, // KPerBlock 8, // K1 16, // MPerWMMA 16, // NPerWMMA - 4, // MRepeat - 4, // NRepeat + 1, // MRepeat + 8, // NRepeat S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder @@ -76,7 +76,7 @@ using DeviceConvFwdInstance = true, // BBlockLdsExtraN 1, 1, - S<1, 32, 1, 8>, + S<1, 128, 1, 2>, 8>; template diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 66c4de7f05c..3418e40dec7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -16,6 +16,8 @@ #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" + namespace ck { namespace tensor_operation { @@ -38,10 +40,10 @@ template {}; static constexpr auto I1 = Number<1>{}; static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; - static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) - { - assert(K % K1 == 0); + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto 
NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; - const index_t K0 = K / K1; + static constexpr auto AEnableLds = NWaves == 1 ? false : true; + static constexpr auto BEnableLds = MWaves == 1 ? false : true; + // Force enable LDS if uncommented following + // AEnableLds = true; + // BEnableLds = true; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + // Describe how data read from Global memory + static auto MakeAGridDescriptor(index_t MRaw, index_t KRaw, index_t StrideA) + { const auto a_grid_desc_m_k = [&]() { if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + const auto a_grid_desc_mraw_kraw = + make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), make_tuple(StrideA, I1)); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); } #ifdef ENABLE_COLMAJOR else if constexpr(is_same::value) @@ -105,25 +123,35 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{}, Sequence<0>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } else { + constexpr auto A_KRow = WmmaK / K1; + const auto A_KWmma = K / WmmaK; + + const auto M0 = M / MPerBlock; + return transform_tensor_descriptor( a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_pass_through_transform(M)), + make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(M0 * MRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); } } @@ -216,7 +244,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD; using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); @@ -231,7 +259,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD(p_b_grid)}, p_ds_grid_{}, p_e_grid_{static_cast(p_e_grid)}, - 
a_grid_desc_k0_m_k1_{}, + a_grid_desc{}, b_grid_desc_k0_n_k1_{}, ds_grid_desc_m_n_{}, e_grid_desc_m_n_{}, @@ -311,7 +341,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{}([&](auto i) { using DLayout = remove_cvref_t>; @@ -328,7 +358,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD, + remove_reference_t, remove_reference_t, remove_reference_t< typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, @@ -444,7 +474,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD, + remove_reference_t, remove_reference_t, remove_reference_t< typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, @@ -483,7 +513,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD" diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index e245902b6cc..f45a15ba294 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -112,10 +112,10 @@ template {}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr index_t KPerBlock = K0PerBlock * K1; + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + // K1 = Max Vector Access Pixels + static constexpr auto K1Number = Number{}; + + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + + static constexpr auto 
AEnableLds = NWaves == 1 ? false : true; + static constexpr auto BEnableLds = MWaves == 1 ? false : true; + + // Force enable LDS if uncommented following + // AEnableLds = true; + // BEnableLds = true; static constexpr auto conv_to_gemm_transformer = TransformConvFwdToGemm{}; @@ -171,7 +185,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle template static auto - MakeAGridDescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + MakeAGridDescriptor(const std::array& a_g_n_c_wis_lengths, const std::array& a_g_n_c_wis_strides, const std::array& b_g_k_c_xs_lengths, const std::array& b_g_k_c_xs_strides, @@ -196,13 +210,42 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle const auto in_gemmm_gemmk_desc = matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + const auto M = in_gemmm_gemmk_desc.GetLength(I0); + const auto K = in_gemmm_gemmk_desc.GetLength(I1); + assert(K % K1 == 0); - return in_gemmm_gemmk_desc; + if constexpr(AEnableLds) + { + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + in_gemmm_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto A_KRow = WmmaK / K1; + const auto A_KWmma = K / WmmaK; + + const auto M0 = M / MPerBlock; + + return transform_tensor_descriptor( + in_gemmm_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(M0 * MRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } } template static auto - MakeBGridDescriptor_N_K(const std::array& b_g_k_c_xs_lengths, + MakeBGridDescriptor_BK0_N_BK1(const std::array& b_g_k_c_xs_lengths, const std::array& b_g_k_c_xs_strides) { const auto wei_gemmnraw_gemmkraw_desc = @@ -211,8 +254,18 @@ struct 
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle const auto wei_gemmn_gemmk_desc = matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + const auto N = wei_gemmn_gemmk_desc.GetLength(I0); + const auto K = wei_gemmn_gemmk_desc.GetLength(I1); + + const auto BK1 = K1; + const auto BK0 = K / BK1; - return wei_gemmn_gemmk_desc; + return transform_tensor_descriptor(wei_gemmn_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } template @@ -245,50 +298,11 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } // desc for problem definition - using AGridDesc_M_K = remove_cvref_t({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; - using BGridDesc_N_K = remove_cvref_t({}, {}))>; using DsGridDesc_M_N = remove_cvref_t; using EGridDesc_M_N = remove_cvref_t({}, {}))>; - // A desc for source in blockwise copy - template - __host__ __device__ static constexpr auto - MakeAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k) - { - const auto M = a_grid_desc_m_k.GetLength(I0); - const auto K = a_grid_desc_m_k.GetLength(I1); - - const auto AK1 = K1; - const auto AK0 = K / AK1; - - return transform_tensor_descriptor(a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), - make_pass_through_transform(M)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - - // B desc for source in blockwise copy - template - __host__ __device__ static constexpr auto - MakeBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k) - { - const auto N = b_grid_desc_n_k.GetLength(I0); - const auto K = b_grid_desc_n_k.GetLength(I1); - - const auto BK1 = K1; - const auto BK0 = K / BK1; - - return transform_tensor_descriptor(b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - 
make_tuple(Sequence<0, 2>{}, Sequence<1>{})); - } - - using AGridDesc_AK0_M_AK1 = decltype(DeviceOp::MakeAGridDescriptor_AK0_M_AK1(AGridDesc_M_K{})); - using BGridDesc_BK0_N_BK1 = decltype(DeviceOp::MakeBGridDescriptor_BK0_N_BK1(BGridDesc_N_K{})); + using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {})); + using BGridDesc_BK0_N_BK1 = decltype(DeviceOp::MakeBGridDescriptor_BK0_N_BK1({}, {})); // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< @@ -300,7 +314,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle DsDataType, EDataType, // InMemory Data Descriptor - AGridDesc_AK0_M_AK1, + AGridDesc, BGridDesc_BK0_N_BK1, DsGridDesc_M_N, EGridDesc_M_N, @@ -312,9 +326,9 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle // Tiling Family MPerBlock, NPerBlock, - K0PerBlock, - MPerWMMA, - NPerWMMA, + KPerBlock, + MPerWmma, + NPerWmma, K1, MRepeat, NRepeat, @@ -327,6 +341,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false, + AEnableLds, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, @@ -335,6 +350,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, false, + BEnableLds, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, @@ -375,7 +391,10 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle p_ds_grid_{}, p_e_grid_{static_cast(p_e)}, num_group_{a_g_n_c_wis_lengths[0]}, - a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_g_n_c_wis_lengths, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides)}, + a_grid_desc{DeviceOp::MakeAGridDescriptor(a_g_n_c_wis_lengths, a_g_n_c_wis_strides, b_g_k_c_xs_lengths, b_g_k_c_xs_strides, @@ -385,13 +404,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle 
conv_filter_dilations, input_left_pads, input_right_pads)}, - b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_g_k_c_xs_lengths, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_g_k_c_xs_lengths, b_g_k_c_xs_strides)}, - ds_grid_desc_m_n_{}, - e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, - e_g_n_k_wos_strides)}, - a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, - b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, e_grid_desc_mblock_mperblock_nblock_nperblock_{}, block_2_etile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01)}, @@ -443,8 +457,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle void Print() const { - std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; - std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + std::cout << "A[M, K]: " << a_grid_desc << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_bk0_n_bk1_ << std::endl; static_for<0, NumDTensor, 1>{}( [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; @@ -459,13 +473,11 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle // tensor descriptors for problem definiton index_t num_group_; - AGridDesc_M_K a_grid_desc_m_k_; - BGridDesc_N_K b_grid_desc_n_k_; DsGridDesc_M_N ds_grid_desc_m_n_; EGridDesc_M_N e_grid_desc_m_n_; // tensor descriptors for block/thread-wise copy - AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + AGridDesc a_grid_desc; BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock_; @@ -514,7 +526,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.num_group_; const auto K = - arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * 
arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I2); auto launch_kernel = [&](auto has_main_k_block_loop) { constexpr bool has_main_loop = has_main_k_block_loop.value; @@ -528,7 +540,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, - DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::AGridDesc, DeviceOp::BGridDesc_BK0_N_BK1, typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, @@ -549,7 +561,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle arg.b_element_op_, arg.cde_element_op_, arg.a_g_n_c_wis_lengths_[0], // Group count - arg.a_grid_desc_ak0_m_ak1_, + arg.a_grid_desc, arg.b_grid_desc_bk0_n_bk1_, arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, @@ -719,7 +731,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } // check Gridwise GEMM - return GridwiseOp::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + return GridwiseOp::CheckValidity(arg.a_grid_desc, arg.b_grid_desc_bk0_n_bk1_, arg.ds_grid_desc_m_n_, arg.e_grid_desc_m_n_, From 04c6a9787e6ee62b796e210a9a1a152987467321 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 6 Mar 2023 11:20:47 +0000 Subject: [PATCH 051/118] Skip B-Lds real gemm --- example/01_gemm/gemm_wmma_fp16.cpp | 6 +- .../gpu/device/impl/device_gemm_wmma.hpp | 70 +++-- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 96 ++++++- .../gpu/grid/gridwise_gemm_wmma.hpp | 252 +++++++++++------- 4 files changed, 301 insertions(+), 123 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 657151ee863..a068ea20ea1 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -42,8 +42,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, // K1 16, // MPerWmma 16, // NPerWmma 
- 2, // M Repeat - 4, // N-Repeat + 8, // M Repeat + 1, // N-Repeat S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -60,7 +60,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle true, 1, // C shuffle (M Repeat) Per store 1, // C shuffle (N Repeat) Per store - S<1, 64, 1, 4>, + S<1, 16, 1, 16>, 8>; // clang-format on diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index e1c5396593a..6525e4b8b61 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -106,12 +106,13 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm::value) { - return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + const auto a_grid_desc_mraw_kraw = + make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), make_tuple(I1, StrideA)); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); } -#endif }(); const auto M = a_grid_desc_m_k.GetLength(I0); @@ -146,34 +147,57 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm::value) { - return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(I1, StrideB)); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } else if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(StrideB, I1)); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } }(); - const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); - const auto N = b_grid_desc_n_k.GetLength(I0); const auto K = b_grid_desc_n_k.GetLength(I1); assert(K % K1 == 0); - const index_t K0 = K / K1; - - return transform_tensor_descriptor( - b_grid_desc_n_k, - 
make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + if constexpr(BEnableLds) + { + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto B_KRow = WmmaK / K1; + const auto B_KWmma = K / WmmaK; + + const auto N0 = N / NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(N0 * NRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } } static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) @@ -196,7 +220,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, - remove_reference_t, + remove_reference_t, remove_reference_t< typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, AElementwiseOperation, @@ -404,7 +428,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, - remove_reference_t, + remove_reference_t, remove_reference_t< typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, AElementwiseOperation, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index bbdb264d739..86195387cb4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -309,9 +309,9 @@ struct GridwiseGemmPipeline_v1<1, false, true> auto a_block_buf_switch = a_block_buf; // preload data into LDS + b_blockwise_copy.RunRead(b_grid_desc, 
b_grid_buf); a_blockwise_copy.Run( a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf); - b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); @@ -364,6 +364,100 @@ struct GridwiseGemmPipeline_v1<1, false, true> template <> struct GridwiseGemmPipeline_v1<1, true, false> { + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + auto b_block_buf_switch = b_block_buf; + + // preload data into LDS + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.Run( + b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + b_blockwise_copy.Run( + b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, 
b_block_buf_switch); + + block_sync_lds(); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + + b_block_buf = b_block_buf_switch; + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + } + } }; template <> diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index c65582f4eea..091e25d2399 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -18,11 +18,11 @@ namespace ck { template N->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + constexpr auto max_lds_align = K1; + + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + // KWmma->NRepeat->NWave->NRow->NPerWmma->K1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, I1, K1), + make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + } + }(); + + return b_block_desc; + } + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() { constexpr auto a_block_copy_step = [&]() { @@ -292,43 +326,56 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma return a_wave_desc; } - template + template __host__ __device__ static constexpr auto - MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + MakeBWaveDescriptor(const 
BBlockDesc_&) { - constexpr auto B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); - constexpr auto B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); - - return transform_tensor_descriptor( - BBlockDesc_BK0_N_BK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() - { - constexpr auto max_lds_align = K1; - constexpr auto K0PerBlock = KPerBlock / K1; - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { - if constexpr(BBlockLdsExtraN) + constexpr auto b_wave_desc = [&]() { + if constexpr(BEnableLds) { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); + // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 + constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + BBlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } else { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); + // KWmma_NRepeat_NWave_KRow_NPerWmma_K1 -> K0_NRepeat_Nwaves_NPerWmma_K1 + constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I5); + + // Workaround, Freeze transform + return transform_tensor_descriptor( + BBlockDesc_{}, + make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), + 
make_pass_through_transform(Number{}), + make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<3>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); } }(); - return b_block_desc_k0perblock_nperblock_k1; + return b_wave_desc; } __host__ __device__ static constexpr auto @@ -349,7 +396,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma template __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, - const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const BGridDesc& b_grid_desc, const CGridDesc_M_N& c_grid_desc_m_n, const Block2CTileMap& block_2_ctile_map) { @@ -378,17 +425,17 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma const auto GetBProblemsizeNK = [&]() { if constexpr(BEnableLds) { - return make_tuple(b_grid_desc_k0_n_k1.GetLength(I1), - b_grid_desc_k0_n_k1.GetLength(I0) * - b_grid_desc_k0_n_k1.GetLength(I2)); + return make_tuple(b_grid_desc.GetLength(I1), + b_grid_desc.GetLength(I0) * + b_grid_desc.GetLength(I2)); } else { return make_tuple( - b_grid_desc_k0_n_k1.GetLength(I1) * b_grid_desc_k0_n_k1.GetLength(I2) * - b_grid_desc_k0_n_k1.GetLength(I4), - b_grid_desc_k0_n_k1.GetLength(I0) * b_grid_desc_k0_n_k1.GetLength(I3) * - b_grid_desc_k0_n_k1.GetLength(I5)); + b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * + b_grid_desc.GetLength(I4), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * + b_grid_desc.GetLength(I5)); } }; @@ -484,9 +531,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma max_lds_align) : 0; static constexpr auto b_block_space_size_aligned = - BEnableLds ? math::integer_least_multiple( - GetBBlockDescriptor_K0PerBlock_NPerBlock_K1().GetElementSpaceSize(), - max_lds_align) + BEnableLds ? 
math::integer_least_multiple(MakeBBlockDescriptor().GetElementSpaceSize(), + max_lds_align) : 0; static constexpr auto a_block_space_offset = 0; @@ -500,18 +546,18 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma static constexpr auto c_shuffle_block_space_offset = 0; static constexpr auto lds_size = - math::max(c_shuffle_block_space_size * sizeof(FloatCShuffle), - a_block_space_size_aligned * sizeof(FloatA) + - b_block_space_size_aligned * sizeof(FloatB)); + math::max(c_shuffle_block_space_size * sizeof(CShuffleDataType), + a_block_space_size_aligned * sizeof(ADataType) + + b_block_space_size_aligned * sizeof(BDataType)); }; template - __device__ static void Run(const FloatA* __restrict__ p_a_grid, - const FloatB* __restrict__ p_b_grid, - FloatC* __restrict__ p_c_grid, + __device__ static void Run(const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + CDataType* __restrict__ p_c_grid, void* __restrict__ p_shared, const AGridDesc& a_grid_desc, - const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const BGridDesc& b_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation& a_element_op, @@ -525,7 +571,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc.GetElementSpaceSize()); const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + p_b_grid, b_grid_desc.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); @@ -554,7 +600,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma }(); constexpr auto a_block_desc = MakeABlockDescriptor(); - constexpr auto b_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + constexpr auto b_block_desc = MakeBBlockDescriptor(); auto a_block_trait = [&](){ // A matrix blockwise copy @@ -562,7 +608,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma { 
constexpr auto K0PerBlock = KPerBlock/ K1; auto a_block_buf = make_dynamic_buffer( - static_cast(p_shared), + static_cast(p_shared), SharedMemTrait::a_block_space_size_aligned); auto a_blockwise_copy = @@ -573,8 +619,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma /* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatA, -/* typename DstData, */ FloatA, +/* typename SrcData, */ ADataType, +/* typename DstData, */ ADataType, /* typename SrcDesc, */ decltype(a_grid_desc), /* typename DstDesc, */ decltype(a_block_desc), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, @@ -601,13 +647,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // Thread-wise copy // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - auto a_block_buf = make_static_buffer( + auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); // Limitation: NumDim of Src and Dst descriptor should be identical auto a_blockwise_copy = - ThreadwiseTensorSliceTransfer_v2{}, @@ -638,7 +684,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma { constexpr auto K0PerBlock = KPerBlock/ K1; auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, SharedMemTrait::b_block_space_size_aligned); auto b_blockwise_copy = @@ -649,9 +695,9 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma Sequence, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, - FloatB, - FloatB, - decltype(b_grid_desc_k0_n_k1), + BDataType, + BDataType, + decltype(b_grid_desc), decltype(b_block_desc), BBlockTransferSrcAccessOrder, Sequence<0, 1, 2>, @@ -663,7 +709,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma 1, BThreadTransferSrcResetCoordinateAfterRun, 
true>( - b_grid_desc_k0_n_k1, + b_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, b_block_desc, @@ -674,23 +720,37 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma } else { - constexpr auto K0PerBlock = KPerBlock/ K1; - auto b_block_buf = make_static_buffer( + // Thread-wise copy + // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + auto b_block_buf = make_static_buffer( b_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical auto b_blockwise_copy = - ThreadwiseTensorSliceTransfer_v4{}, + Sequence{}, Number{}, + I1, + I1, + I1, Number{}>, - Sequence<0, 1, 2>, - 2, + Sequence<0, 1, 2, 3, 4, 5>, + 5, BBlockTransferSrcScalarPerVector, - 1>( - make_multi_index(0, get_thread_local_1d_id()/32 * 16 + get_thread_local_1d_id() % 16, 0)); - + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc, + make_multi_index(0, + n_block_data_idx_on_grid/(NWaves * NPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + return make_tuple(b_block_buf, b_blockwise_copy); } }; @@ -706,11 +766,11 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma auto blockwise_gemm = BlockwiseGemmWMMA( - static_cast(p_shared) + SharedMemTrait::c_shuffle_block_space_offset, + static_cast(p_shared) + SharedMemTrait::c_shuffle_block_space_offset, SharedMemTrait::c_shuffle_block_space_size); constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( @@ -815,8 +875,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // shuffle: threadwise copy C from VGPR to LDS auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3, // BlockSliceLengths, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, - FloatCShuffle, // typename SrcData, - FloatC, // 
typename DstData, + CShuffleDataType, // typename SrcData, + CDataType, // typename DstData, decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), Sequence<0, 1, 2, 3>, // typename DimAccessOrder, From 060c4f3a82b3c7938dceb50e63b35feb875e9bc0 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 6 Mar 2023 11:32:08 +0000 Subject: [PATCH 052/118] Skip B Lds Gemm + MulD --- .../gemm_bilinear_wmma_fp16.cpp | 4 +- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 111 +++++----- .../gpu/device/impl/device_gemm_wmma.hpp | 21 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 194 ++++++++++++------ 4 files changed, 191 insertions(+), 139 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 72bcea7fa03..12b868c7d12 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -87,8 +87,8 @@ using DeviceOpInstance = 8, 16, 16, - 1, 8, + 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -105,7 +105,7 @@ using DeviceOpInstance = true, 1, 1, - S<1, 128, 1, 2>, + S<1, 16, 1, 16>, 8>; int main(int argc, char* argv[]) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 3418e40dec7..1b9a36d26d1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -105,6 +105,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{MPerBlock, NPerBlock, KPerBlock}; // Describe how data read from Global memory + // Describe how data read from Global memory static auto MakeAGridDescriptor(index_t MRaw, index_t KRaw, index_t StrideA) { const auto a_grid_desc_m_k = [&]() { @@ -115,12 +116,13 @@ 
struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD::value) { - return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + const auto a_grid_desc_mraw_kraw = + make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), make_tuple(I1, StrideA)); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); } -#endif }(); const auto M = a_grid_desc_m_k.GetLength(I0); @@ -155,42 +157,56 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD::value) { - return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } else if constexpr(is_same::value) { - return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } }(); - if constexpr(GemmSpec == GemmSpecialization::MNPadding) + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + assert(K % K1 == 0); + + if constexpr(BEnableLds) { - const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + const index_t K0 = K / K1; return transform_tensor_descriptor( - b_grid_desc_k_n, + b_grid_desc_n_k, make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_right_pad_transform(N, PadN)), - make_tuple(Sequence<0>{}, Sequence<1>{}), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } else { + constexpr auto B_KRow = WmmaK / K1; + const auto B_KWmma = K / WmmaK; + + const auto N0 = N / NPerBlock; + return transform_tensor_descriptor( - b_grid_desc_k_n, - make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - 
make_pass_through_transform(N)), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(N0 * NRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); } } @@ -245,7 +261,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD; using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); @@ -260,7 +276,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD(p_e_grid)}, a_grid_desc{}, - b_grid_desc_k0_n_k1_{}, + b_grid_desc{}, ds_grid_desc_m_n_{}, e_grid_desc_m_n_{}, ds_grid_desc_mblock_mperblock_nblock_nperblock{}, @@ -342,7 +358,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{}([&](auto i) { using DLayout = remove_cvref_t>; using DDataType = remove_cvref_t>; @@ -359,7 +375,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD, - remove_reference_t, + remove_reference_t, remove_reference_t< typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, remove_reference_t< @@ -475,7 +484,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD, - remove_reference_t, + remove_reference_t, remove_reference_t< typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, remove_reference_t< @@ -514,7 +523,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleDN->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + constexpr auto max_lds_align = K1; + + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = 
KPerBlock / WmmaK; + // KWmma->NRepeat->NWave->NRow->NPerWmma->K1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, I1, K1), + make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + } + }(); + + return b_block_desc; + } + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() { constexpr auto a_block_copy_step = [&]() { @@ -478,44 +512,56 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle return a_wave_desc; } - template + template __host__ __device__ static constexpr auto - MakeBBlockDescriptor_K0_N0_N1_N2_K1(const BBlockDesc_BK0_N_BK1&) + MakeBWaveDescriptor(const BBlockDesc_&) { - constexpr auto B_K0 = BBlockDesc_BK0_N_BK1{}.GetLength(I0); - constexpr auto B_K1 = BBlockDesc_BK0_N_BK1{}.GetLength(I2); - - return transform_tensor_descriptor( - BBlockDesc_BK0_N_BK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() - { - constexpr auto max_lds_align = K1; - constexpr auto K0PerBlock = KPerBlock / K1; - - // B matrix in LDS memory, dst of blockwise copy - constexpr auto b_block_desc_k0perblock_nperblock_k1 = [&]() { - if constexpr(BBlockLdsExtraN) + constexpr auto b_wave_desc = [&]() { + if constexpr(BEnableLds) { - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), - make_tuple(Number{} * K1, K1, I1)); + // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 + constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + BBlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + 
make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } else { - return make_naive_tensor_descriptor_aligned( - make_tuple(Number{}, Number{}, K1), max_lds_align); + // KWmma_NRepeat_NWave_KRow_NPerWmma_K1 -> K0_NRepeat_Nwaves_NPerWmma_K1 + constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I5); + + // Workaround, Freeze transform + return transform_tensor_descriptor( + BBlockDesc_{}, + make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<3>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); } }(); - return b_block_desc_k0perblock_nperblock_k1; + return b_wave_desc; } __host__ __device__ static constexpr auto @@ -551,7 +597,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle template __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, - const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const BGridDesc& b_grid_desc, const DsGridDesc_M_N& ds_grid_desc_m_n, const EGridDesc_M_N& e_grid_desc_m_n, const Block2CTileMap& block_2_ctile_map) @@ -581,17 +627,17 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle const auto GetBProblemsizeNK = [&]() { if constexpr(BEnableLds) { - return make_tuple(b_grid_desc_k0_n_k1.GetLength(I1), - b_grid_desc_k0_n_k1.GetLength(I0) * - b_grid_desc_k0_n_k1.GetLength(I2)); + return make_tuple(b_grid_desc.GetLength(I1), + b_grid_desc.GetLength(I0) * + b_grid_desc.GetLength(I2)); } else { return make_tuple( - b_grid_desc_k0_n_k1.GetLength(I1) * b_grid_desc_k0_n_k1.GetLength(I2) * - 
b_grid_desc_k0_n_k1.GetLength(I4), - b_grid_desc_k0_n_k1.GetLength(I0) * b_grid_desc_k0_n_k1.GetLength(I3) * - b_grid_desc_k0_n_k1.GetLength(I5)); + b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * + b_grid_desc.GetLength(I4), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * + b_grid_desc.GetLength(I5)); } }; @@ -702,7 +748,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle : 0; static constexpr auto b_block_space_size_aligned = BEnableLds ? math::integer_least_multiple( - GetBBlockDescriptor_K0PerBlock_NPerBlock_K1().GetElementSpaceSize(), + MakeBBlockDescriptor().GetElementSpaceSize(), max_lds_align) : 0; @@ -737,7 +783,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle EDataType* __restrict__ p_e_grid, void* __restrict__ p_shared, const AGridDesc& a_grid_desc, - const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const BGridDesc& b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& @@ -753,7 +799,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc.GetElementSpaceSize()); const auto b_grid_buf = make_dynamic_buffer( - p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + p_b_grid, b_grid_desc.GetElementSpaceSize()); const auto ds_grid_buf = generate_tuple( [&](auto i) { return make_dynamic_buffer( @@ -789,7 +835,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle }(); constexpr auto a_block_desc = MakeABlockDescriptor(); - constexpr auto b_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + constexpr auto b_block_desc = MakeBBlockDescriptor(); auto a_block_trait = [&](){ // A matrix blockwise copy @@ -886,7 +932,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle BBlockTransferThreadClusterArrangeOrder, BDataType, BDataType, - decltype(b_grid_desc_k0_n_k1), + decltype(b_grid_desc), 
decltype(b_block_desc), BBlockTransferSrcAccessOrder, Sequence<0, 1, 2>, @@ -898,7 +944,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle 1, BThreadTransferSrcResetCoordinateAfterRun, true>( - b_grid_desc_k0_n_k1, + b_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, b_block_desc, @@ -909,23 +955,37 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle } else { - constexpr auto K0PerBlock = KPerBlock/ K1; - auto b_block_buf = make_static_buffer( + // Thread-wise copy + // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + auto b_block_buf = make_static_buffer( b_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical auto b_blockwise_copy = - ThreadwiseTensorSliceTransfer_v4{}, + Sequence{}, Number{}, + I1, + I1, + I1, Number{}>, - Sequence<0, 1, 2>, - 2, + Sequence<0, 1, 2, 3, 4, 5>, + 5, BBlockTransferSrcScalarPerVector, - 1>( - make_multi_index(0, get_thread_local_1d_id()/32 * 16 + get_thread_local_1d_id() % 16, 0)); - + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc, + make_multi_index(0, + n_block_data_idx_on_grid/(NWaves * NPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + return make_tuple(b_block_buf, b_blockwise_copy); } }; @@ -945,7 +1005,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle BDataType, AccDataType, decltype(MakeAWaveDescriptor(a_block_desc)), - decltype(MakeBBlockDescriptor_K0_N0_N1_N2_K1(b_block_desc)), + decltype(MakeBWaveDescriptor(b_block_desc)), MPerBlock, NPerBlock, KPerBlock, @@ -973,7 +1033,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle a_grid_buf, a_block_buf, a_block_slice_copy_step, - b_grid_desc_k0_n_k1, + b_grid_desc, b_block_desc, b_blockwise_copy, b_grid_buf, From 708fd81fff308576f879dfbc1ec14b525a1af2b6 Mon Sep 17 00:00:00 2001 From: aska-0096 
Date: Mon, 6 Mar 2023 11:45:41 +0000 Subject: [PATCH 053/118] batched gemm, conv, skip b lds --- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 4 +- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 6 +- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 84 ++++++++++++------- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 74 +++++++++++----- 4 files changed, 112 insertions(+), 56 deletions(-) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index d04d6b12cd6..25ab210739d 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -74,8 +74,8 @@ using DeviceOpInstanceKKNN = 8, 16, 16, - 1, 8, + 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -92,7 +92,7 @@ using DeviceOpInstanceKKNN = true, 1, 1, - S<1, 128, 1, 2>, + S<1, 16, 1, 16>, 8>; using DeviceOpInstance = DeviceOpInstanceKKNN; diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 269ebc074f5..1def3a7b680 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -58,8 +58,8 @@ using DeviceConvFwdInstance = 8, // K1 16, // MPerWMMA 16, // NPerWMMA - 1, // MRepeat - 8, // NRepeat + 4, // MRepeat + 2, // NRepeat S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder @@ -76,7 +76,7 @@ using DeviceConvFwdInstance = true, // BBlockLdsExtraN 1, 1, - S<1, 128, 1, 2>, + S<1, 32, 1, 8>, 8>; template diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 76684648531..f91479e2c7f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -228,7 +228,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle } // Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] - static auto MakeBGridDescriptor_K0_N_K1(const std::vector& b_gs_ns_ks_lengths_vec, + static auto MakeBGridDescriptor(const std::vector& b_gs_ns_ks_lengths_vec, const std::vector& b_gs_ns_ks_strides_vec) { assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK && @@ -287,14 +287,33 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle const auto N = b_grid_desc_n_k.GetLength(I0); const auto K = b_grid_desc_n_k.GetLength(I1); assert(K % K1 == 0); - const index_t K0 = K / K1; - - return transform_tensor_descriptor( - b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + if constexpr(BEnableLds) + { + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto B_KRow = WmmaK / K1; + const auto B_KWmma = K / WmmaK; + + const auto N0 = N / NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(N0 * NRepeat, Number{}, Number{}))), + 
make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } } // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] @@ -504,7 +523,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle }; using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({},{})); - using BGridDesc_K0_N_K1 = decltype(DeviceOp::MakeBGridDescriptor_K0_N_K1({},{})); + using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({},{})); // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< @@ -517,7 +536,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle EDataType, // InMemory Data Descriptor AGridDesc, - BGridDesc_K0_N_K1, + BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, // ElementwiseOp Family @@ -586,8 +605,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle p_b_grid_{static_cast(p_b_grid)}, p_ds_grid_{}, p_e_grid_{static_cast(p_e_grid)}, - a_grid_desc_k0_m_k1_{}, - b_grid_desc_k0_n_k1_{}, + a_grid_desc_{}, + b_grid_desc_{}, ds_grid_desc_m_n_{}, e_grid_desc_m_n_{}, ds_grid_desc_g_m_n_{ @@ -620,8 +639,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle p_ds_grid_(i) = static_cast(p_ds_grid[i]); }); - a_grid_desc_k0_m_k1_ = DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); - b_grid_desc_k0_n_k1_ = DeviceOp::MakeBGridDescriptor_K0_N_K1(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + a_grid_desc_ = DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + b_grid_desc_ = DeviceOp::MakeBGridDescriptor(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); ds_grid_desc_m_n_ = DeviceOp::MakeDsGridDescriptor_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides); @@ -660,8 +679,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle EDataType* p_e_grid_; // Tensor Descriptors - AGridDesc a_grid_desc_k0_m_k1_; - BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + AGridDesc a_grid_desc_; + BGridDesc b_grid_desc_; DsGridDesc_M_N ds_grid_desc_m_n_; EGridDesc_M_N e_grid_desc_m_n_; DsGridDesc_G_M_N 
ds_grid_desc_g_m_n_; @@ -714,8 +733,17 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle const index_t grid_size = arg.block_2_ctile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; - const auto K = - arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + const auto K = [&]() { + if constexpr(AEnableLds) + { + return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I2); + } + else + { + return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I3) * + arg.a_grid_desc_.GetLength(I5); + } + }(); auto launch_kernel = [&](auto has_main_k_block_loop) { constexpr bool has_main_loop = has_main_k_block_loop.value; @@ -727,7 +755,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle typename GridwiseOp::DsGridPointer, EDataType, DeviceOp::AGridDesc, - DeviceOp::BGridDesc_K0_N_K1, + DeviceOp::BGridDesc, typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, AElementwiseOperation, @@ -747,8 +775,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle arg.p_ds_grid_, arg.p_e_grid_, G, - arg.a_grid_desc_k0_m_k1_, - arg.b_grid_desc_k0_n_k1_, + arg.a_grid_desc_, + arg.b_grid_desc_, arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, arg.e_grid_desc_mblock_mperblock_nblock_nperblock, arg.a_element_op_, @@ -797,8 +825,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle return false; } - if(!GridwiseOp::CheckValidity(arg.a_grid_desc_k0_m_k1_, - arg.b_grid_desc_k0_n_k1_, + if(!GridwiseOp::CheckValidity(arg.a_grid_desc_, + arg.b_grid_desc_, arg.ds_grid_desc_m_n_, arg.e_grid_desc_m_n_, arg.block_2_ctile_map_)) @@ -816,7 +844,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle if constexpr(ABlockTransferSrcVectorDim == 1) { if(!(arg.a_mz_stride_ == 1 && - arg.a_grid_desc_k0_m_k1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) + arg.a_grid_desc_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) { 
printf("DeviceOp: Vector Access A-m check failure\n"); return false; @@ -825,7 +853,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle else { if(!(arg.a_kz_stride_ == 1 && - arg.a_grid_desc_k0_m_k1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) + arg.a_grid_desc_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) { printf("DeviceOp: Vector Access A-k check failure\n"); return false; @@ -836,7 +864,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle if constexpr(BBlockTransferSrcVectorDim == 1) { if(!(arg.b_nz_stride_ == 1 && - arg.b_grid_desc_k0_n_k1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) + arg.b_grid_desc_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) { printf("DeviceOp: Vector Access B-n check failure\n"); return false; @@ -845,7 +873,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle else { if(!(arg.b_kz_stride_ == 1 && - arg.b_grid_desc_k0_n_k1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) + arg.b_grid_desc_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) { printf("DeviceOp: Vector Access B-k check failure\n"); return false; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index f45a15ba294..30507031f00 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -245,7 +245,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle template static auto - MakeBGridDescriptor_BK0_N_BK1(const std::array& b_g_k_c_xs_lengths, + MakeBGridDescriptor(const std::array& b_g_k_c_xs_lengths, const std::array& b_g_k_c_xs_strides) { const auto wei_gemmnraw_gemmkraw_desc = @@ -257,15 +257,34 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle const auto N = 
wei_gemmn_gemmk_desc.GetLength(I0); const auto K = wei_gemmn_gemmk_desc.GetLength(I1); + assert(K % K1 == 0); + + if constexpr(BEnableLds) + { + const index_t K0 = K / K1; - const auto BK1 = K1; - const auto BK0 = K / BK1; + return transform_tensor_descriptor( + wei_gemmn_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto B_KRow = WmmaK / K1; + const auto B_KWmma = K / WmmaK; - return transform_tensor_descriptor(wei_gemmn_gemmk_desc, - make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), - make_pass_through_transform(N)), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + const auto N0 = N / NPerBlock; + + return transform_tensor_descriptor( + wei_gemmn_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(N0 * NRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } } template @@ -302,7 +321,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle using EGridDesc_M_N = remove_cvref_t({}, {}))>; using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {})); - using BGridDesc_BK0_N_BK1 = decltype(DeviceOp::MakeBGridDescriptor_BK0_N_BK1({}, {})); + using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({}, {})); // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< @@ -315,7 +334,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle EDataType, // InMemory Data Descriptor AGridDesc, - BGridDesc_BK0_N_BK1, + BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, // ElementwiseOp Family @@ -394,7 +413,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle ds_grid_desc_m_n_{}, 
e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, e_g_n_k_wos_strides)}, - a_grid_desc{DeviceOp::MakeAGridDescriptor(a_g_n_c_wis_lengths, + a_grid_desc_{DeviceOp::MakeAGridDescriptor(a_g_n_c_wis_lengths, a_g_n_c_wis_strides, b_g_k_c_xs_lengths, b_g_k_c_xs_strides, @@ -404,7 +423,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle conv_filter_dilations, input_left_pads, input_right_pads)}, - b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_g_k_c_xs_lengths, + b_grid_desc_{DeviceOp::MakeBGridDescriptor(b_g_k_c_xs_lengths, b_g_k_c_xs_strides)}, ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, e_grid_desc_mblock_mperblock_nblock_nperblock_{}, @@ -457,8 +476,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle void Print() const { - std::cout << "A[M, K]: " << a_grid_desc << std::endl; - std::cout << "B[N, K]: " << b_grid_desc_bk0_n_bk1_ << std::endl; + std::cout << "A[M, K]: " << a_grid_desc_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_ << std::endl; static_for<0, NumDTensor, 1>{}( [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; @@ -477,8 +496,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle EGridDesc_M_N e_grid_desc_m_n_; // tensor descriptors for block/thread-wise copy - AGridDesc a_grid_desc; - BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + AGridDesc a_grid_desc_; + BGridDesc b_grid_desc_; typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock_; typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock @@ -525,8 +544,17 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle const index_t grid_size = arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.num_group_; - const auto K = - arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I2); + const auto K = [&]() { + if constexpr(AEnableLds) + { + return 
arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I2); + } + else + { + return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I3) * + arg.a_grid_desc_.GetLength(I5); + } + }(); auto launch_kernel = [&](auto has_main_k_block_loop) { constexpr bool has_main_loop = has_main_k_block_loop.value; @@ -541,7 +569,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle BElementwiseOperation, CDEElementwiseOperation, DeviceOp::AGridDesc, - DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::BGridDesc, typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, remove_reference_t, @@ -561,8 +589,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle arg.b_element_op_, arg.cde_element_op_, arg.a_g_n_c_wis_lengths_[0], // Group count - arg.a_grid_desc, - arg.b_grid_desc_bk0_n_bk1_, + arg.a_grid_desc_, + arg.b_grid_desc_, arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, arg.block_2_etile_map_, @@ -731,8 +759,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } // check Gridwise GEMM - return GridwiseOp::CheckValidity(arg.a_grid_desc, - arg.b_grid_desc_bk0_n_bk1_, + return GridwiseOp::CheckValidity(arg.a_grid_desc_, + arg.b_grid_desc_, arg.ds_grid_desc_m_n_, arg.e_grid_desc_m_n_, arg.block_2_etile_map_); From 6e28a8ac64af00418b95c681645690cb16633ab9 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 6 Mar 2023 11:46:42 +0000 Subject: [PATCH 054/118] format --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 4 +- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 14 ++--- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 17 +++--- .../gpu/device/impl/device_gemm_wmma.hpp | 21 +++---- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 55 +++++++++---------- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 31 +++++------ .../gpu/grid/gridwise_gemm_wmma.hpp | 24 ++++---- .../threadwise_tensor_slice_transfer.hpp | 9 +-- 
script/clang-format-overwrite.sh | 2 +- 9 files changed, 77 insertions(+), 100 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 1def3a7b680..b9ecf2ac20a 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -54,12 +54,12 @@ using DeviceConvFwdInstance = 256, // BlockSize 128, // MPerBlock 128, // NPerBlock - 32, // KPerBlock + 32, // KPerBlock 8, // K1 16, // MPerWMMA 16, // NPerWMMA 4, // MRepeat - 2, // NRepeat + 2, // NRepeat S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index f91479e2c7f..56225d22998 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -140,7 +140,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // Assume: A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] static auto MakeAGridDescriptor(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + const std::vector& a_gs_ms_ks_strides_vec) { assert(a_gs_ms_ks_lengths_vec.size() == NumDimG + NumDimM + NumDimK && a_gs_ms_ks_strides_vec.size() == NumDimG + NumDimM + NumDimK); @@ -167,7 +167,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // lengths for K0, K1, ... 
const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); - const auto a_grid_desc_m_k = [&](){ + const auto a_grid_desc_m_k = [&]() { if constexpr(ASpec == TensorSpecialization::Packed) { auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); @@ -229,7 +229,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] static auto MakeBGridDescriptor(const std::vector& b_gs_ns_ks_lengths_vec, - const std::vector& b_gs_ns_ks_strides_vec) + const std::vector& b_gs_ns_ks_strides_vec) { assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK && b_gs_ns_ks_strides_vec.size() == NumDimG + NumDimN + NumDimK); @@ -256,7 +256,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // lengths for N0, N1, ... const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); - const auto b_grid_desc_n_k = [&](){ + const auto b_grid_desc_n_k = [&]() { if constexpr(BSpec == TensorSpecialization::Packed) { auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); @@ -522,8 +522,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle EGridDesc_G_M_N e_grid_desc_g_m_n_; }; - using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({},{})); - using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({},{})); + using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({}, {})); + using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({}, {})); // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< @@ -648,7 +648,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle e_grid_desc_m_n_ = DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); - block_2_ctile_map_ = GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01); ds_grid_desc_mblock_mperblock_nblock_nperblock = @@ -686,7 +685,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle DsGridDesc_G_M_N ds_grid_desc_g_m_n_; EGridDesc_G_M_N 
e_grid_desc_g_m_n_; - typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock; typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 1b9a36d26d1..696dcec60ac 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -18,7 +18,6 @@ #include "ck/host_utility/kernel_launch.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" - namespace ck { namespace tensor_operation { namespace device { @@ -163,16 +162,14 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD::value) { const auto b_grid_desc_nraw_kraw = - make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), - make_tuple(I1, StrideB)); + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(I1, StrideB)); return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } else if constexpr(is_same::value) { - const auto b_grid_desc_nraw_kraw = - make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), - make_tuple(StrideB, I1)); + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(StrideB, I1)); return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } @@ -260,10 +257,10 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD; - using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); + using AGridDesc = decltype(MakeAGridDescriptor(1, 1, 1)); + using BGridDesc = decltype(MakeBGridDescriptor(1, 1, 1)); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index c7e9a123366..58ecc719086 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -153,16 +153,14 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm::value) { const auto b_grid_desc_nraw_kraw = - make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), - make_tuple(I1, StrideB)); + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(I1, StrideB)); return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } else if constexpr(is_same::value) { - const auto b_grid_desc_nraw_kraw = - make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), - make_tuple(StrideB, I1)); + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(StrideB, I1)); return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); } @@ -219,9 +217,9 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{MPerBlock, NPerBlock, KPerBlock}; template - static auto - MakeAGridDescriptor(const std::array& a_g_n_c_wis_lengths, - const std::array& a_g_n_c_wis_strides, - const std::array& b_g_k_c_xs_lengths, - const std::array& b_g_k_c_xs_strides, - const std::array& e_g_n_k_wos_lengths, - const std::array& e_g_n_k_wos_strides, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& input_right_pads) + static auto MakeAGridDescriptor(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) { const 
auto in_gemmmraw_gemmkraw_desc = conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, @@ -210,7 +209,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle const auto in_gemmm_gemmk_desc = matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); - + const auto M = in_gemmm_gemmk_desc.GetLength(I0); const auto K = in_gemmm_gemmk_desc.GetLength(I1); assert(K % K1 == 0); @@ -244,9 +243,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } template - static auto - MakeBGridDescriptor(const std::array& b_g_k_c_xs_lengths, - const std::array& b_g_k_c_xs_strides) + static auto MakeBGridDescriptor(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) { const auto wei_gemmnraw_gemmkraw_desc = conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, @@ -254,7 +252,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle const auto wei_gemmn_gemmk_desc = matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); - + const auto N = wei_gemmn_gemmk_desc.GetLength(I0); const auto K = wei_gemmn_gemmk_desc.GetLength(I1); assert(K % K1 == 0); @@ -320,7 +318,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle using DsGridDesc_M_N = remove_cvref_t; using EGridDesc_M_N = remove_cvref_t({}, {}))>; - using AGridDesc = decltype(DeviceOp::MakeAGridDescriptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {})); + using AGridDesc = + decltype(DeviceOp::MakeAGridDescriptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {})); using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({}, {})); // GridwiseOp @@ -414,17 +413,17 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, e_g_n_k_wos_strides)}, a_grid_desc_{DeviceOp::MakeAGridDescriptor(a_g_n_c_wis_lengths, - a_g_n_c_wis_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - e_g_n_k_wos_lengths, - e_g_n_k_wos_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads)}, - 
b_grid_desc_{DeviceOp::MakeBGridDescriptor(b_g_k_c_xs_lengths, - b_g_k_c_xs_strides)}, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_{ + DeviceOp::MakeBGridDescriptor(b_g_k_c_xs_lengths, b_g_k_c_xs_strides)}, ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, e_grid_desc_mblock_mperblock_nblock_nperblock_{}, block_2_etile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_, M01, N01)}, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index f53ad7a76cd..cc9ae5e60aa 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -513,8 +513,7 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle } template - __host__ __device__ static constexpr auto - MakeBWaveDescriptor(const BBlockDesc_&) + __host__ __device__ static constexpr auto MakeBWaveDescriptor(const BBlockDesc_&) { constexpr auto b_wave_desc = [&]() { if constexpr(BEnableLds) @@ -595,12 +594,11 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template - __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc& a_grid_desc, - const BGridDesc& b_grid_desc, - const DsGridDesc_M_N& ds_grid_desc_m_n, - const EGridDesc_M_N& e_grid_desc_m_n, - const Block2CTileMap& block_2_ctile_map) + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, + const BGridDesc& b_grid_desc, + const DsGridDesc_M_N& ds_grid_desc_m_n, + const EGridDesc_M_N& e_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) { static_assert(is_known_at_compile_time>::value, "wrong! 
K1 need to be known at compile-time"); @@ -628,16 +626,14 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle if constexpr(BEnableLds) { return make_tuple(b_grid_desc.GetLength(I1), - b_grid_desc.GetLength(I0) * - b_grid_desc.GetLength(I2)); + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I2)); } else { - return make_tuple( - b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * - b_grid_desc.GetLength(I4), - b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * - b_grid_desc.GetLength(I5)); + return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * + b_grid_desc.GetLength(I4), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * + b_grid_desc.GetLength(I5)); } }; @@ -747,9 +743,8 @@ struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle max_lds_align) : 0; static constexpr auto b_block_space_size_aligned = - BEnableLds ? math::integer_least_multiple( - MakeBBlockDescriptor().GetElementSpaceSize(), - max_lds_align) + BEnableLds ? math::integer_least_multiple(MakeBBlockDescriptor().GetElementSpaceSize(), + max_lds_align) : 0; static constexpr auto a_block_space_offset = 0; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 091e25d2399..07d6b0848e4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -327,8 +327,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma } template - __host__ __device__ static constexpr auto - MakeBWaveDescriptor(const BBlockDesc_&) + __host__ __device__ static constexpr auto MakeBWaveDescriptor(const BBlockDesc_&) { constexpr auto b_wave_desc = [&]() { if constexpr(BEnableLds) @@ -394,11 +393,10 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template - __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc& a_grid_desc, - const BGridDesc& 
b_grid_desc, - const CGridDesc_M_N& c_grid_desc_m_n, - const Block2CTileMap& block_2_ctile_map) + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, + const BGridDesc& b_grid_desc, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) { static_assert(is_known_at_compile_time>::value, "wrong! K1 need to be known at compile-time"); @@ -426,16 +424,14 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma if constexpr(BEnableLds) { return make_tuple(b_grid_desc.GetLength(I1), - b_grid_desc.GetLength(I0) * - b_grid_desc.GetLength(I2)); + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I2)); } else { - return make_tuple( - b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * - b_grid_desc.GetLength(I4), - b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * - b_grid_desc.GetLength(I5)); + return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * + b_grid_desc.GetLength(I4), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * + b_grid_desc.GetLength(I5)); } }; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 539d362595f..2b8b236b568 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1398,13 +1398,8 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow if constexpr(IntraRowSwizzlePerm) { - temp = __builtin_amdgcn_permlane16( - temp, - type_convert(v_this_row), - 0xb3a29180, - 0xf7e6d5c4, - 1, - 0); + temp = __builtin_amdgcn_permlane16( + temp, type_convert(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0); v_this_row = type_convert(temp); } diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index f9d11fcd8cb..2ddbb6440d8 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 
+1,2 @@ -#find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +# find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From c5fd087e8b21c0848dc883c9306b98a2c95808fc Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 6 Mar 2023 13:01:15 +0000 Subject: [PATCH 055/118] Attn, skip b lds --- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 95 ++- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 693 +++++++++++++----- .../gpu/grid/gridwise_gemm_wmma.hpp | 2 +- .../transform_contraction_to_gemm.hpp | 56 ++ 4 files changed, 609 insertions(+), 237 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 0de115a70d9..1f65694cfa6 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -180,27 +180,57 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static auto MakeB0GridDescriptor(const std::vector& b0_gs_ls_ks_lengths_vec, const std::vector& b0_gs_ls_ks_strides_vec) { - return Transform::MakeB0GridDescriptor_BK0_N_BK1( - Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, 
b0_gs_ls_ks_strides_vec), - Number{}); + if constexpr(B0EnableLds) + { + return Transform::MakeB0GridDescriptor_BK0_N_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}); + } + else + { + return Transform::MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BKRow_LPerWmma_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } } - static auto MakeB1GridDescriptor_BL0_N_BL1(const std::vector& b1_gs_ns_ls_lengths_vec, - const std::vector& b1_gs_ns_ls_strides_vec) + static auto MakeB1GridDescriptor(const std::vector& b1_gs_ns_ls_lengths_vec, + const std::vector& b1_gs_ns_ls_strides_vec) { - return Transform::MakeB1GridDescriptor_BK0_N_BK1( - Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, b1_gs_ns_ls_strides_vec), - Number{}); + if constexpr(B1EnableLds) + { + return Transform::MakeB1GridDescriptor_BK0_N_BK1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}); + } + else + { + return Transform::MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves_BLRow_NPerWmma_BL1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } } - using AGridDesc = decltype(MakeAGridDescriptor({}, {})); - using B0GridDesc_BK0_L_BK1 = decltype(MakeB0GridDescriptor({}, {})); - using B1GridDesc_BL0_N_BL1 = decltype(MakeB1GridDescriptor_BL0_N_BL1({}, {})); - using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); - using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); - using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); - using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); - using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); + using AGridDesc = 
decltype(MakeAGridDescriptor({}, {})); + using B0GridDesc = decltype(MakeB0GridDescriptor({}, {})); + using B1GridDesc = decltype(MakeB1GridDescriptor({}, {})); + using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); + using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); + using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); constexpr static auto make_MaskOutPredicate() { @@ -274,8 +304,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle InMemoryDataOperationEnum::Set, // InMemory Data Descriptor AGridDesc, - B0GridDesc_BK0_L_BK1, - B1GridDesc_BL0_N_BL1, + B0GridDesc, + B1GridDesc, CGridDesc_M_N, // Tiling Family MPerBlock, @@ -364,10 +394,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle p_b1_grid_{p_b1_grid}, p_c_grid_{p_c_grid}, a_grid_desc{DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, - b0_grid_desc_bk0_l_bk1_{ + b0_grid_desc{ DeviceOp::MakeB0GridDescriptor(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, - b1_grid_desc_bl0_n_bl1_{DeviceOp::MakeB1GridDescriptor_BL0_N_BL1( - b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + b1_grid_desc{ + DeviceOp::MakeB1GridDescriptor(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, c_grid_desc_m_n_{ Transform::MakeCGridDescriptor_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, a_grid_desc_g_m_k_{ @@ -410,11 +440,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle ignore = acc1_biases_gs_ms_ns_lengths; ignore = acc1_biases_gs_ms_ns_strides; - if(GridwiseOp::CheckValidity(a_grid_desc, - b0_grid_desc_bk0_l_bk1_, - b1_grid_desc_bl0_n_bl1_, - c_grid_desc_m_n_, - block_2_ctile_map_)) + if(GridwiseOp::CheckValidity( + a_grid_desc, b0_grid_desc, b1_grid_desc, c_grid_desc_m_n_, block_2_ctile_map_)) { c_grid_desc_mblock_mperblock_nblock_nperblock_ = 
GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( @@ -430,8 +457,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle // Tensor Descriptors AGridDesc a_grid_desc; - B0GridDesc_BK0_L_BK1 b0_grid_desc_bk0_l_bk1_; - B1GridDesc_BL0_N_BL1 b1_grid_desc_bl0_n_bl1_; + B0GridDesc b0_grid_desc; + B1GridDesc b1_grid_desc; CGridDesc_M_N c_grid_desc_m_n_; AGridDesc_G_M_K a_grid_desc_g_m_k_; @@ -498,8 +525,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B1DataType, CDataType, DeviceOp::AGridDesc, - DeviceOp::B0GridDesc_BK0_L_BK1, - DeviceOp::B1GridDesc_BL0_N_BL1, + DeviceOp::B0GridDesc, + DeviceOp::B1GridDesc, typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, AElementwiseOperation, B0ElementwiseOperation, @@ -521,8 +548,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle arg.p_b1_grid_, arg.p_c_grid_, arg.a_grid_desc, - arg.b0_grid_desc_bk0_l_bk1_, - arg.b1_grid_desc_bl0_n_bl1_, + arg.b0_grid_desc, + arg.b1_grid_desc, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, arg.a_element_op_, arg.b0_element_op_, @@ -582,8 +609,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } if(!GridwiseOp::CheckValidity(arg.a_grid_desc, - arg.b0_grid_desc_bk0_l_bk1_, - arg.b1_grid_desc_bl0_n_bl1_, + arg.b0_grid_desc, + arg.b1_grid_desc, arg.c_grid_desc_m_n_, arg.block_2_ctile_map_)) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 350545a8691..267221bda97 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -18,14 +18,14 @@ namespace ck { -template (compute_base_ptr_of_batch.GetCBasePtr(g_idx))); - GridwiseGemm::template Run(p_a_grid + a_batch_offset, - p_b0_grid + b0_batch_offset, - p_b1_grid + b1_batch_offset, - 
p_c_grid + c_batch_offset, - p_shared, - a_grid_desc, - b0_grid_desc_bk0_l_bk1, - b1_grid_desc_l0_n_l1, - c_grid_desc_mblock_mperblock_nblock_nperblock, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op, - c0_matrix_mask, - block_2_ctile_map); + GridwiseOp::template Run(p_a_grid + a_batch_offset, + p_b0_grid + b0_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc, + b0_grid_desc, + b1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b0_element_op, + acc_element_op, + b1_element_op, + c_element_op, + c0_matrix_mask, + block_2_ctile_map); #else ignore = p_a_grid; ignore = p_b0_grid; ignore = p_b1_grid; ignore = p_c_grid; ignore = a_grid_desc; - ignore = b0_grid_desc_bk0_l_bk1; - ignore = b1_grid_desc_l0_n_l1; + ignore = b0_grid_desc; + ignore = b1_grid_desc; ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; ignore = a_element_op; ignore = b0_element_op; @@ -115,13 +115,13 @@ __global__ void // Gemm0: A [M x K] x B0 [K x L] = Acc [M x L] // Gemm1: Acc [M x L] x B1 [L x N] = C [M x N] -template {}; static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); static constexpr auto WmmaK = 16; + static constexpr auto WmmaL = 16; using ThisThreadBlock = ThisThreadBlock; @@ -250,6 +252,73 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle return a_block_desc; } + __host__ __device__ static constexpr auto MakeB0BlockDescriptor() + { + constexpr auto b0_block_desc = [&]() { + if constexpr(B0EnableLds) + { + // K0->L->BK1 Per Block + constexpr auto K0PerBlock = KPerBlock / BK1; + constexpr auto max_lds_align = BK1; + + if constexpr(B0BlockLdsExtraL) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + else + { + return 
make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, BK1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + // KWmma->NRepeat->NWave->NRow->NPerWmma->BK1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, I1, BK1), + make_tuple(Number{} * BK1, BK1, BK1, BK1, BK1, I1)); + } + }(); + + return b0_block_desc; + } + + __host__ __device__ static constexpr auto MakeB1BlockDescriptor() + { + constexpr auto b1_block_desc = [&]() { + if constexpr(B1EnableLds) + { + // L0->N->BL1 Per Block + constexpr auto max_lds_align = BL1; + + if constexpr(B1BlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, BL1), + make_tuple(Number{} * BL1, BL1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, BL1), max_lds_align); + } + } + else + { + constexpr auto LWmmaPerblock = LPerBlock / WmmaL; + // LWmma->NRepeat->NWave->NRow->LPerWmma->BL1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, I1, BL1), + make_tuple(Number{} * BL1, BL1, BL1, BL1, BL1, I1)); + } + }(); + + return b1_block_desc; + } + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() { constexpr auto a_block_copy_step = [&]() { @@ -270,6 +339,44 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle return a_block_copy_step; } + __host__ __device__ static constexpr auto MakeB0BlockSliceCopyStep() + { + constexpr auto b0_block_copy_step = [&]() { + if constexpr(B0EnableLds) + { + constexpr auto K0PerBlock = KPerBlock / BK1; + + return make_multi_index(K0PerBlock, 0, 0); + } + else + { + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + } + }(); + + return b0_block_copy_step; + } + + __host__ __device__ static constexpr auto MakeB1BlockSliceCopyStep() + { + constexpr auto b1_block_copy_step = [&]() { + if constexpr(B1EnableLds) + { 
+ return make_multi_index(L0PerBlock, 0, 0); + } + else + { + constexpr auto LWmmaPerBlock = LTilePerBlock / WmmaL; + + return make_multi_index(LWmmaPerBlock, 0, 0, 0, 0, 0); + } + }(); + + return b1_block_copy_step; + } + // Describe how data read from (LDS/VGPR) buffer template __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&) @@ -323,26 +430,61 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle return a_wave_desc; } - template - __host__ __device__ static constexpr auto - MakeB0BlockDescriptor_K0_L0_L1_L2_K1(const B0BlockDesc_BK0_L_BK1&) + template + __host__ __device__ static constexpr auto MakeB0WaveDescriptor(const B0BlockDesc_&) { - constexpr index_t B_K0 = B0BlockDesc_BK0_L_BK1{}.GetLength(I0); - constexpr index_t B_K1 = B0BlockDesc_BK0_L_BK1{}.GetLength(I2); - constexpr index_t LWaves = LPerBlock / (LRepeat * LPerWmma); - return transform_tensor_descriptor( - B0BlockDesc_BK0_L_BK1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + + constexpr auto b0_wave_desc = [&]() { + if constexpr(B0EnableLds) + { + // BK0_L_BK1 -> BK0_LRepeat_Lwaves_LPerWmma_BK1 + constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + B0BlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + else + { + // KWmma_LRepeat_LWave_KRow_LPerWmma_K1 -> K0_LRepeat_Lwaves_LPerWmma_K1 + constexpr auto KWmma = B0BlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = 
B0BlockDesc_{}.GetLength(I5); + + // Workaround, Freeze transform + return transform_tensor_descriptor( + B0BlockDesc_{}, + make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<3>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); + } + }(); + + return b0_wave_desc; } template __host__ __device__ static constexpr auto - MakeA1BlockDescriptor_L0_M0_M1_M2_L1(const A1BlockDesc_AL0_M_AL1&) + MakeA1WaveDescriptor_L0_M0_M1_M2_L1(const A1BlockDesc_AL0_M_AL1&) { constexpr index_t A_L0 = A1BlockDesc_AL0_M_AL1{}.GetLength(I0); constexpr index_t A_L1 = A1BlockDesc_AL0_M_AL1{}.GetLength(I2); @@ -356,37 +498,56 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); } - template - __host__ __device__ static constexpr auto - MakeB1BlockDescriptor_L0_N0_N1_N2_L1(const B1BlockDesc_BL0_N_BL1&) + template + __host__ __device__ static constexpr auto MakeB1WaveDescriptor(const B1BlockDesc_&) { - constexpr index_t B_K0 = B1BlockDesc_BL0_N_BL1{}.GetLength(I0); - constexpr index_t B_K1 = B1BlockDesc_BL0_N_BL1{}.GetLength(I2); - return transform_tensor_descriptor( - B1BlockDesc_BL0_N_BL1{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } + constexpr auto b1_wave_desc = [&]() { + if constexpr(B1EnableLds) + { + // BL0_N_BL1 -> BL0_NRepeat_Nwaves_NPerWmma_BL1 + constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0); + constexpr auto B_L1 = 
B1BlockDesc_{}.GetLength(I2); + return transform_tensor_descriptor( + B1BlockDesc_{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + } + else + { + // LWmma_NRepeat_NWave_LRow_NPerWmma_L1 -> L0_NRepeat_Nwaves_NPerWmma_L1 + constexpr auto LWmma = B1BlockDesc_{}.GetLength(I0); + constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I5); - __host__ __device__ static constexpr auto GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1() - { - // B matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(BK0, Number{}, BK1), - make_tuple(Number{} * BK1, BK1, I1)); - } + // Workaround, Freeze transform + return transform_tensor_descriptor( + B1BlockDesc_{}, + make_tuple(make_freeze_transform(I0), + make_pass_through_transform(Number{}), + make_pass_through_transform(Number{}), + make_pass_through_transform(I1), + make_pass_through_transform(I1), + make_pass_through_transform(Number{})), + make_tuple(Sequence<3>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{})); + } + }(); - __host__ __device__ static constexpr auto GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1() - { - // B1 matrix in LDS memory, dst of blockwise copy - return make_naive_tensor_descriptor( - make_tuple(BL0, Number{}, BL1), - make_tuple(Number{} * BL1, BL1, I1)); + return b1_wave_desc; } __host__ __device__ static constexpr auto @@ -410,31 +571,30 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle { // LDS allocation for A and B: be careful of alignment const index_t gemm0_bytes_end = - (SharedMemTrait::a_block_space_size_aligned * sizeof(FloatA) + - 
SharedMemTrait::b0_block_space_size_aligned * sizeof(FloatB0)); + (SharedMemTrait::a_block_space_size_aligned * sizeof(ADataType) + + SharedMemTrait::b0_block_space_size_aligned * sizeof(B0DataType)); const index_t gemm1_bytes_end = (SharedMemTrait::b1_block_space_offset + - SharedMemTrait::b1_block_space_size_aligned * sizeof(FloatB1)); + SharedMemTrait::b1_block_space_size_aligned * sizeof(B1DataType)); const index_t softmax_bytes_end = SharedMemTrait::reduction_space_offset + - SharedMemTrait::reduction_space_size_aligned * sizeof(FloatAcc0); + SharedMemTrait::reduction_space_size_aligned * sizeof(Acc0DataType); const index_t c_block_bytes_end = - SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); + SharedMemTrait::c_block_space_size * sizeof(CShuffleDataType); return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); } // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template - __host__ __device__ static constexpr bool - CheckValidity(const AGridDesc& a_grid_desc, - const B0GridDesc_BK0_L_BK1& b0_grid_desc_bk0_l_bk1, - const B1GridDesc_BL0_N_BL1& b1_grid_desc_l0_n_l1, - const CGridDesc_M_N& c_grid_desc_m_n, - const Block2CTileMap& block_2_ctile_map) + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, + const B0GridDesc& b0_grid_desc, + const B1GridDesc& b1_grid_desc, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) { static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && (LPerBlock % (LPerWmma * LRepeat)) == 0, @@ -455,10 +615,40 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle } }; + const auto GetB0ProblemsizeLK = [&]() { + if constexpr(B0EnableLds) + { + return make_tuple(b0_grid_desc.GetLength(I1), + b0_grid_desc.GetLength(I0) * b0_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(b0_grid_desc.GetLength(I1) * b0_grid_desc.GetLength(I2) * + b0_grid_desc.GetLength(I4), + b0_grid_desc.GetLength(I0) * 
b0_grid_desc.GetLength(I3) * + b0_grid_desc.GetLength(I5)); + } + }; + + const auto GetB1ProblemsizeNL = [&]() { + if constexpr(B1EnableLds) + { + return make_tuple(b1_grid_desc.GetLength(I1), + b1_grid_desc.GetLength(I0) * b1_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(b1_grid_desc.GetLength(I1) * b1_grid_desc.GetLength(I2) * + b1_grid_desc.GetLength(I4), + b1_grid_desc.GetLength(I0) * b1_grid_desc.GetLength(I3) * + b1_grid_desc.GetLength(I5)); + } + }; + const auto M = GetAProblemsizeMK()[I0]; - const auto L = b0_grid_desc_bk0_l_bk1.GetLength(I1); + const auto L = GetB0ProblemsizeLK()(I0); const auto K = GetAProblemsizeMK()[I1]; - const auto N = b1_grid_desc_l0_n_l1.GetLength(I1); + const auto N = GetB1ProblemsizeNL()(I0); if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) { @@ -567,17 +757,13 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle max_lds_align) : 0; static constexpr auto b0_block_space_size_aligned = - B0EnableLds - ? math::integer_least_multiple( - GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1().GetElementSpaceSize(), - max_lds_align) - : 0; + B0EnableLds ? math::integer_least_multiple( + MakeB0BlockDescriptor().GetElementSpaceSize(), max_lds_align) + : 0; static constexpr auto b1_block_space_size_aligned = - B1EnableLds - ? math::integer_least_multiple( - GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1().GetElementSpaceSize(), - max_lds_align) - : 0; + B1EnableLds ? 
math::integer_least_multiple( + MakeB1BlockDescriptor().GetElementSpaceSize(), max_lds_align) + : 0; static constexpr auto a_block_space_offset = 0; static constexpr auto b0_block_space_offset = a_block_space_size_aligned; @@ -599,14 +785,14 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle template - __device__ static void Run(const FloatA* __restrict__ p_a_grid, - const FloatB0* __restrict__ p_b0_grid, - const FloatB1* __restrict__ p_b1_grid, - FloatC* __restrict__ p_c_grid, + __device__ static void Run(const ADataType* __restrict__ p_a_grid, + const B0DataType* __restrict__ p_b0_grid, + const B1DataType* __restrict__ p_b1_grid, + CDataType* __restrict__ p_c_grid, void* __restrict__ p_shared, const AGridDesc& a_grid_desc, - const B0GridDesc_BK0_L_BK1& b0_grid_desc_k0_l_k1, - const B1GridDesc_BL0_N_BL1& b1_grid_desc_l0_n_l1, + const B0GridDesc& b0_grid_desc, + const B1GridDesc& b1_grid_desc, const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& c_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation& a_element_op, @@ -623,9 +809,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle const auto a_grid_buf = make_dynamic_buffer( p_a_grid, a_grid_desc.GetElementSpaceSize()); const auto b0_grid_buf = make_dynamic_buffer( - p_b0_grid, b0_grid_desc_k0_l_k1.GetElementSpaceSize()); + p_b0_grid, b0_grid_desc.GetElementSpaceSize()); const auto b1_grid_buf = make_dynamic_buffer( - p_b1_grid, b1_grid_desc_l0_n_l1.GetElementSpaceSize()); + p_b1_grid, b1_grid_desc.GetElementSpaceSize()); auto c_grid_buf = make_dynamic_buffer( p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); @@ -648,17 +834,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /*******************************************************************************/ // BlockLevel, A/B Matrix ThreadMapping in LDS, As Destinaion of BlockWise_Copy - const auto K = [&](){ - if constexpr(AEnableLds){ - return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); 
- } - else{ - return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); - } - }(); - constexpr auto a_block_desc = MakeABlockDescriptor(); - constexpr auto b0_block_desc_k0perblock_lperblock_k1 = GetB0BlockDescriptor_BK0PerBlock_LPerBlock_BK1(); + constexpr auto b0_block_desc = MakeB0BlockDescriptor(); auto a_block_trait = [&](){ // A matrix blockwise copy @@ -666,7 +843,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle { constexpr auto AK0PerBlock = KPerBlock/ AK1; auto a_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::a_block_space_offset, + static_cast(p_shared) + SharedMemTrait::a_block_space_offset, SharedMemTrait::a_block_space_size_aligned); auto a_blockwise_copy = @@ -677,8 +854,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, /* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatA, -/* typename DstData, */ FloatA, +/* typename SrcData, */ ADataType, +/* typename DstData, */ ADataType, /* typename SrcDesc, */ decltype(a_grid_desc), /* typename DstDesc, */ decltype(a_block_desc), /* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, @@ -705,13 +882,13 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // Thread-wise copy // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - auto a_block_buf = make_static_buffer( + auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); // Limitation: NumDim of Src and Dst descriptor should be identical auto a_blockwise_copy = - ThreadwiseTensorSliceTransfer_v2{}, @@ -736,20 +913,26 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle return make_tuple(a_block_buf, a_blockwise_copy); } }; + + auto b0_block_trait = [&](){ + if constexpr(B0EnableLds) + { + auto 
b0_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, + SharedMemTrait::b0_block_space_size_aligned); - // B matrix blockwise copy - auto b0_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, B0BlockTransferThreadClusterLengths_K0_L_K1, B0BlockTransferThreadClusterArrangeOrder, - FloatB0, - FloatB0, - decltype(b0_grid_desc_k0_l_k1), - decltype(b0_block_desc_k0perblock_lperblock_k1), + B0DataType, + B0DataType, + decltype(b0_grid_desc), + decltype(b0_block_desc), B0BlockTransferSrcAccessOrder, Sequence<0, 1, 2>, B0BlockTransferSrcVectorDim, @@ -760,15 +943,57 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle 1, B0ThreadTransferSrcResetCoordinateAfterRun, true>( - b0_grid_desc_k0_l_k1, + b0_grid_desc, make_multi_index(0, 0, 0), b0_element_op, - b0_block_desc_k0perblock_lperblock_k1, + b0_block_desc, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); + + return make_tuple(b0_block_buf, b0_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> LRepeat -> LWaves -> KRow -> LPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + auto b0_block_buf = make_static_buffer( + b0_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto b0_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B0BlockTransferSrcScalarPerVector, + B0ThreadTransferSrcResetCoordinateAfterRun, + true>( + b0_grid_desc, + make_multi_index(0, + 0/(LWaves * LPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(b0_block_buf, b0_blockwise_copy); + } + }; auto a_block_buf = a_block_trait()[I0]; auto a_blockwise_copy = a_block_trait()[I1]; + + auto b0_block_buf = b0_block_trait()[I0]; + auto b0_blockwise_copy = b0_block_trait()[I1]; 
/*******************************************************************************/ // Gemm0 @@ -776,11 +1001,11 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle auto blockwise_gemm0 = BlockwiseGemmWMMA< BlockSize, - FloatA, - FloatB0, - FloatAcc0, + ADataType, + B0DataType, + Acc0DataType, decltype(MakeAWaveDescriptor(a_block_desc)), - decltype(MakeB0BlockDescriptor_K0_L0_L1_L2_K1(b0_block_desc_k0perblock_lperblock_k1)), + decltype(MakeB0WaveDescriptor(b0_block_desc)), MPerBlock, LPerBlock, KPerBlock, @@ -816,16 +1041,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle make_tuple(Sequence<3, 4, 5>{}, Sequence<0, 1, 2>{}, Sequence<6>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); -/*******************************************************************************/ - // LDS allocation for A and B: be careful of alignment - - auto b0_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, - SharedMemTrait::b0_block_space_size_aligned); - +/*******************************************************************************/ // Shift Per SUB_K constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep(); - constexpr auto b0_block_slice_copy_step = make_multi_index(BK0, 0, 0); + constexpr auto b0_block_slice_copy_step = MakeB0BlockSliceCopyStep(); const auto a_block_reset_copy_step = [&](){ if constexpr(AEnableLds){ @@ -836,14 +1055,30 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle } }(); - const auto b0_block_reset_copy_step = make_multi_index(-b0_grid_desc_k0_l_k1.GetLength(I0), LPerBlock, 0); + const auto b0_block_reset_copy_step = [&](){ + if constexpr(B0EnableLds){ + return make_multi_index(-b0_grid_desc.GetLength(I0), LPerBlock, 0); + } + else{ + return make_multi_index(-b0_grid_desc.GetLength(I0), LRepeat, 0, 0, 0, 0); + } + }(); + + const auto K = [&](){ + if constexpr(AEnableLds){ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); + } + else{ + return 
a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); + } + }(); const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); /*******************************************************************************/ // softmax /*******************************************************************************/ auto workspace_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::reduction_space_offset, + static_cast(p_shared) + SharedMemTrait::reduction_space_offset, SharedMemTrait::reduction_space_size_aligned); // get acc0 7D thread cluster constexpr auto thread_cluster_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = @@ -879,7 +1114,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle make_tuple(mrepeat * mwave * mthreadpersubgroup, lrepeat * lwave * lsubgroup * laccvgprs)); auto blockwise_softmax = BlockwiseSoftmax{}; @@ -889,15 +1124,11 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; running_sum = 0; running_sum_new = 0; - running_max = NumericLimits::Lowest(); - running_max_new = NumericLimits::Lowest(); + running_max = NumericLimits::Lowest(); + running_max_new = NumericLimits::Lowest(); /*******************************************************************************/ // set up Gemm1 /*******************************************************************************/ - // B1 matrix in LDS memory, dst of blockwise copy - constexpr auto b1_block_desc_l0perblock_nperblock_l1 = GetB1BlockDescriptor_BL0PerBlock_NPerBlock_BL1(); - constexpr auto b1_block_slice_copy_step = make_multi_index(BL0, 0, 0); - // Acc0 thread buffer -> A1 thread buffer -> blockwise gemm // A1 matrix in VGPR constexpr auto A1ThreadSlice_L0PerBlock_MPerBlock_L1 = make_tuple( @@ -915,8 +1146,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // A1 matrix blockwise copy auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< - 
FloatAcc0, - FloatA, + Acc0DataType, + ADataType, decltype(acc0_thread_desc_l0perblock_mperblock_l1), decltype(a1_thread_desc_l0perblock_mperblock_l1), tensor_operation::element_wise::PassThrough, @@ -925,8 +1156,19 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle 2, laccvgprs>{tensor_operation::element_wise::PassThrough{}}; - // B1 matrix blockwise copy - auto b1_blockwise_copy = + auto a1_thread_buf = make_static_buffer( + a1_thread_desc_l0perblock_mperblock_l1.GetElementSpaceSize()); + + constexpr auto b1_block_desc = MakeB1BlockDescriptor(); + + auto b1_block_trait = [&](){ + if constexpr(B1EnableLds) + { + auto b1_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b1_block_space_offset, + SharedMemTrait::b1_block_space_size_aligned); + + auto b1_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< ThisThreadBlock, /* typename SrcElementwiseOperation, */ B1ElementwiseOperation, /* typename DstElementwiseOperation, */ tensor_operation::element_wise::PassThrough, @@ -934,10 +1176,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /* typename BlockSliceLengths, */ Sequence, /* typename ThreadClusterLengths, */ B1BlockTransferThreadClusterLengths_L0_N_L1, /* typename ThreadClusterArrangeOrder, */ B1BlockTransferThreadClusterArrangeOrder, -/* typename SrcData, */ FloatB1, -/* typename DstData, */ FloatB1, -/* typename SrcDesc, */ decltype(b1_grid_desc_l0_n_l1), -/* typename DstDesc, */ decltype(b1_block_desc_l0perblock_nperblock_l1), +/* typename SrcData, */ B1DataType, +/* typename DstData, */ B1DataType, +/* typename SrcDesc, */ decltype(b1_grid_desc), +/* typename DstDesc, */ decltype(b1_block_desc), /* typename SrcDimAccessOrder, */ B1BlockTransferSrcAccessOrder, /* typename DstDimAccessOrder, */ Sequence<1, 0, 2>, /* index_t SrcVectorDim, */ B1BlockTransferSrcVectorDim, @@ -949,26 +1191,64 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle /* bool ThreadTransferSrcResetCoordinateAfterRun, */ 
B1ThreadTransferSrcResetCoordinateAfterRun, /* bool ThreadTransferDstResetCoordinateAfterRun, */ true, // DstResetCoord NumGemmKPrefetchStage>( - b1_grid_desc_l0_n_l1, + b1_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), b1_element_op, - b1_block_desc_l0perblock_nperblock_l1, + b1_block_desc, make_multi_index(0, 0, 0), tensor_operation::element_wise::PassThrough{}); - - auto a1_thread_buf = make_static_buffer( - a1_thread_desc_l0perblock_mperblock_l1.GetElementSpaceSize()); - auto b1_block_buf = make_dynamic_buffer( - static_cast(p_shared)+ SharedMemTrait::b1_block_space_offset, - SharedMemTrait::b1_block_space_size_aligned); + + return make_tuple(b1_block_buf, b1_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 + constexpr auto LWmmaPerBlock = LTilePerBlock / WmmaL; + auto b1_block_buf = make_static_buffer( + b1_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto b1_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B1BlockTransferSrcScalarPerVector, + B1ThreadTransferSrcResetCoordinateAfterRun, + true>( + b1_grid_desc, + make_multi_index(0, + n_block_data_idx_on_grid/(NWaves * NPerWmma), + get_thread_local_1d_id() / 32, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(b1_block_buf, b1_blockwise_copy); + } + }; + + auto b1_block_buf = b1_block_trait()[I0]; + auto b1_blockwise_copy = b1_block_trait()[I1]; + + constexpr auto b1_block_slice_copy_step = MakeB1BlockSliceCopyStep(); auto blockwise_gemm1 = BlockwiseGemmWMMA c_thread_buf; + StaticBuffer c_thread_buf; c_thread_buf.Clear(); /*******************************************************************************/ @@ -1014,8 +1303,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle a_grid_buf, a_block_buf, 
a_block_slice_copy_step, - b0_grid_desc_k0_l_k1, - b0_block_desc_k0perblock_lperblock_k1, + b0_grid_desc, + b0_block_desc, b0_blockwise_copy, b0_grid_buf, b0_block_buf, @@ -1106,20 +1395,20 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle acc1_thread_buf.Clear(); // preload data into LDS - b1_blockwise_copy.RunRead(b1_grid_desc_l0_n_l1, b1_grid_buf); + b1_blockwise_copy.RunRead(b1_grid_desc, b1_grid_buf); - b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_l0_n_l1, + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc, b1_block_slice_copy_step); block_sync_lds(); // wait for reduction LDS read - b1_blockwise_copy.RunWrite(b1_block_desc_l0perblock_nperblock_l1, b1_block_buf); + b1_blockwise_copy.RunWrite(b1_block_desc, b1_block_buf); // main body if constexpr(num_gemm1_l_block_inner_loop > 1) { static_for<0, num_gemm1_l_block_inner_loop - 1, 1>{}([&](auto i) { - // Data cast from FloatAcc0 to FloatA happen here + // Data cast from Acc0DataType to ADataType happen here a1_blockwise_copy.Run(acc0_thread_desc_l0perblock_mperblock_l1, make_tuple(Number{}, I0, I0), acc0_thread_buf, @@ -1127,7 +1416,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle make_tuple(I0, I0, I0), a1_thread_buf); - b1_blockwise_copy.RunRead(b1_grid_desc_l0_n_l1, b1_grid_buf); + b1_blockwise_copy.RunRead(b1_grid_desc, b1_grid_buf); block_sync_lds(); @@ -1135,10 +1424,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle block_sync_lds(); - b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_l0_n_l1, + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc, b1_block_slice_copy_step); - b1_blockwise_copy.RunWrite(b1_block_desc_l0perblock_nperblock_l1, b1_block_buf); + b1_blockwise_copy.RunWrite(b1_block_desc, b1_block_buf); }); } // tail @@ -1177,9 +1466,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle static_for<0, c_thread_buf_slice_m, 1>{}([&](auto iM) { static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) { auto I = Number{}; - FloatAcc1 acc1 = acc1_thread_buf[I]; // P*V - 
FloatAcc1 c = c_thread_buf[I]; // O - FloatAcc1 c_new = + Acc1DataType acc1 = acc1_thread_buf[I]; // P*V + Acc1DataType c = c_thread_buf[I]; // O + Acc1DataType c_new = (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + math::exp(max[iM] - running_max_new[iM]) * acc1) / running_sum_new[iM]; @@ -1190,7 +1479,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_reset_copy_step); // rewind K - b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc_k0_l_k1, + b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc, b0_block_reset_copy_step); // rewind K and step N // update before next j iteration @@ -1220,7 +1509,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); auto c_shuffle_block_buf = make_dynamic_buffer( - static_cast(p_shared), + static_cast(p_shared), c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize()); constexpr auto c_block_desc_mrepeat_mwave_mthreadpersubgroup_nrepeat_nwave_nsubgroup_naccvgprs = transform_tensor_descriptor( @@ -1268,8 +1557,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle // shuffle: threadwise copy C from VGPR to LDS auto c_thread_copy_vgpr_to_lds = - ThreadwiseTensorSliceTransfer_v1r3, // BlockSliceLengths, CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, - FloatCShuffle, // typename SrcData, - FloatC, // typename DstData, + CShuffleDataType, // typename SrcData, + CDataType, // typename DstData, decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), Sequence<0, 1, 2, 3>, // typename DimAccessOrder, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 07d6b0848e4..0159a4ed7cf 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -719,7 +719,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_wmma // Thread-wise copy // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - auto b_block_buf = make_static_buffer( + auto b_block_buf = make_static_buffer( b_block_desc.GetElementSpaceSize()); // Limitation: NumDim of Src and Dst descriptor should be identical diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp index 8474d8a617f..3635d3bb2d1 100644 --- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -247,6 +247,34 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } + template + __host__ __device__ static constexpr auto + MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BKRow_LPerWmma_BK1( + const BGridDesc_L_K& b_grid_desc_l_k, + const WmmaK&, + const LRepeat&, + const LWaves&, + const LPerWmma&, + const BK1&) + { + const auto L0 = b_grid_desc_l_k.GetLength(I0) / NPerBlock; + const auto K = b_grid_desc_l_k.GetLength(I1); + const auto BKWmma = K / WmmaK{}; + constexpr auto BKRow = WmmaK{} / BK1{}; + + return transform_tensor_descriptor( + b_grid_desc_l_k, + make_tuple(make_unmerge_transform(make_tuple(BKWmma, BKRow, BK1{})), + make_unmerge_transform(make_tuple(L0 * LRepeat{}, LWaves{}, LPerWmma{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } + // // B1 // @@ -288,6 +316,34 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } + template + __host__ __device__ static constexpr auto + 
MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves_BLRow_NPerWmma_BL1( + const BGridDesc_N_L& b_grid_desc_n_l, + const WmmaL&, + const NRepeat&, + const NWaves&, + const NPerWmma&, + const BL1&) + { + const auto N0 = b_grid_desc_n_l.GetLength(I0) / OPerBlock; + const auto L = b_grid_desc_n_l.GetLength(I1); + const auto BLWmma = L / WmmaL{}; + constexpr auto BLRow = WmmaL{} / BL1{}; + + return transform_tensor_descriptor( + b_grid_desc_n_l, + make_tuple(make_unmerge_transform(make_tuple(BLWmma, BLRow, BL1{})), + make_unmerge_transform(make_tuple(N0 * NRepeat{}, NWaves{}, NPerWmma{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + } + // // C // From 8e862b7bb0f75eeecff57d6d86f8752eba3ca92c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 7 Mar 2023 05:07:17 +0000 Subject: [PATCH 056/118] Change GridwiseOp nam --- .../device_batched_contraction_multiple_d_wmma_cshuffle.hpp | 2 +- .../device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp | 2 +- .../gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp | 2 +- .../ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp | 2 +- .../impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 2 +- .../grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp | 2 +- .../gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 2 +- include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 56225d22998..26ac82c062d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -526,7 +526,7 @@ struct 
DeviceBatchedContractionMultipleD_Wmma_CShuffle using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({}, {})); // GridwiseOp - using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< + using GridwiseOp = GridwiseGemmMultipleD_Wmma< // DataType Family ADataType, BDataType, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 1f65694cfa6..cdb9b056b4c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -286,7 +286,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle }; // GridwiseOp - using GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle< + using GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma< // DataType Family ADataType, B0DataType, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 696dcec60ac..3425bb1564d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -263,7 +263,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD(1, 1, 1)); // GridwiseOp - using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< + using GridwiseOp = GridwiseGemmMultipleD_Wmma< // DataType Family ADataType, BDataType, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 58ecc719086..b8f94532490 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -222,7 +222,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm({}, {})); // GridwiseOp - using GridwiseOp = GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle< + using GridwiseOp = GridwiseGemmMultipleD_Wmma< // DataType Family ADataType, BDataType, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 267221bda97..2d6725b8bb6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -182,7 +182,7 @@ template -struct GridwiseBatchedGemmSoftmaxGemm_Wmma_CShuffle +struct GridwiseBatchedGemmSoftmaxGemm_Wmma { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index cc9ae5e60aa..5b41910a7cb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -321,7 +321,7 @@ template < // DataType Family index_t NumGemmKPrefetchStage = 1, LoopScheduler LoopSched = make_default_loop_scheduler(), PipelineVersion PipelineVer = PipelineVersion::v1> -struct GridwiseGemmMultipleD_k0mk1_k0nk1_mn_wmma_cshuffle +struct GridwiseGemmMultipleD_Wmma { static constexpr index_t NumDTensor = DsDataType::Size(); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 0159a4ed7cf..241e69dfb8b 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ 
-119,7 +119,7 @@ template -struct GridwiseGemm_k0mk1_k0nk1_mn_wmma +struct GridwiseGemm_Wmma { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; From a4694341bf4cc1f2c13c0e236180f2948ecc6ba1 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 23 Mar 2023 05:54:07 +0000 Subject: [PATCH 057/118] fix a typo caused bug --- include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 53da913f83a..d929dbebc20 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -47,7 +47,7 @@ __global__ void { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ defined(__gfx1102__)) - __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size]; GridwiseGemm::template Run(p_a_grid, p_b_grid, From dc8309db83130baeccd1514e29f86d86cf0d68c1 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 23 Mar 2023 10:46:40 +0000 Subject: [PATCH 058/118] Skip A_Lds sanity pass, Skip B_Lds scratch occured --- example/01_gemm/gemm_wmma_fp16.cpp | 10 +- include/ck/host_utility/kernel_launch.hpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 165 ++++++++++++------ .../gpu/device/impl/device_gemm_wmma.hpp | 103 +++++------ 4 files changed, 174 insertions(+), 106 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index a068ea20ea1..e0e7e5fae4a 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -42,8 +42,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, // K1 16, // MPerWmma 16, // NPerWmma - 8, // M Repeat - 1, // N-Repeat + 8, // M-Repeat // M-PerWmma / M-Repeat = 
M-Wave + 1, // N-Repeat // N-PerWmma / N-Repeat = N-Wave S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -51,16 +51,16 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, 8, true, - S<4, 16, 1>, + S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - 1, // C shuffle (M Repeat) Per store + 4, // C shuffle (M Repeat) Per store 1, // C shuffle (N Repeat) Per store - S<1, 16, 1, 16>, + S<1, 32, 1, 8>, 8>; // clang-format on diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index f5ad7408b55..c12147bb14f 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -35,7 +35,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, // warm up // kernel<<>>(args...); - const int nrepeat = 1; + const int nrepeat = 100; #if DEBUG_LOG printf("Start running %d times...\n", nrepeat); #endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index be8a44dae13..b91e1de4c9b 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -298,58 +298,123 @@ struct BlockwiseGemmWMMA auto b_thread_buf = make_static_buffer( b_thread_desc_.GetElementSpaceSize()); - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
- static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run( - a_block_desc_k0_m0_m1_m2_k1, - make_tuple( - Number{}, m0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0), - a_thread_buf); - - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, - n0, - I0, - I0, - I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0), - b_thread_buf); - - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(i) = - b_thread_buf[Number{}]; + // basic intrinsic to determine loopover direction + if constexpr(MRepeat < NRepeat) + { + static_for<0, KPerBlock / WmmaK, 1>{}( + [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, + m0, + I0, + I0, + I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, + n0, + I0, + I0, + I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0), + b_thread_buf); + + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + 
}); }); - - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); }); - }); - - }); + } + else + { + static_for<0, KPerBlock / WmmaK, 1>{}( + [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, + n0, + I0, + I0, + I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0), + b_thread_buf); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, + m0, + I0, + I0, + I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0), + a_thread_buf); + + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + }); + + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } } protected: diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 03301c2949a..cf832a134d4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -89,8 +89,7 @@ struct 
DeviceGemmWmma_CShuffle : public DeviceGemm; + using GridwiseGemm = + GridwiseGemm_Wmma; // Argument struct Argument : public BaseArgument @@ -572,7 +571,11 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm" - << " NumPrefetch: " + << " AEnableLds: " + << AEnableLds << ", " + << "BEnableLds: " + << BEnableLds << ", " + << "NumPrefetch: " << NumPrefetch << ", " << "LoopScheduler: " << LoopSchedToString[LoopSched] << ", " From 058300535b4ac4f7872dd4b93accd0ee2088e063 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 23 Mar 2023 14:49:21 +0000 Subject: [PATCH 059/118] Bug found, intra-row permute off caused --- example/01_gemm/gemm_wmma_fp16.cpp | 2 +- .../gpu/device/impl/device_gemm_wmma.hpp | 14 +++++++++----- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 1 - .../thread/threadwise_tensor_slice_transfer.hpp | 1 + 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index e0e7e5fae4a..45a7d8649ae 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -60,7 +60,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle true, 4, // C shuffle (M Repeat) Per store 1, // C shuffle (N Repeat) Per store - S<1, 32, 1, 8>, + S<1, 16, 1, 16>, 8>; // clang-format on diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index cf832a134d4..d083a709338 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -87,11 +87,15 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{MPerBlock, NPerBlock, KPerBlock}; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 86195387cb4..d77cb969ead 100644 --- 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -360,7 +360,6 @@ struct GridwiseGemmPipeline_v1<1, false, true> } }; -// placeholder template <> struct GridwiseGemmPipeline_v1<1, true, false> { diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 272de707c10..ad958dd2ab9 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1401,6 +1401,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // apply element-wise operation element_op_(v_this_row, src_buf[Number{}]); + // apply intra-row permute. if constexpr(IntraRowSwizzlePerm) { temp = __builtin_amdgcn_permlane16( From 44be6438065ed7b4c094aeae10a4677bfadcd0ed Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 23 Mar 2023 16:09:23 +0000 Subject: [PATCH 060/118] bug found --- .../gpu/thread/threadwise_tensor_slice_transfer.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index ad958dd2ab9..e12ba154c63 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1396,6 +1396,8 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow SrcData v_this_row, v_theother_row; // int type temp value due to intrinsic requirement + // TODO: This temp value will generate the scratch memory if + // IntraRowSwizzlePerm is flase int temp = 0; // apply element-wise operation From b8e153a43eba24233dd8dca1127fd5fb4a1c7e30 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 27 Mar 2023 10:11:55 +0000 Subject: [PATCH 
061/118] a fix --- example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp | 6 +++--- .../gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 12b868c7d12..005e7a6eecd 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -87,8 +87,8 @@ using DeviceOpInstance = 8, 16, 16, - 8, - 1, + 4, + 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -105,7 +105,7 @@ using DeviceOpInstance = true, 1, 1, - S<1, 16, 1, 16>, + S<1, 32, 1, 8>, 8>; int main(int argc, char* argv[]) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 9e9e5d1ec3b..632c6539110 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -151,7 +151,7 @@ __global__ void #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ defined(__gfx1102__)) // printf("entry kernel launch"); - __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size]; const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); @@ -241,7 +241,7 @@ __global__ void { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ defined(__gfx1102__)) - __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + __shared__ char p_shared[GridwiseOp::SharedMemTrait::lds_size]; GridwiseOp::template Run(p_a_grid, p_b_grid, From 0f1fca4dd68412b3117fe8bf120eae6c4a590dda Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 27 Mar 2023 13:22:53 +0000 Subject: [PATCH 
062/118] disable buffer load due to incorrect 3rd dword --- include/ck/ck.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 5009dec5e96..1c4c994b850 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -68,11 +68,12 @@ #define CK_USE_AMD_WMMA #endif +// TODO: enable buffer load when found correct 3rd dword // buffer load -#define CK_USE_AMD_BUFFER_LOAD 1 +#define CK_USE_AMD_BUFFER_LOAD 0 // buffer store -#define CK_USE_AMD_BUFFER_STORE 1 +#define CK_USE_AMD_BUFFER_STORE 0 // buffer atomic add: integer #define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1 From 31ca2f411fc42be14c82ca14f9b4084fe313cd23 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 29 Mar 2023 03:18:03 +0000 Subject: [PATCH 063/118] update fmha config, no scratch generated --- example/01_gemm/gemm_wmma_fp16.cpp | 10 +++++----- ...tched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 45a7d8649ae..f637568fc0b 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -42,8 +42,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, // K1 16, // MPerWmma 16, // NPerWmma - 8, // M-Repeat // M-PerWmma / M-Repeat = M-Wave - 1, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + 1, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 8, // N-Repeat // N-PerWmma / N-Repeat = N-Wave S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -58,9 +58,9 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, 8, true, - 4, // C shuffle (M Repeat) Per store - 1, // C shuffle (N Repeat) Per store - S<1, 16, 1, 16>, + 1, // C shuffle (M Repeat) Per store + 4, // C shuffle (N Repeat) Per store + S<1, 32, 1, 8>, 8>; // clang-format on diff --git 
a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index f79d75f99c3..ab07b593ea3 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -96,7 +96,7 @@ using DeviceGemmInstance = 256, // Gemm 0 128, // MPerBlock - 128, // LPerBlock + 64, // LPerBlock 32, // KPerBlock 8, // K1 // Gemm 1 @@ -108,7 +108,7 @@ using DeviceGemmInstance = 16, // NPerWMMA // Per repeat = wave_m = wave_num, wave_n = 1 1, // MRepeat - 8, // LRepeat + 4, // LRepeat 4, // NRepeat S<4, 64, 1>, // ABlockTransfer MK -> K0 M K1 S<1, 0, 2>, @@ -129,12 +129,12 @@ using DeviceGemmInstance = S<0, 2, 1>, 1, 8, - 1, // be eight? + 1, false, 1, // CShuffleMWmmaPerWavePerShuffle 2, // CShuffleNWmmaPerWavePerShuffle - S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock - 4, // CShuffleBlockTransferScalarPerVector_NPerBlock + S<1, 64, 1, 4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock MaskingSpec>; // MaskingSpecialization // Ref Gemm0: fp16 in, fp32 out From 82fef9e9887202da74dd25b2841815a358910f73 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 29 Mar 2023 03:44:15 +0000 Subject: [PATCH 064/118] update 3rd dword --- include/ck/ck.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 1c4c994b850..1d5c2a818f8 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -36,7 +36,7 @@ #elif defined(__gfx1030__) // for GPU code #define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 #elif defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) // for GPU code -#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000 +#define 
CK_BUFFER_RESOURCE_3RD_DWORD 0x31004000 #endif // FMA instruction From 5e30377881ba4b818e28882bc5083c1cec1a9518 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 7 Apr 2023 02:22:27 +0000 Subject: [PATCH 065/118] fmha config update --- .../batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index ab07b593ea3..552fd9dd1cb 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -96,12 +96,12 @@ using DeviceGemmInstance = 256, // Gemm 0 128, // MPerBlock - 64, // LPerBlock - 32, // KPerBlock + 64, // LPerBlock + 64, // KPerBlock 8, // K1 // Gemm 1 64, // NPerBlock - 32, // LTilePerBlock + 64, // LTilePerBlock 8, // L1 16, // MPerWMMA 16, // LPerWMMA From 2c265ebdc9f0f9993fbb205d364fde5d6e42b5e5 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 14 Apr 2023 03:33:39 +0000 Subject: [PATCH 066/118] FMHA, add support to gfx1101/gfx1102 --- .../device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp | 2 +- .../grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index cdb9b056b4c..2ded6d2eeea 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -588,7 +588,7 @@ struct 
DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static bool IsSupportedArgument(const Argument& arg) { - if(ck::get_device_name() == "gfx1100") + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102") { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 2d6725b8bb6..2ebd3b308c7 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -60,7 +60,7 @@ __global__ void const C0MatrixMask c0_matrix_mask, const Block2CTileMap block_2_ctile_map) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)) __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; const index_t num_blocks_per_batch = From cad3212d87b5046f6c9be37c547fbcfa84743726 Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Wed, 19 Apr 2023 22:09:44 +0800 Subject: [PATCH 067/118] Merge origin dev (#2) * [Navi3x] Fix Gridwise_multiple_d operation (#649) * Add CMake Option "USE_OPT_NAVI3X" * fix bug * standardize docs (#655) * Separate bibtex requirement from rocm-docs-core (#656) * separate bibtex requirement from rocm-docs-core * point requirements to source rocm-docs-core repo * Add CMake Option "USE_OPT_NAVI3X" (#647) * Add CMake Option "USE_OPT_NAVI3X" * remove navi3x opt compile option from cmake script * Conv + quantization + tanh (#645) * Rename file. 
Prepare to support another activation * Add comment for quantization * Extract out_elementop * Add tanh example * Add conv + bias + tanh quantization instance * Add missing parameter * Refine cmake * Add external api and client example * Extract variable in example * Fix the comment --------- Co-authored-by: zjing14 * Add a denorm test fix (#603) * Add type_convert implementations for bf16 * Add the fix for conv_fwd * Add the fix for conv_bwd_data * Add the fix for conv_bwd_weight * Format * Format * Another format * Add a macro to use workaround on MI200 only * Format --------- Co-authored-by: Rosty Geyyer Co-authored-by: zjing14 * simplify karg in device/grid of split-k op (#644) * simplify karg in device/grid split-k op * fix mk_kn_mn instances * add more instances * use name from tensor layout * fix 3rd dword of buffer source descriptor (#659) * add fp64 instances (#658) Co-authored-by: root * Issue #666: Revert "simplify karg in device/grid of split-k op (#644)" (#665) This reverts commit bb5530af91352dca062b791313d9b77700335ae9. 
* Groupnorm + swish external api (#668) * Rename to proper naming * Add example of groupnorm + swish * Extract duplicate code in example * Add groupnorm + swish instances * Refactor instance generation, split into multiple cpp file * Add external api and client example * Refine profiler message * Use ck math version of exp * Refine problem size in example * Add host version of exp * add a macro to turn on/off denorm fix (off by default) (#673) * add a macro to turn off denorm fix by default * expose the macro --------- Co-authored-by: root * fixed quant example (#672) Co-authored-by: root * Add dependabot config and pin rocm-docs-core (#663) * [gtest] suppress unsafe buffer warn (#670) ref: https://github.com/ROCmSoftwarePlatform/MIOpen/pull/1912 * Add memory index guard in wmma device ops (#667) * Add more macros to turn on/off denorm fix (#678) Co-authored-by: Rosty Geyyer * Fix a typo (#676) * Add (#677) * Allow using ROCm release candidate compilers. (#679) * enable use of rocm5.5 release candidate 4 * upgrade to ROCM5.5 RC5 * try fix the PUB_KEY error, remove the cmake-data package * upgrade to latest cmake version * use private dockerhub repo for rocm5.5 rc5 * add missing bracket * add vector load check * solve conflicts --------- Co-authored-by: Sam Wu Co-authored-by: Sam Wu Co-authored-by: rocking5566 Co-authored-by: zjing14 Co-authored-by: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Co-authored-by: Rosty Geyyer Co-authored-by: carlushuang Co-authored-by: root Co-authored-by: Jun Liu Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> --- .github/dependabot.yml | 12 + .gitignore | 11 +- .readthedocs.yaml | 18 ++ CMakeLists.txt | 7 + Dockerfile | 37 ++- Jenkinsfile | 31 +- README.md | 14 +- client_example/04_contraction/CMakeLists.txt | 14 +- ...near.cpp => contraction_bilinear_fp32.cpp} | 0 .../contraction_bilinear_fp64.cpp | 281 +++++++++++++++++ ...n_scale.cpp => contraction_scale_fp32.cpp} | 0
.../04_contraction/contraction_scale_fp64.cpp | 270 +++++++++++++++++ client_example/09_quantization/CMakeLists.txt | 6 + ..._fwd_bias_relu_perchannel_quantization.cpp | 4 +- ...2d_fwd_bias_relu_perlayer_quantization.cpp | 99 +++--- ..._fwd_bias_tanh_perchannel_quantization.cpp | 209 +++++++++++++ ...2d_fwd_bias_tanh_perlayer_quantization.cpp | 201 +++++++++++++ .../conv2d_fwd_perchannel_quantization.cpp | 4 +- .../conv2d_fwd_perlayer_quantization.cpp | 99 +++--- client_example/18_groupnorm/CMakeLists.txt | 2 + .../18_groupnorm/groupnorm_swish.cpp | 169 +++++++++++ cmake/googletest.cmake | 1 + doc/markdown/dockerhub.md | 93 ------ doc/markdown/tutorial_hello_world.md | 191 ------------ docs/{ => .doxygen}/Doxyfile | 10 +- docs/.sphinx/_toc.yml.in | 1 + docs/.sphinx/requirements.in | 2 + docs/.sphinx/requirements.txt | 283 ++++++++++++++++++ docs/{source => }/API_Reference_Guide.rst | 2 +- docs/{source => }/Contributors_Guide.rst | 0 .../Supported_Primitives_Guide.rst | 2 +- docs/conf.py | 25 ++ {doc/image => docs/data}/ck_component.png | Bin {doc/image => docs/data}/ck_layer.png | Bin docs/{source => }/dockerhub.rst | 0 docs/index.rst | 52 ++++ docs/{source => }/refs.bib | 0 docs/run_doc.sh | 15 - docs/run_doxygen.sh | 10 - docs/source/Disclaimer.rst | 13 - docs/source/Linux_Install_Guide.rst | 15 - docs/source/Makefile | 20 -- docs/source/conf.py | 219 -------------- docs/source/index.rst | 16 - docs/source/rocm_logo.png | Bin 355437 -> 0 bytes docs/{source => }/tutorial_hello_world.rst | 0 .../grouped_conv_fwd_xdl_fp16.cpp | 2 +- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 4 +- .../40_conv2d_fwd_quantization/CMakeLists.txt | 5 + ...bias_relu_perchannel_quantization_int8.cpp | 8 +- ...l_bias_relu_perlayer_quantization_int8.cpp | 9 +- ...bias_tanh_perchannel_quantization_int8.cpp | 87 ++++++ ...l_bias_tanh_perlayer_quantization_int8.cpp | 85 ++++++ ...2d_fwd_dl_perchannel_quantization_int8.cpp | 6 +- ...nv2d_fwd_dl_perlayer_quantization_int8.cpp | 7 +- 
...bias_relu_perchannel_quantization_int8.cpp | 8 +- ...l_bias_relu_perlayer_quantization_int8.cpp | 9 +- ...d_fwd_xdl_perchannel_quantization_int8.cpp | 6 +- ...v2d_fwd_xdl_perlayer_quantization_int8.cpp | 7 +- ..._bias_perchannel_quantization_example.inc} | 3 +- ...wd_bias_perlayer_quantization_example.inc} | 3 +- ...2d_fwd_perchannel_quantization_example.inc | 3 +- ...nv2d_fwd_perlayer_quantization_example.inc | 3 +- example/42_groupnorm/CMakeLists.txt | 3 +- example/42_groupnorm/common.hpp | 23 ++ .../groupnorm_sigmoid_mul_fp16.cpp | 56 ++++ example/42_groupnorm/groupnorm_swish_fp16.cpp | 40 +++ ...oid_fp16.cpp => run_groupnorm_example.inc} | 79 +---- include/ck/ck.hpp | 5 + .../device_grouped_conv_fwd_multiple_d.hpp | 2 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 3 +- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 89 +++++- .../gpu/device/impl/device_gemm_wmma.hpp | 71 ++++- ...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp | 2 +- .../gpu/element/quantization_operation.hpp | 131 +++++++- .../element/unary_element_wise_operation.hpp | 30 +- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 3 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 9 + .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 26 +- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 2 +- .../gpu/grid/gridwise_gemm_wmma.hpp | 7 + .../grid/gridwise_gemm_xdlops_bwd_weight.hpp | 60 ++-- .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp | 44 +-- .../threadwise_tensor_slice_transfer_v3r1.hpp | 2 +- include/ck/utility/data_type.hpp | 56 +++- include/ck/utility/math.hpp | 4 + include/ck/utility/math_v2.hpp | 18 ++ .../device_operation_instance_factory.hpp | 11 + .../gpu/contraction_bilinear.hpp | 66 ++++ .../gpu/contraction_scale.hpp | 66 ++++ .../gpu/normalization_swish.hpp | 93 ++++++ ...n_bias_forward_perchannel_quantization.hpp | 94 ++++++ ...ion_bias_forward_perlayer_quantization.hpp | 92 ++++++ .../gpu/contraction_bilinear/CMakeLists.txt | 6 + ..._shuffle_f64_f64_f64_f64_kknn_instance.cpp | 76 +++++ 
..._shuffle_f64_f64_f64_f64_knnn_instance.cpp | 76 +++++ ..._shuffle_f64_f64_f64_f64_mknn_instance.cpp | 76 +++++ ..._shuffle_f64_f64_f64_f64_mnnn_instance.cpp | 76 +++++ .../gpu/contraction_scale/CMakeLists.txt | 6 + ...xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp | 75 +++++ ...xdl_c_shuffle_f64_f64_f64_knn_instance.cpp | 75 +++++ ...xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp | 75 +++++ ...xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp | 75 +++++ .../gpu/normalization/CMakeLists.txt | 11 +- .../device_groupnorm_f16_instance.cpp | 23 ++ .../device_groupnorm_f32_instance.cpp | 23 ++ ...oupnorm_swish_f16_f32_f32_f16_instance.cpp | 24 ++ .../device_groupnorm_swish_f16_instance.cpp | 23 ++ .../device_groupnorm_swish_f32_instance.cpp | 23 ++ .../device_layernorm2d_f16_instance.cpp | 23 ++ .../device_layernorm2d_f32_instance.cpp | 23 ++ .../device_layernorm4d_f16_instance.cpp | 23 ++ .../device_layernorm4d_f32_instance.cpp | 23 ++ .../device_normalization_f16_instance.cpp | 70 ----- .../device_normalization_f32_instance.cpp | 69 ----- .../normalization_instance_common.hpp | 101 +++++++ .../conv2d_fwd/conv2d_quantization_common.hpp | 9 + ..._perchannel_quantization_int8_instance.cpp | 36 +++ ...as_perlayer_quantization_int8_instance.cpp | 37 +++ ..._perchannel_quantization_int8_instance.cpp | 35 +++ ...as_perlayer_quantization_int8_instance.cpp | 37 +++ .../profiler/profile_groupnorm_impl.hpp | 6 +- script/cmake-ck-dev.sh | 4 +- 123 files changed, 4098 insertions(+), 1041 deletions(-) create mode 100644 .github/dependabot.yml create mode 100644 .readthedocs.yaml rename client_example/04_contraction/{contraction_bilinear.cpp => contraction_bilinear_fp32.cpp} (100%) create mode 100644 client_example/04_contraction/contraction_bilinear_fp64.cpp rename client_example/04_contraction/{contraction_scale.cpp => contraction_scale_fp32.cpp} (100%) create mode 100644 client_example/04_contraction/contraction_scale_fp64.cpp create mode 100644 
client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp create mode 100644 client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp create mode 100644 client_example/18_groupnorm/CMakeLists.txt create mode 100644 client_example/18_groupnorm/groupnorm_swish.cpp delete mode 100644 doc/markdown/dockerhub.md delete mode 100644 doc/markdown/tutorial_hello_world.md rename docs/{ => .doxygen}/Doxyfile (99%) create mode 100644 docs/.sphinx/_toc.yml.in create mode 100644 docs/.sphinx/requirements.in create mode 100644 docs/.sphinx/requirements.txt rename docs/{source => }/API_Reference_Guide.rst (98%) rename docs/{source => }/Contributors_Guide.rst (100%) rename docs/{source => }/Supported_Primitives_Guide.rst (99%) create mode 100644 docs/conf.py rename {doc/image => docs/data}/ck_component.png (100%) rename {doc/image => docs/data}/ck_layer.png (100%) rename docs/{source => }/dockerhub.rst (100%) create mode 100644 docs/index.rst rename docs/{source => }/refs.bib (100%) delete mode 100755 docs/run_doc.sh delete mode 100755 docs/run_doxygen.sh delete mode 100644 docs/source/Disclaimer.rst delete mode 100644 docs/source/Linux_Install_Guide.rst delete mode 100644 docs/source/Makefile delete mode 100644 docs/source/conf.py delete mode 100644 docs/source/index.rst delete mode 100644 docs/source/rocm_logo.png rename docs/{source => }/tutorial_hello_world.rst (100%) create mode 100644 example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp create mode 100644 example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp rename example/40_conv2d_fwd_quantization/{run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc => run_conv2d_fwd_bias_perchannel_quantization_example.inc} (98%) rename example/40_conv2d_fwd_quantization/{run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc => run_conv2d_fwd_bias_perlayer_quantization_example.inc} (98%) create mode 100644 
example/42_groupnorm/common.hpp create mode 100644 example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp create mode 100644 example/42_groupnorm/groupnorm_swish_fp16.cpp rename example/42_groupnorm/{groupnorm_sigmoid_fp16.cpp => run_groupnorm_example.inc} (54%) create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp 
create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000000..ada22f1b56d --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. 
+# Please see the documentation for all configuration options: +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + - package-ecosystem: "pip" # See documentation for possible values + directory: "/" # Location of package manifests + open-pull-requests-limit: 10 + schedule: + interval: "daily" diff --git a/.gitignore b/.gitignore index 5667695bb55..362fb9e2ef0 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,11 @@ build* .gdb_history install.dir* -# directories containing generated documentation -docs/source/_build/ -docs/docBin/ +# documentation artifacts +build/ +_build/ +_images/ +_static/ +_templates/ +_toc.yml +docBin/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 00000000000..b7395368377 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,18 @@ +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.8" + +sphinx: + configuration: docs/conf.py + +formats: [htmlzip] + +python: + install: + - requirements: docs/.sphinx/requirements.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index a950f41e3da..841ad1c04f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ include(TargetFlags) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip) option(USE_BITINT_EXTENSION_INT4, "Whether to enable clang's BitInt extension to provide int4 data type." OFF) +option(USE_OPT_NAVI3X, "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." 
OFF) if(USE_BITINT_EXTENSION_INT4) add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) @@ -29,6 +30,12 @@ if(USE_BITINT_EXTENSION_INT4) message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}") endif() +if(USE_OPT_NAVI3X) + add_compile_options(-mcumode) + add_compile_options(-mno-wavefrontsize64) + message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}") +endif() + ## Threads set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) diff --git a/Dockerfile b/Dockerfile index b03cb836ad6..cbfd4626c1a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 -ARG ROCMVERSION=5.3 +ARG ROCMVERSION=5.4.3 ARG compiler_version="release" ARG compiler_commit="" @@ -8,23 +8,27 @@ RUN set -xe ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ RUN useradd -rm -d /home/jenkins -s /bin/bash -u 1004 jenkins -RUN useradd -rm -d /home/manitera -s /bin/bash -u 1002 manitera # Add rocm repository RUN apt-get update -RUN apt-get install -y wget gnupg -RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +RUN apt-get install -y wget gnupg curl +RUN --mount=type=ssh if [ "$ROCMVERSION" != "5.5"]; then \ + wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - ; \ + else sh -c "wget http://artifactory-cdn.amd.com/artifactory/list/amdgpu-deb/amd-nonfree-radeon_20.04-1_all.deb" && \ + apt update && apt-get install -y ./amd-nonfree-radeon_20.04-1_all.deb && \ + sh -c 'echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-release-archive-20.04-deb/ 5.5 rel-50 > /etc/apt/sources.list.d/rocm-build.list' && \ + amdgpu-repo --amdgpu-build=1558725 && DEBIAN_FRONTEND=noninteractive amdgpu-install -y --usecase=rocm ; \ + fi RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key 
add - RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" +RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg # Install dependencies RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ apt-utils \ build-essential \ ccache \ - cmake-data \ cmake \ - curl \ git \ hip-rocclr \ jq \ @@ -45,6 +49,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- rocm-device-libs \ rocm-cmake \ vim \ + nano \ zlib1g-dev \ openssh-server \ clang-format-10 \ @@ -52,6 +57,17 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow- apt-get clean && \ rm -rf /var/lib/apt/lists/* +#Install latest version of cmake +RUN apt purge --auto-remove -y cmake +RUN apt update +RUN apt install -y software-properties-common lsb-release +RUN apt clean all +RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null +RUN apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" +RUN apt install -y kitware-archive-keyring +RUN rm /etc/apt/trusted.gpg.d/kitware.gpg +RUN apt install -y cmake + # Setup ubsan environment to printstacktrace RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer ENV UBSAN_OPTIONS=print_stacktrace=1 @@ -87,12 +103,7 @@ ENV compiler_commit=$compiler_commit RUN sh -c "echo compiler version = '$compiler_version'" RUN sh -c "echo compiler commit = '$compiler_commit'" -RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \ - sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \ - sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \ - fi - -RUN --mount=type=ssh if [ "$compiler_version" != "release" ] 
&& [ "$compiler_commit" = "" ]; then \ +RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_version" !=~ ^"rc" ] && [ "$compiler_commit" = "" ]; then \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ cd llvm-project && mkdir build && cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ @@ -100,7 +111,7 @@ RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_com else echo "using the release compiler"; \ fi -RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \ +RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_version" !=~ ^"rc" ] && [ "$compiler_commit" != "" ]; then \ git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ diff --git a/Jenkinsfile b/Jenkinsfile index bb0b352d757..19ee17cf92f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,12 +19,23 @@ def runShell(String command){ def getDockerImageName(){ def img - if (params.COMPILER_COMMIT == ""){ - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + if (params.ROCMVERSION != "5.5"){ + if (params.COMPILER_COMMIT == ""){ + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + } + else{ + def commit = "${params.COMPILER_COMMIT}"[0..6] + img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" + } } else{ - def 
commit = "${params.COMPILER_COMMIT}"[0..6] - img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" + if (params.COMPILER_COMMIT == ""){ + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + } + else{ + def commit = "${params.COMPILER_COMMIT}"[0..6] + img = "${env.CK_DOCKERHUB_PRIVATE}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}" + } } return img } @@ -49,11 +60,11 @@ def build_compiler(){ compiler = '/opt/rocm/bin/hipcc' } else{ - if (params.COMPILER_VERSION == "release"){ - compiler = "/opt/rocm/llvm/bin/clang++" + if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){ + compiler = "/llvm-project/build/bin/clang++" } else{ - compiler = "/llvm-project/build/bin/clang++" + compiler = "/opt/rocm/llvm/bin/clang++" } } return compiler @@ -232,7 +243,7 @@ def buildHipClangJob(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION != "release"){ + if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } @@ -287,7 +298,7 @@ def runCKProfiler(Map conf=[:]){ dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION != "release"){ + if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } @@ -420,7 +431,7 @@ def Build_CK(Map conf=[:]){ 
dockerOpts = dockerOpts + " --env HSA_XNACK=1 " } def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " - if (params.COMPILER_VERSION != "release"){ + if (params.COMPILER_VERSION == "amd-stg-open" || params.COMPILER_COMMIT != ""){ dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " } diff --git a/README.md b/README.md index 151da974a55..04199f11bf2 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ CK utilizes two concepts to achieve performance portability and code maintainabi * A tile-based programming model * Algorithm complexity reduction for complex ML operators, using innovative technique we call "Tensor Coordinate Transformation". -![ALT](/doc/image/ck_component.png "CK Components") +![ALT](/docs/data/ck_component.png "CK Components") ## Code Structure Current CK library are structured into 4 layers: @@ -16,7 +16,17 @@ Current CK library are structured into 4 layers: * "Instantiated Kernel and Invoker" layer * "Client API" layer -![ALT](/doc/image/ck_layer.png "CK Layers") +![ALT](/docs/data/ck_layer.png "CK Layers") + +## Documentation + +Run the steps below to build documentation locally. + +``` +cd docs +pip3 install -r .sphinx/requirements.txt +python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . 
_build/html +``` ## Contributors The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md) diff --git a/client_example/04_contraction/CMakeLists.txt b/client_example/04_contraction/CMakeLists.txt index 971d5d9f1c0..7ffedfeef36 100644 --- a/client_example/04_contraction/CMakeLists.txt +++ b/client_example/04_contraction/CMakeLists.txt @@ -1,8 +1,14 @@ -add_executable(client_contraction_scale contraction_scale.cpp) -target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations) +add_executable(client_contraction_scale_fp32 contraction_scale_fp32.cpp) +target_link_libraries(client_contraction_scale_fp32 PRIVATE composable_kernel::device_operations) -add_executable(client_contraction_bilinear contraction_bilinear.cpp) -target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations) +add_executable(client_contraction_bilinear_fp32 contraction_bilinear_fp32.cpp) +target_link_libraries(client_contraction_bilinear_fp32 PRIVATE composable_kernel::device_operations) + +add_executable(client_contraction_scale_fp64 contraction_scale_fp64.cpp) +target_link_libraries(client_contraction_scale_fp64 PRIVATE composable_kernel::device_operations) + +add_executable(client_contraction_bilinear_fp64 contraction_bilinear_fp64.cpp) +target_link_libraries(client_contraction_bilinear_fp64 PRIVATE composable_kernel::device_operations) add_executable(contraction_g1m2n3k1_add_xdl_fp16 contraction_g1m2n3k1_add_xdl_fp16.cpp) target_link_libraries(contraction_g1m2n3k1_add_xdl_fp16 PRIVATE composable_kernel::device_operations) diff --git a/client_example/04_contraction/contraction_bilinear.cpp b/client_example/04_contraction/contraction_bilinear_fp32.cpp similarity index 100% rename from client_example/04_contraction/contraction_bilinear.cpp rename to client_example/04_contraction/contraction_bilinear_fp32.cpp diff --git a/client_example/04_contraction/contraction_bilinear_fp64.cpp 
b/client_example/04_contraction/contraction_bilinear_fp64.cpp new file mode 100644 index 00000000000..9238e4cd800 --- /dev/null +++ b/client_example/04_contraction/contraction_bilinear_fp64.cpp @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp" +#include "ck/library/utility/numeric.hpp" + +using F64 = double; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Bilinear; + +using ADataType = F64; +using BDataType = F64; +using AccDataType = F64; +using CShuffleDataType = F64; +using DDataType = F64; +using DsDataType = ck::Tuple; +using EDataType = F64; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ +// kknn +#if 1 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 
4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +// knnn +#elif 0 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{64, 1, 131072, 2048}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +// mknn +#elif 0 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{128, 1, 245760, 3840}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +// mnnn +#elif 0 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{128, 1, 245760, 3840}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{64, 1, 131072, 2048}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +#endif + + float alpha = 1.f; + float beta = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 25) + { + const ck::index_t M0 = std::stoi(argv[1]); + const ck::index_t M1 = std::stoi(argv[2]); + + const ck::index_t N0 = std::stoi(argv[3]); + const ck::index_t N1 = std::stoi(argv[4]); + + const 
ck::index_t K0 = std::stoi(argv[5]); + const ck::index_t K1 = std::stoi(argv[6]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])}; + + d_ms_ns_lengths = {M0, M1, N0, N1}; + d_ms_ns_strides = { + std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])}; + + alpha = std::stof(argv[23]); + beta = std::stof(argv[24]); + } + else + { + printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n"); + printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n"); + printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg23 to 24: alpha, beta\n"); + exit(0); + } + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * + f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides)); + SimpleDeviceMem d_device_buf(sizeof(DDataType) * + f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides)); + + using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD< + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple, + EDataType, 
+ ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{alpha, beta}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + 
sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/04_contraction/contraction_scale.cpp b/client_example/04_contraction/contraction_scale_fp32.cpp similarity index 100% rename from client_example/04_contraction/contraction_scale.cpp rename to client_example/04_contraction/contraction_scale_fp32.cpp diff --git a/client_example/04_contraction/contraction_scale_fp64.cpp b/client_example/04_contraction/contraction_scale_fp64.cpp new file mode 100644 index 00000000000..3c36aa21eb6 --- /dev/null +++ b/client_example/04_contraction/contraction_scale_fp64.cpp @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp" +#include "ck/library/utility/numeric.hpp" + +using F64 = double; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Scale; + +using ADataType = F64; +using BDataType = F64; +using AccDataType = F64; +using CShuffleDataType = F64; +using DsDataType = ck::Tuple<>; +using EDataType = F64; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ +// kkn +#if 1 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +// knn +#elif 0 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + 
std::vector b_ns_ks_strides{64, 1, 131072, 2048}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +// mkn +#elif 0 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{128, 1, 245760, 3840}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +// mnn +#elif 0 + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{128, 1, 245760, 3840}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{64, 1, 131072, 2048}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; +#endif + + float scale = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 20) + { + const ck::index_t M0 = std::stoi(argv[1]); + const ck::index_t M1 = std::stoi(argv[2]); + + const ck::index_t N0 = std::stoi(argv[3]); + const ck::index_t N1 = std::stoi(argv[4]); + + const ck::index_t K0 = std::stoi(argv[5]); + const ck::index_t K1 = std::stoi(argv[6]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])}; + + 
e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])}; + + scale = std::stof(argv[19]); + } + else + { + printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n"); + printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg19: scale\n"); + exit(0); + } + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * + f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides)); + + using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD< + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple<>, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{scale}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do 
timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/09_quantization/CMakeLists.txt 
b/client_example/09_quantization/CMakeLists.txt index a4dd80cd3fd..2b7d6fc806a 100644 --- a/client_example/09_quantization/CMakeLists.txt +++ b/client_example/09_quantization/CMakeLists.txt @@ -1,6 +1,12 @@ +add_executable(client_conv2d_fwd_bias_tanh_perchannel_quantization conv2d_fwd_bias_tanh_perchannel_quantization.cpp) +target_link_libraries(client_conv2d_fwd_bias_tanh_perchannel_quantization PRIVATE composable_kernel::device_operations) + add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp) target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations) +add_executable(client_conv2d_fwd_bias_tanh_perlayer_quantization conv2d_fwd_bias_tanh_perlayer_quantization.cpp) +target_link_libraries(client_conv2d_fwd_bias_tanh_perlayer_quantization PRIVATE composable_kernel::device_operations) + add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp) target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations) diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp index cf6807f0ddf..a10dd3e006a 100644 --- a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp +++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp @@ -73,7 +73,7 @@ int main(int argc, char* argv[]) SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); - SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); using DeviceOp = 
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< @@ -203,4 +203,4 @@ int main(int argc, char* argv[]) } return 0; -} \ No newline at end of file +} diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp index 7cbbd283229..b8e6a493efc 100644 --- a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp +++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp @@ -26,15 +26,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clam static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t G = 1; -static constexpr ck::index_t N = 4; // batch size -static constexpr ck::index_t K = 64; // output channel -static constexpr ck::index_t C = 192; // input channel -static constexpr ck::index_t Y = 3; // filter H -static constexpr ck::index_t X = 3; // filter W -static constexpr ck::index_t Hi = 71; // input H -static constexpr ck::index_t Wi = 71; // input W -static constexpr ck::index_t Ho = 36; // output H -static constexpr ck::index_t Wo = 36; // output W +static constexpr ck::index_t N = 4; // batch size +static constexpr ck::index_t K = 64; // output channel +static constexpr ck::index_t C = 192; // input channel +static constexpr ck::index_t Y = 3; // filter H +static constexpr ck::index_t X = 3; // filter W +static constexpr ck::index_t Hi = 71; // input H +static constexpr ck::index_t Wi = 71; // input W +static constexpr ck::index_t Ho = 36; // output H +static constexpr ck::index_t Wo = 36; // output W +static constexpr float requant_scale = 0.5f; // requantize qAcc to qz struct SimpleDeviceMem { @@ -102,26 +103,27 @@ int main(int argc, char* argv[]) for(int i = 0; i < op_ptrs.size(); ++i) { - auto& op_ptr = op_ptrs[i]; - auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), - wei.GetDeviceBuffer(), - {bias.GetDeviceBuffer()}, - 
out.GetDeviceBuffer(), - in_lengths, - in_strides, - weight_lengths, - weight_strides, - {bias_lengths}, - {bias_strides}, - out_lengths, - out_strides, - conv_strides, - conv_dilations, - in_left_pad, - in_right_pad, - PassThrough{}, - PassThrough{}, - OutElementOp{0.5f, ActivationOp{}}); + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{requant_scale, ActivationOp{}}); auto invoker_ptr = op_ptr->MakeInvokerPointer(); std::string op_name = op_ptr->GetTypeString(); @@ -165,25 +167,26 @@ int main(int argc, char* argv[]) auto& op_ptr = op_ptrs[best_op_id]; std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() << std::endl; - auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), - wei.GetDeviceBuffer(), - {bias.GetDeviceBuffer()}, - out.GetDeviceBuffer(), - in_lengths, - in_strides, - weight_lengths, - weight_strides, - {bias_lengths}, - {bias_strides}, - out_lengths, - out_strides, - conv_strides, - conv_dilations, - in_left_pad, - in_right_pad, - PassThrough{}, - PassThrough{}, - OutElementOp{0.5f, ActivationOp{}}); + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{requant_scale, ActivationOp{}}); auto invoker_ptr = op_ptr->MakeInvokerPointer(); diff --git 
a/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp new file mode 100644 index 00000000000..a0e1865d320 --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perchannel_quantization.cpp @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using RequantScaleDataType = float; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::tensor_layout::convolution::G_K; +using RequantScaleLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::TanH; +using OutElementOp = + ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; // batch size +static constexpr ck::index_t K = 64; // output channel +static constexpr ck::index_t C = 192; // input channel +static constexpr ck::index_t Y = 3; // filter H +static constexpr ck::index_t X = 3; // filter W +static constexpr ck::index_t Hi = 71; // input H +static constexpr ck::index_t Wi = 71; // input W +static constexpr ck::index_t Ho = 36; // output 
H +static constexpr ck::index_t Wo = 36; // output W +static constexpr float sz_inv = 0.5f; // inverse of scale_z + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array bias_lengths{G, N, K, Ho, Wo}; + std::array bias_strides{K, 0, 1, 0, 0}; + std::array requant_scale_lengths{G, N, K, Ho, Wo}; + std::array requant_scale_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, K, Ho, Wo}; + std::array out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< + NumDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = 
std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths, requant_scale_lengths}, + {bias_strides, requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{sz_inv, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = + G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(BiasDataType) * K + G * sizeof(RequantScaleDataType) * K + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + // run the best instance + if(best_op_id != -1) + { + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " 
GB/s, " << best_op_name << std::endl; + + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths, requant_scale_lengths}, + {bias_strides, requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{sz_inv, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp new file mode 100644 index 00000000000..7637f5c7853 --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_bias_tanh_perlayer_quantization.cpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::TanH; +using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; // batch size +static constexpr ck::index_t K = 64; // output channel +static constexpr ck::index_t C = 192; // input channel +static constexpr ck::index_t Y = 3; // filter H +static constexpr ck::index_t X = 3; // filter W +static constexpr ck::index_t Hi = 71; // input H +static constexpr ck::index_t Wi = 71; // input W +static constexpr ck::index_t Ho = 36; // output H +static constexpr ck::index_t Wo = 36; // output W +static constexpr float sacc = 0.5f; // scale of acc +static constexpr float sz_inv = 0.5f; // inverse of scale_z + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array 
in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array bias_lengths{G, N, K, Ho, Wo}; + std::array bias_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, K, Ho, Wo}; + std::array out_strides{N * Ho * Wo * K, Ho * Wo * K, 1, Wo * K, K}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{sacc, sz_inv, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + 
std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = + G * sizeof(InDataType) * N * Hi * Wi * C + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(BiasDataType) * K + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + // run the best intance + if(best_op_id != -1) + { + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{sacc, sz_inv, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No 
newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp index c1c5a651eb3..6439c22e7fe 100644 --- a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp +++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp @@ -69,7 +69,7 @@ int main(int argc, char* argv[]) SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); - SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); using DeviceOp = @@ -196,4 +196,4 @@ int main(int argc, char* argv[]) } return 0; -} \ No newline at end of file +} diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp index daeff4ff4f8..f7c46a95fe0 100644 --- a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp +++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp @@ -24,15 +24,16 @@ using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_ClampMakeArgumentPointer(in.GetDeviceBuffer(), - wei.GetDeviceBuffer(), - {}, - out.GetDeviceBuffer(), - in_lengths, - in_strides, - weight_lengths, - weight_strides, - {}, - {}, - out_lengths, - out_strides, - conv_strides, - conv_dilations, - in_left_pad, - in_right_pad, - PassThrough{}, - PassThrough{}, - OutElementOp{0.5f, ActivationOp{}}); + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + 
OutElementOp{requant_scale, ActivationOp{}}); auto invoker_ptr = op_ptr->MakeInvokerPointer(); std::string op_name = op_ptr->GetTypeString(); @@ -158,25 +160,26 @@ int main(int argc, char* argv[]) auto& op_ptr = op_ptrs[best_op_id]; std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() << std::endl; - auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), - wei.GetDeviceBuffer(), - {}, - out.GetDeviceBuffer(), - in_lengths, - in_strides, - weight_lengths, - weight_strides, - {}, - {}, - out_lengths, - out_strides, - conv_strides, - conv_dilations, - in_left_pad, - in_right_pad, - PassThrough{}, - PassThrough{}, - OutElementOp{0.5f, ActivationOp{}}); + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{requant_scale, ActivationOp{}}); auto invoker_ptr = op_ptr->MakeInvokerPointer(); diff --git a/client_example/18_groupnorm/CMakeLists.txt b/client_example/18_groupnorm/CMakeLists.txt new file mode 100644 index 00000000000..17c88cb61bc --- /dev/null +++ b/client_example/18_groupnorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_groupnorm_swish groupnorm_swish.cpp) +target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_operations) diff --git a/client_example/18_groupnorm/groupnorm_swish.cpp b/client_example/18_groupnorm/groupnorm_swish.cpp new file mode 100644 index 00000000000..a79630c2371 --- /dev/null +++ b/client_example/18_groupnorm/groupnorm_swish.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization_swish.hpp" + +using XDataType = ck::half_t; +using GammaDataType = float; +using BetaDataType = float; +using YDataType = ck::half_t; +using ComputeDataType = float; +using Swish = ck::tensor_operation::element_wise::Swish; + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; + + std::size_t xy_size = N * H * W * G * C; + std::size_t gamma_beta_size = G * C; + + std::vector xy_strides = {H * W * G * C, W * G * C, G * C, C, 1}; + std::vector gamma_beta_strides = {0, 0, 0, C, 1}; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_beta_size); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * gamma_beta_size); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); + + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // 
profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = + sizeof(XDataType) * xy_size + sizeof(GammaDataType) * gamma_beta_size + + sizeof(BetaDataType) * gamma_beta_size + sizeof(YDataType) * xy_size; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths + xy_strides, // xStrides + gamma_beta_strides, // gammaStrides + gamma_beta_strides, // betaStrides + xy_strides, // yStrides + {1, 2, 4}, // reduceDims + 1e-6, + 
x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + Swish{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake index 3c6cb56ccea..d6577ac33e7 100644 --- a/cmake/googletest.cmake +++ b/cmake/googletest.cmake @@ -21,6 +21,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS -Wno-comma -Wno-old-style-cast -Wno-deprecated + -Wno-unsafe-buffer-usage ) message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") diff --git a/doc/markdown/dockerhub.md b/doc/markdown/dockerhub.md deleted file mode 100644 index 91b6cb2295c..00000000000 --- a/doc/markdown/dockerhub.md +++ /dev/null @@ -1,93 +0,0 @@ -## CK docker hub - -[Docker hub](https://hub.docker.com/r/rocm/composable_kernel) - -## Why do I need this? - -To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images. - -## So what is Composable Kernel? - -Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. 
- -To get the CK library - -``` -git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git -``` - -run a docker container - -``` -docker run \ --it \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ -rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ -/bin/bash -``` - -and build the CK - -``` -mkdir build && cd build - -# Need to specify target ID, example below is for gfx908 and gfx90a -cmake \ --D CMAKE_PREFIX_PATH=/opt/rocm \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_CXX_FLAGS="-O3" \ --D CMAKE_BUILD_TYPE=Release \ --D GPU_TARGETS="gfx908;gfx90a" \ -.. -``` - -and - -``` -make -j examples tests -``` - -To run all the test cases including tests and examples run - -``` -make test -``` - -We can also run specific examples or tests like - -``` -./bin/example_gemm_xdl_fp16 -./bin/test_gemm_fp16 -``` - -For more details visit [CK github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel), [CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example), [even more CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example). - -## And what is inside? - -The docker images have everything you need for running CK including: - -* [ROCm](https://www.amd.com/en/graphics/servers-solutions-rocm) -* [CMake](https://cmake.org/) -* [Compiler](https://github.com/RadeonOpenCompute/llvm-project) - -## Which image is right for me? - -Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". The image specs are: - -* "ck" - made for running Composable Kernel -* "ub20.04" - based on Ubuntu 20.04 -* "rocm5.4" - ROCm platform version 5.4 -* "release" - compiler version is release - -So just pick the right image for your project dependencies and you're all set. 
- -## DIY starts here - -If you need to customize a docker image or just can't stop tinkering, feel free to adjust the [Dockerfile](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile) for your needs. - -## License - -CK is released under the MIT [license](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE). diff --git a/doc/markdown/tutorial_hello_world.md b/doc/markdown/tutorial_hello_world.md deleted file mode 100644 index 297df10b5c6..00000000000 --- a/doc/markdown/tutorial_hello_world.md +++ /dev/null @@ -1,191 +0,0 @@ -## CK Hello world - -## Motivation - -This tutorial is aimed at engineers dealing with artificial intelligence and machine learning who would like to optimize their pipelines and squeeze every performance drop by adding Composable Kernel (CK) library to their projects. We would like to make the CK library approachable so the tutorial is not based on the latest release and doesn't have all the bleeding edge features, but it will be reproducible now and forever. - -During this tutorial we will have an introduction to the CK library, we will build it and run some examples and tests, so to say we will run a "Hello world" example. In future tutorials we will go in depth and breadth and get familiar with other tools and ways to integrate CK into your project. - -## Description - -Modern AI technology solves more and more problems in all imaginable fields, but crafting fast and efficient workflows is still challenging. CK is one of the tools to make AI heavy lifting as fast and efficient as possible. CK is a collection of optimized AI operator kernels and tools to create new ones. The library has components required for majority of modern neural networks architectures including matrix multiplication, convolution, contraction, reduction, attention modules, variety of activation functions, fused operators and many more. - -So how do we (almost) reach the speed of light? 
CK acceleration abilities are based on: - -* Layered structure. -* Tile-based computation model. -* Tensor coordinate transformation. -* Hardware acceleration use. -* Support of low precision data types including fp16, bf16, int8 and int4. - -If you are excited and need more technical details and benchmarking results - read this awesome blog [post](https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224). - -For more details visit our [github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel). - -## Hardware targets - -CK library fully supports "gfx908" and "gfx90a" GPU architectures and only some operators are supported for "gfx1030". Let's check the hardware you have at hand and decide on the target GPU architecture - -GPU Target AMD GPU -gfx908 Radeon Instinct MI100 -gfx90a Radeon Instinct MI210, MI250, MI250X -gfx1030 Radeon PRO V620, W6800, W6800X, W6800X Duo, W6900X, RX 6800, RX 6800 XT, RX 6900 XT, RX 6900 XTX, RX 6950 XT - -There are also [cloud options](https://aws.amazon.com/ec2/instance-types/g4/) you can find if you don't have an AMD GPU at hand. - -## Build the library - -First let's clone the library and rebase to the tested version: - -``` -git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git -cd composable_kernel/ -git checkout tutorial_hello_world -``` - -To make our lives easier we prepared [docker images](https://hub.docker.com/r/rocm/composable_kernel) with all the necessary dependencies. Pick the right image and create a container. In this tutorial we use "rocm/composable_kernel:ck_ub20.04_rocm5.3_release" image, it is based on Ubuntu 20.04, ROCm v5.3, compiler release version. 
- -If your current folder is ${HOME}, start the docker container with - -``` -docker run \ --it \ ---privileged \ ---group-add sudo \ --w /root/workspace \ --v ${HOME}:/root/workspace \ -rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ -/bin/bash -``` - -If your current folder is different from ${HOME}, adjust the line `-v ${HOME}:/root/workspace` to fit your folder structure. - -Inside the docker container current folder is "~/workspace", library path is "~/workspace/composable_kernel", navigate to the library - -``` -cd composable_kernel/ -``` - -Create and go to the "build" directory - -``` -mkdir build && cd build -``` - -In the previous section we talked about target GPU architecture. Once you decide which one is right for you, run cmake using the right GPU_TARGETS flag - -``` -cmake \ --D CMAKE_PREFIX_PATH=/opt/rocm \ --D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_CXX_FLAGS="-O3" \ --D CMAKE_BUILD_TYPE=Release \ --D BUILD_DEV=OFF \ --D GPU_TARGETS="gfx908;gfx90a;gfx1030" .. -``` - -If everything went well the cmake run will end up with: - -``` --- Configuring done --- Generating done --- Build files have been written to: "/root/workspace/composable_kernel/build" -``` - -Finally, we can build examples and tests - -``` -make -j examples tests -``` - -If everything is smooth, you'll see - -``` -Scanning dependencies of target tests -[100%] Built target tests -``` - -## Run examples and tests - -Examples are listed as test cases as well, so we can run all examples and tests with - -``` -ctest -``` - -You can check the list of all tests by running - -``` -ctest -N -``` - -We can also run them separately, here is a separate example execution. - -``` -./bin/example_gemm_xdl_fp16 1 1 1 -``` - -The arguments "1 1 1" mean that we want to run this example in the mode: verify results with CPU, initialize matrices with integers and benchmark the kernel execution. You can play around with these parameters and see how output and execution results change. 
- -If everything goes well and you have a device based on gfx908 or gfx90a architecture you should see something like - -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 1.10017 ms, 117.117 TFlops, 87.6854 GB/s, DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 -``` - -Meanwhile, running it on a gfx1030 device should result in - -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -DeviceGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> NumPrefetch: 1, LoopScheduler: Default, PipelineVersion: v1 does not support this problem -``` - -But don't panic, some of the operators are supported on gfx1030 architecture, so you can run a separate example like - -``` -./bin/example_gemm_dl_fp16 1 1 1 -``` - -and it should result in something nice similar to - -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {1, 4096} -b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2} -arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2} -arg.c_grid_desc_m_n_{ 3840, 4096} -launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 3.65695 ms, 35.234 TFlops, 26.3797 GB/s, DeviceGemmDl<256, 128, 128, 16, 2, 4, 4, 1> -``` - -Or we can run a separate test - -``` -ctest -R test_gemm_fp16 -``` - -If everything goes well you should see something like - -``` -Start 121: test_gemm_fp16 -1/1 Test #121: test_gemm_fp16 ................... 
Passed 51.81 sec - -100% tests passed, 0 tests failed out of 1 -``` - -## Summary - -In this tutorial we took the first look at the Composable Kernel library, built it on your system and ran some examples and tests. Stay tuned, in the next tutorial we will run kernels with different configs to find out the best one for your hardware and task. - -P.S.: Don't forget to switch out the cloud instance if you have launched one, you can find better ways to spend your money for sure! diff --git a/docs/Doxyfile b/docs/.doxygen/Doxyfile similarity index 99% rename from docs/Doxyfile rename to docs/.doxygen/Doxyfile index ca354598b23..1084f94c81b 100644 --- a/docs/Doxyfile +++ b/docs/.doxygen/Doxyfile @@ -51,7 +51,7 @@ PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. -PROJECT_LOGO = ./rocm.jpg +PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is @@ -775,10 +775,10 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ../include/ck/tensor_operation/gpu/grid \ - ../include/ck/tensor_operation/gpu/block \ - ../include/ck/tensor_operation/gpu/thread \ - ../library/include/ck/library/utility +INPUT = ../../include/ck/tensor_operation/gpu/grid \ + ../../include/ck/tensor_operation/gpu/block \ + ../../include/ck/tensor_operation/gpu/thread \ + ../../library/include/ck/library/utility # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. 
Doxygen uses diff --git a/docs/.sphinx/_toc.yml.in b/docs/.sphinx/_toc.yml.in new file mode 100644 index 00000000000..ff212488737 --- /dev/null +++ b/docs/.sphinx/_toc.yml.in @@ -0,0 +1 @@ +root: index diff --git a/docs/.sphinx/requirements.in b/docs/.sphinx/requirements.in new file mode 100644 index 00000000000..1905de6e6ca --- /dev/null +++ b/docs/.sphinx/requirements.in @@ -0,0 +1,2 @@ +rocm-docs-core==0.2.0 +sphinxcontrib-bibtex==2.5.0 diff --git a/docs/.sphinx/requirements.txt b/docs/.sphinx/requirements.txt new file mode 100644 index 00000000000..d1698b2855d --- /dev/null +++ b/docs/.sphinx/requirements.txt @@ -0,0 +1,283 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# pip-compile .sphinx/requirements.in +# +accessible-pygments==0.0.3 + # via pydata-sphinx-theme +alabaster==0.7.13 + # via sphinx +asttokens==2.2.1 + # via stack-data +attrs==22.2.0 + # via + # jsonschema + # jupyter-cache +babel==2.12.1 + # via + # pydata-sphinx-theme + # sphinx +backcall==0.2.0 + # via ipython +beautifulsoup4==4.11.2 + # via pydata-sphinx-theme +breathe==4.34.0 + # via rocm-docs-core +certifi==2022.12.7 + # via requests +cffi==1.15.1 + # via pynacl +charset-normalizer==3.1.0 + # via requests +click==8.1.3 + # via + # jupyter-cache + # sphinx-external-toc +comm==0.1.2 + # via ipykernel +debugpy==1.6.6 + # via ipykernel +decorator==5.1.1 + # via ipython +deprecated==1.2.13 + # via pygithub +docutils==0.16 + # via + # breathe + # myst-parser + # pybtex-docutils + # pydata-sphinx-theme + # rocm-docs-core + # sphinx + # sphinxcontrib-bibtex +executing==1.2.0 + # via stack-data +fastjsonschema==2.16.3 + # via nbformat +gitdb==4.0.10 + # via gitpython +gitpython==3.1.31 + # via rocm-docs-core +greenlet==2.0.2 + # via sqlalchemy +idna==3.4 + # via requests +imagesize==1.4.1 + # via sphinx +importlib-metadata==6.0.0 + # via + # jupyter-cache + # myst-nb +ipykernel==6.21.3 + # via myst-nb +ipython==8.11.0 + # via + # ipykernel + # 
myst-nb +jedi==0.18.2 + # via ipython +jinja2==3.1.2 + # via + # myst-parser + # sphinx +jsonschema==4.17.3 + # via nbformat +jupyter-cache==0.5.0 + # via myst-nb +jupyter-client==8.0.3 + # via + # ipykernel + # nbclient +jupyter-core==5.3.0 + # via + # ipykernel + # jupyter-client + # nbformat +latexcodec==2.0.1 + # via pybtex +linkify-it-py==1.0.3 + # via myst-parser +markdown-it-py==2.2.0 + # via + # mdit-py-plugins + # myst-parser +markupsafe==2.1.2 + # via jinja2 +matplotlib-inline==0.1.6 + # via + # ipykernel + # ipython +mdit-py-plugins==0.3.5 + # via myst-parser +mdurl==0.1.2 + # via markdown-it-py +myst-nb==0.17.1 + # via rocm-docs-core +myst-parser[linkify]==0.18.1 + # via + # myst-nb + # rocm-docs-core +nbclient==0.5.13 + # via + # jupyter-cache + # myst-nb +nbformat==5.7.3 + # via + # jupyter-cache + # myst-nb + # nbclient +nest-asyncio==1.5.6 + # via + # ipykernel + # nbclient +packaging==23.0 + # via + # ipykernel + # pydata-sphinx-theme + # sphinx +parso==0.8.3 + # via jedi +pexpect==4.8.0 + # via ipython +pickleshare==0.7.5 + # via ipython +platformdirs==3.1.1 + # via jupyter-core +prompt-toolkit==3.0.38 + # via ipython +psutil==5.9.4 + # via ipykernel +ptyprocess==0.7.0 + # via pexpect +pure-eval==0.2.2 + # via stack-data +pybtex==0.24.0 + # via + # pybtex-docutils + # sphinxcontrib-bibtex +pybtex-docutils==1.0.2 + # via sphinxcontrib-bibtex +pycparser==2.21 + # via cffi +pydata-sphinx-theme==0.13.1 + # via sphinx-book-theme +pygithub==1.57 + # via rocm-docs-core +pygments==2.14.0 + # via + # accessible-pygments + # ipython + # pydata-sphinx-theme + # sphinx +pyjwt==2.6.0 + # via pygithub +pynacl==1.5.0 + # via pygithub +pyrsistent==0.19.3 + # via jsonschema +python-dateutil==2.8.2 + # via jupyter-client +pyyaml==6.0 + # via + # jupyter-cache + # myst-nb + # myst-parser + # pybtex + # sphinx-external-toc +pyzmq==25.0.1 + # via + # ipykernel + # jupyter-client +requests==2.28.2 + # via + # pygithub + # sphinx +rocm-docs-core==0.2.0 + # via -r 
.sphinx/requirements.in +six==1.16.0 + # via + # asttokens + # latexcodec + # pybtex + # python-dateutil +smmap==5.0.0 + # via gitdb +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.4 + # via beautifulsoup4 +sphinx==4.3.1 + # via + # breathe + # myst-nb + # myst-parser + # pydata-sphinx-theme + # rocm-docs-core + # sphinx-book-theme + # sphinx-copybutton + # sphinx-design + # sphinx-external-toc + # sphinx-notfound-page + # sphinxcontrib-bibtex +sphinx-book-theme==1.0.0rc2 + # via rocm-docs-core +sphinx-copybutton==0.5.1 + # via rocm-docs-core +sphinx-design==0.3.0 + # via rocm-docs-core +sphinx-external-toc==0.3.1 + # via rocm-docs-core +sphinx-notfound-page==0.8.3 + # via rocm-docs-core +sphinxcontrib-applehelp==1.0.4 + # via sphinx +sphinxcontrib-bibtex==2.5.0 + # via -r .sphinx/requirements.in +sphinxcontrib-devhelp==1.0.2 + # via sphinx +sphinxcontrib-htmlhelp==2.0.1 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==1.0.3 + # via sphinx +sphinxcontrib-serializinghtml==1.1.5 + # via sphinx +sqlalchemy==1.4.46 + # via jupyter-cache +stack-data==0.6.2 + # via ipython +tabulate==0.9.0 + # via jupyter-cache +tornado==6.2 + # via + # ipykernel + # jupyter-client +traitlets==5.9.0 + # via + # comm + # ipykernel + # ipython + # jupyter-client + # jupyter-core + # matplotlib-inline + # nbclient + # nbformat +typing-extensions==4.5.0 + # via + # myst-nb + # myst-parser +uc-micro-py==1.0.1 + # via linkify-it-py +urllib3==1.26.15 + # via requests +wcwidth==0.2.6 + # via prompt-toolkit +wrapt==1.15.0 + # via deprecated +zipp==3.15.0 + # via importlib-metadata + +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/docs/source/API_Reference_Guide.rst b/docs/API_Reference_Guide.rst similarity index 98% rename from docs/source/API_Reference_Guide.rst rename to docs/API_Reference_Guide.rst index 3665049dd6f..b59c6e30269 100644 --- a/docs/source/API_Reference_Guide.rst +++ 
b/docs/API_Reference_Guide.rst @@ -49,4 +49,4 @@ used in the CK GPU implementation of Flashattention. .. doxygenstruct:: ck::ThreadwiseTensorSliceTransfer_StaticToStatic -.. bibliography:: \ No newline at end of file +.. bibliography:: diff --git a/docs/source/Contributors_Guide.rst b/docs/Contributors_Guide.rst similarity index 100% rename from docs/source/Contributors_Guide.rst rename to docs/Contributors_Guide.rst diff --git a/docs/source/Supported_Primitives_Guide.rst b/docs/Supported_Primitives_Guide.rst similarity index 99% rename from docs/source/Supported_Primitives_Guide.rst rename to docs/Supported_Primitives_Guide.rst index 066e024bc0b..4c3adf67d71 100644 --- a/docs/source/Supported_Primitives_Guide.rst +++ b/docs/Supported_Primitives_Guide.rst @@ -72,4 +72,4 @@ Else if :math:`j>1`, \tilde{Y}_{ij} &= \diag(z^{new}_{i})^{-1} \exp(\tilde{m}_{ij} - m^{new}_i ) \tilde{P}_{ij} \\ z_i &= z^{new}_i \\ m_i &= m^{new}_i \\ - \end{align} \ No newline at end of file + \end{align} diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000000..3ec81ee9df9 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,25 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +from rocm_docs import ROCmDocs + +docs_core = ROCmDocs("Composable Kernel Documentation") +docs_core.run_doxygen() +docs_core.setup() + +mathjax3_config = { +'tex': { + 'macros': { + 'diag': '\\operatorname{diag}', + } + } +} + +for sphinx_var in ROCmDocs.SPHINX_VARS: + globals()[sphinx_var] = getattr(docs_core, sphinx_var) + +extensions += ['sphinxcontrib.bibtex'] +bibtex_bibfiles = ['refs.bib'] diff --git a/doc/image/ck_component.png b/docs/data/ck_component.png similarity index 100% rename from doc/image/ck_component.png rename to docs/data/ck_component.png diff --git a/doc/image/ck_layer.png b/docs/data/ck_layer.png similarity index 100% rename from doc/image/ck_layer.png rename to docs/data/ck_layer.png diff --git a/docs/source/dockerhub.rst b/docs/dockerhub.rst similarity index 100% rename from docs/source/dockerhub.rst rename to docs/dockerhub.rst diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000000..f4e66c1b51f --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,52 @@ +============================ +Composable Kernel User Guide +============================ + +------------ +Introduction +------------ + +This document contains instructions for installing, using, and contributing to Composable Kernel (CK). + +----------- +Methodology +----------- + +The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc., through general-purpose kernel languages, like HIP C++. + +CK utilizes two concepts to achieve performance portability and code maintainability: + +* A tile-based programming model +* Algorithm complexity reduction for complex ML operators, using an innovative technique we call "Tensor Coordinate Transformation". + +.. 
image:: data/ck_component.png + :alt: CK Components + +-------------- +Code Structure +-------------- + +The current CK library is structured into 4 layers: + +* "Templated Tile Operators" layer +* "Templated Kernel and Invoker" layer +* "Instantiated Kernel and Invoker" layer +* "Client API" layer + +.. image:: data/ck_layer.png + :alt: CK Layers + +Documentation Roadmap +^^^^^^^^^^^^^^^^^^^^^ +The following is a list of CK documents in the suggested reading order: + +.. toctree:: + :maxdepth: 5 + :caption: Contents: + :numbered: + + tutorial_hello_world + dockerhub + Supported_Primitives_Guide + API_Reference_Guide + Contributors_Guide diff --git a/docs/source/refs.bib b/docs/refs.bib similarity index 100% rename from docs/source/refs.bib rename to docs/refs.bib diff --git a/docs/run_doc.sh b/docs/run_doc.sh deleted file mode 100755 index 58b0936c678..00000000000 --- a/docs/run_doc.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -eu - -# Make this directory the PWD -cd "$(dirname "${BASH_SOURCE[0]}")" - -# Build doxygen info -bash run_doxygen.sh - -# Build sphinx docs -cd source -make clean -make -e SPHINXOPTS="-t html" html -make latexpdf diff --git a/docs/run_doxygen.sh b/docs/run_doxygen.sh deleted file mode 100755 index f66c038c1bb..00000000000 --- a/docs/run_doxygen.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -set -eu - -# Make this directory the PWD -cd "$(dirname "${BASH_SOURCE[0]}")" - -# Build the doxygen info -rm -rf docBin -doxygen Doxyfile diff --git a/docs/source/Disclaimer.rst b/docs/source/Disclaimer.rst deleted file mode 100644 index 5dcff748c87..00000000000 --- a/docs/source/Disclaimer.rst +++ /dev/null @@ -1,13 +0,0 @@ -************ -Disclaimer -************ -------------------------------- -AMD's standard legal Disclaimer -------------------------------- - -The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions, and typographical errors. 
The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. Any computer system has risks of security vulnerabilities that cannot be completely prevented or mitigated. AMD assumes no obligation to update or otherwise correct or revise this information. However, AMD reserves the right to revise this information and to make changes from time to time to the content hereof without obligation of AMD to notify any person of such revisions or changes. THIS INFORMATION IS PROVIDED 'AS IS." AMD MAKES NO REPRESENTATIONS OR WARRANTIES WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. AMD, the AMD Arrow logo, Radeon, Ryzen, Epyc, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. Google(R) is a registered trademark of Google LLC. PCIe(R) is a registered trademark of PCI-SIG Corporation. Linux(R) is the registered trademark of Linus Torvalds in the U.S. and other countries. Ubuntu(R) and the Ubuntu logo are registered trademarks of Canonical Ltd. 
Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. (C)2023 Advanced Micro Devices, Inc. All rights reserved. - ----------------------- -Third Party Disclaimer ----------------------- -Third-party content is licensed to you directly by the third party that owns the content and is not licensed to you by AMD. ALL LINKED THIRD-PARTY CONTENT IS PROVIDED "AS IS" WITHOUT A WARRANTY OF ANY KIND. USE OF SUCH THIRD-PARTY CONTENT IS DONE AT YOUR SOLE DISCRETION AND UNDER NO CIRCUMSTANCES WILL AMD BE LIABLE TO YOU FOR ANY THIRD-PARTY CONTENT. YOU ASSUME ALL RISK AND ARE SOLELY RESPONSIBLE FOR ANY DAMAGES THAT MAY ARISE FROM YOUR USE OF THIRD-PARTY CONTENT. diff --git a/docs/source/Linux_Install_Guide.rst b/docs/source/Linux_Install_Guide.rst deleted file mode 100644 index 0e16bb6a986..00000000000 --- a/docs/source/Linux_Install_Guide.rst +++ /dev/null @@ -1,15 +0,0 @@ -===================== -Getting Started Guide -===================== - ------------- -Introduction ------------- - -This document contains instructions for installing, using, and contributing to Composable Kernel (CK). - -Documentation Roadmap -^^^^^^^^^^^^^^^^^^^^^ -The following is a list of CK documents in the suggested reading order: - -[TODO] \ No newline at end of file diff --git a/docs/source/Makefile b/docs/source/Makefile deleted file mode 100644 index bde66ebc258..00000000000 --- a/docs/source/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SPHINXPROJ = CK -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index 65ac187034b..00000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,219 +0,0 @@ -"""Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop- - ies of the Software, and to permit persons to whom the Software is furnished - to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM- - PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS - FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR - COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE- - CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -""" - -# -*- coding: utf-8 -*- -# -# Composable Kernel (CK) docuumentation build configuration file, based on -# rocBLAS documentation build configuration file, created by -# sphinx-quickstart on Mon Jan 8 16:34:42 2018. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. 
- -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - -import os -import sys -import subprocess - -read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' - -if read_the_docs_build: - subprocess.call('../run_doxygen.sh') - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = ['sphinx.ext.mathjax', 'breathe', 'sphinxcontrib.bibtex'] - -breathe_projects = { "CK": "../docBin/xml" } -breathe_default_project = "CK" - -bibtex_bibfiles = ['refs.bib'] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'Composable Kernel (CK)' -copyright = u'2018-2023, Advanced Micro Devices' -author = u'Advanced Micro Devices' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -#version = u'0.8' -# The full version, including alpha/beta/rc tags. -#release = u'0.8' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. 
-# Usually you set "language" from the command line for these cases. -language = 'en' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -# html_theme = 'alabaster' - -#if read_the_docs_build: -# html_theme = 'default' -#else: -import sphinx_rtd_theme -html_theme = "sphinx_rtd_theme" -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] -html_logo = "rocm_logo.png" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -html_theme_options = { - 'logo_only': True, - 'display_version': True -} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. 
-# -# This is required for the alabaster theme -# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars -# html_sidebars = { -# '**': [ -# 'relations.html', # needs 'show_related': True theme option to display -# 'searchbox.html', -# ] -# } - -mathjax3_config = { -'tex': { - 'macros': { - 'diag': '\\operatorname{diag}', - } - } -} - -# -- Options for HTMLHelp output ------------------------------------------ - -# Output file base name for HTML help builder. -htmlhelp_basename = 'CKdoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - 'preamble': r''' -\setcounter{tocdepth}{5} -\newcommand{\diag}{\operatorname{diag}} -''', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'CK.tex', u'Composabl Kernel (CK) Documentation', - u'Advanced Micro Devices', 'manual'), -] - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'ck', u'Composable Kernel (CK) Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. 
List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'CK', u'Composable Kernel (CK) Documentation', - author, 'CK', 'Composable Kernel for AMD ROCm', - 'Miscellaneous'), -] diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 68adf58afd8..00000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,16 +0,0 @@ -============================ -Composable Kernel User Guide -============================ - -.. toctree:: - :maxdepth: 5 - :caption: Contents: - :numbered: - - Linux_Install_Guide - tutorial_hello_world - dockerhub - Supported_Primitives_Guide - API_Reference_Guide - Contributors_Guide - Disclaimer \ No newline at end of file diff --git a/docs/source/rocm_logo.png b/docs/source/rocm_logo.png deleted file mode 100644 index ee09dd09c71e3de1081f0f4c8b67967f79d768fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 355437 zcmV)XK&`)tP)Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!vFvd!vV){sAK>D|D{PpK~#8NZ2fh3 z9a+An3+D7WmCR{IF*7r>#mvlLvBk{H%#0RWvP_bhnbJ8QeLa~Y z24dbj6c;>#alWND}L3mVMgIDu&*cuta_&^_eS=Ej82o4H_ zt$`uz<&_Yop^YF3DTGT)B1}mde$qz~qpXa0c}2t>KZI5@16&LZ#aSPJJj%+!qL(k$ zgG2DJwgzV#>(G{;i>9(lbhmY4cxV9QV?&r39mW*bhx>Za+17%lrUumT`DOfG1%-vk z%*;eeaxx;rBM|5x03RP8xVX5$*wh4y{BBaxGB|Sd7>*n{f)giBproi6Cmfw%Wn~2m zOG{YV*ud1<23jVjkkQh?VP$pfS5m`4H7y*{(8WJeJsfbhm9 zL^ZV_y15l`9i2$%>PCEL7h>8w5Y^U(NLEyPyI3)u9f<4hL=wM$3jgP6L;c7c9YWUl z2(l)|kUcqp+?iPvEi9q@^g3!+w$Z${gQDRj_~+EYJ~j`oDP{0VuR?rPKl;wx#@5q6 zVCn9+7`b>KgBNaN{rL|#|HB`!`1B+i%`{pNeDy{(OJe-P`;cW-LryC1R6TH}4EZ^(wYp#F& z3hOVv#?rGFn19T_{pcCSA3VXxy~h~5_Xs_A@1yh19kkuNiI(fv(Q@q?TCQG0^OdV& z3G4rOyXo>3H0@qS)6Om$FYTb={CTuox`c);{!cc}qwCytw65-=ZuT5vDtloanFE`s zT-e6s!!fBCZs`^9$gYB0W;tBaiuk|J!-=>|*hVD7EFcnk?*7nr_Jq3q2`HLdLDtY1 z@+=K=OXykIK+nPo_HLdCk4=On+nl+DC2VZ0VPRn|uA5ofLrzs6UmQ}x7su4GS56Pd 
zbu6H2>jG^TZ?;o+95ZmjK}}a2)b)XcT?AAEGN2z@3e}K2?6Z!be5B;Z3c#ldL3pRLtq#4t&8V&`#i`tMG!T3}%{7>4uR%|BKIX@} zxuJGqX?+Ek_|H2%zks{fZ(?KZ3`R!AaGw9Xix)3om5xc2rh3psKPS z?ruIvIhlb9!f9w|7)|9hsLd}y?#Xl{M#UhSuuM-+M?i28OwCQO7CPL4forZ&>BXkY)Ag3q~X&G4p?Jsm!z?-u=5YzI zN=kvv$u!vV^EhVb!i61=2mfb2>|_G?n}iT{LWMOnBC?slYi&aee?I|uiQPR&V8wQJ z{vCEv9RwgNx}zPjT>|(zkl5RWWWw)ce;-Z`4&Wp^&g{u)WQ_UR-Om`keGm1U zTc|m+f!eK2G@Lt&Cc>}d(glp&y^V!u&j_XOFh$S}-MoXo>$fmSs0`kFfRTrfG4}W= zCLTW%p*Q#R#armT`oBR>ECGDqd?(flfwuaM0JZ;T3HPsm_Y9qulF_WULo)Zy^A|&WCzf8@giEc&ZCJP%Fy{csGZzGXl^S^{L-Q677yd#Y}hB2 z!!5g>RSU#6Y$W|7chQM8m`<47x5p(6D!bvZXC#jZ7h-tp^1{Pv6!a zTBhbOWCt7^9ShHZAea$?LK_u;XJKgpGXhaV+Xx4bE90X*GI)Ql9KMiH$1x2PXxY2L zjB;Y=5eQ`)4@et%;FysQ=^XMNwrjnrV7XZH+V!njZ~tWTGE4OEb7~ z>pFJWNX9$caQ{3_kDyywoI__<3t=^it&P(-yRn4UCc?6-4Xw=$2n_Itr-ui^Xe%L+ z;V?8ap~V!Supl2{p&@X0ut#%#9%kEHFhAUj;ek#Rlonx>4L<)=5t=F+aWXaur!q3p z$oH$Kft{k+mFE0_Vh^fDa7(Id;-In`_7i*p^bTpU z7d0?}thqH*9Zo>Q*$tZRUeNOmfFb{$NkkMZViRDQm<*egG}!U8*t2tTIz_w6FM>Ov z=UY*Qz}i}bG&CTTz!NR6rBwvpua=k8(~Cq_Y*!Z{N6-;|V(~jibqcok27K`}z67xd z%3i*vuOI276Ewh4n%^7>XI4=(xrWq^N%-Y75_p7NRtdtH6cVbNki-9L?c`~UT)2k$ zTTjrxeH9&NE@JZ5L(D#WhSqbJP_eRxsxzCY+}uR<*)0)#g7x)XzlOz^FR}TjGdGU%bNUufM~>i*GUY=p}~VLa&QpZl{^H-nd2Z z-6Z%3zbn@XIKlF0w!aqdYvz4TyH^Rl%cAKuUOY$epGEuG3+OttgQ2sxkybNIz@$OP zB@spe0`Mx~Thxhw@^1JPwctcr8BD@XLc=YX0CI=Aojdeg{9r^N>7MX}s?`Lr8Iu? 
zg%m!f@f}w;fdSvwF*FJeVX-h}YSXaygS@pr6x|b{9a#*^>~>gWcS1G10{d-}@R3Fk zK2-F@=gI;2f@b%bY9KySBka^dxD3M=Dv@|+gblQdKp7nDLoFj#YfleqnwwD5Sc}%q z7PR%WV`c3$Ze88Q`I$*9R2JdxSRZyKhOjcygR+U|ySRq4Gs9S)97H9JF_bpt zW@`gqXIDhi9IUw!>gwvEqM#UQv9a)Ru}8RvJ=QwvF;bL?^`TbuHJ2kQI)u?K12co; ze6M+oa${+%EJa0CF|zYAP*z%kYDUEbnn)~7@8ro;czAljmOY6(qqGa{#@EUk&Z?@2 zr{&}s8luPE9-RjI=+V;@tfabg<$ACe;M5uE~AaWD?7b{VpjRaCQ8;eP<3`2wF2}mY@_Sy6^z|~fEhw< z;QDQh+=|LLp=|C5ZF_fU z+B!qYl(CkbsQ^5AJtL@7K9uzgplxaf4;r3Na0pD9M$E|q3oCOPo+)FkC3FqVa70oS zAATu`cR!cHdtb`p)5FS;)G&pyvo~DAX^7l5U89m<8x#japE#)dra?2J1V*XNFgw)) zz2r8?_!QwQlX!fh9)izRL-3_qD8A4L!{-{|_*^rR%P8I#gLf82MlsOQivHei401yp z8lOT(?;vXG8fYCQ;(n%!{O3I0-NE(o5iDn=U?DjQpkHkQbym3i4)!_ia|c~Rad^0vqGxk2oXwIeIi z2l25X7#!-ueg1$~&TL_GVFst?M){c<(a66sJ~D!wjBF$)B*V`y0D(av(A3m|9eb1$ z)MhiTI}voQ2D)&enI-7!U_2}slWr~;GB-hkiabgbr4fGk0D|}KMb5!v*rX9Y3X8&q z{RvzO3CHEaA~a{G60D&}i;qKfej)m~Vc%dpdUgE@{&?d$=29}z#Gb9m$`qI4BXOVK z@lj+9?#0IAIZgJ5?q)n6>&8TFG3pC*G1StH>HaB<4NYKtY?9w`7(IQxBJ>(sno&bz zDyynS9{+assa&MeuwwXr!2!YW_wj?9t1C>*OrfZz3<+5&96fd%M~@zZhK2?zDk{YI zD?rcO+#I&{cI-`UMd&Fqh96N1fBprA=>_GdA~w0skiTc z4?QvB@;+9K06+1z072dsUf(ys8bsRg1Tu!FaB6r43C(?ouI@nf)H23)cCh{82^MeP z!1&HZtlYbg3xvbvAAiO<0%Bl?X0x%4%B3~rP0t~J{xk}g*N}6185P^xs3!CpE}lm# z-=}@&GNbMlG@reQ!JGF9y}Pu%d+6i(;2qlEgC`gx_yp()wkO2g+2>zlmIgLQ@XZr^ zzt(~PKLKt6%vi$uKbH9TE3UIn)A)p1WE`G<^%^rTzsA&y7nmUUMhU$M+TJXUa)HlX z`R=zg!{5^UzQrgbZa3lFPAhGrkqXdjW+eYR_z1XvgkGCaR|vjc{!jQl8wkA`0>AG3 zHkui4`?jy6d|(MSff+DzONOCmDvSb8!8MZ|TFn5$YkLt~-U6STGT4MCK+DMsDi)5A z)-{KWo;g&^?4fFI3k6DmtiCa1bq%4YZv;(K3&^Oc!<6kZA|VkDlne{5o3it^w6=hi zjTH)88KB8PWJ* zaRkp!k6@yy6brM%SUt0ZbK6^3TU|msO|ZVD9HaBY7@inJF- zi|(E{A0CEng85-tDQ+@uc2}1mJ~0Ljgxws!|Ki*NZtPycZ`nBi$G6|&>EaUVJUmfi zsDn`_J6vNde3hDkXL-4JLNndYOT~0vGHNnnQC*UYp7t&b6L=#NQ|KEWLI2<&x@pyI z9qni&^s4#4EumrNlBg*^@y;@H_)T$IXXH*-@pKhDoT))k;1X#5;%0|FwD%% zP|w#22RVdjYbErej0{$6T)OOGWi>RhUrqsE$thyrThlwpcq?sU2?fS}0eaeQUeNXS z6QO4o5e-YmeA|>amgkuHCf1&y<+(80dK8z5`}2Z@eR%|RSQCNQ)Qkw0V0y7^LxSlg 
z6L!Kry?6plu(w~KC)At$dT)RiB^=~vZ5I!8M6|Yw2RkC$8IgN?kiht#!p(|GG=g+-lET-iNB>-)n&z<4@ zlZ4*b^A{LptR3Os86jZDh5dPa&N895OpBc-^u}qjUAG?)c=yqI^B(Js?KKf@&8(Jd zw+K5z@G5T;fP&=-!28wo1n5;U-c~T?*KM3b-Qroq)8;Jw(_rA12z`%a==-O`Cb5L3 z*M^YFHUt({!8;=l_Tka6_78!Hn=kZec(QtCIIf`&MMDcH7??p)Qx9?i^z;m&##k%I z_U9202>-Bfn6o1`XZka@wuGsL39RjGpiMJ8D4~Q8_sHO%K9j~jeJ+Fd_OiW7XhKHQ z0=m|2Fm?8aonIu}qff#=B?sQAB`^suhLT?)j(g=oF02xoNgYs+X~i*@0(_;lIW+r;kQdca3ucxC8jkQhaBlJePhR|8pir%J9%=Hdq zhcRQFjeRpY0V_fN7^D@AIeTK-)f?kZt{CD*xfL3QHEuxb?!MRzioh(RQd6)$Vy#W! zt|^ba5Ko*6@u2;w!plMf`Qh%^ZY;q!1lqTo^H?8j!Iqkr`90e=4aC7x|D9t8!X+{6fzeE&;+LHOoW0bW*C;$mJVrW3=_mlA=x zjCeFv7Nft5od`Ev;b6?@=m-Y-`q4umv@+T@(3YxdYf)TQhHN3)66Wy)Pk5+s<12YZ z?4{`)AoLFW1N7ARKhSpfhMtc<41+>p8WAHx@7H~KLbMeuPk^2fZ9PlM3B4)=)z*s# zFob9;nBK3@`^SBHzaF`b{`DXRO8{D=06MVyj zKP8LHNE;nT8atZ|cHEisi^yGBM$yJ9$~HGpeU`uD`HLuD-aum4AX57$P_c9tJy-9e zgWwyu^B8>u%;3F;;z5nkhmSD+nAZ2~C8l4z#_ZPw8KLyQt9e@7uO-|Tp8IPpy!u`& z@qX46p*PMbJ3{CU3(zB2hZwnsSrg9$xbl6z{(->#24jR?H(}g%`yrYc!-aYSzB>dS zqpna_ZxMKJ_VEe(^L~Y1b|LF@k{PR}|`1mk8XL&XSAandC5WOYAQo#b1hmLn?>8}?d(HkX%%|A2hiW%i_?9h*zF&| z9X9eI4=+snhGID^9s@33s5LQT)U(8;w38zEPJ4Od49UJ07cIu&bQ5i)8E9ZVCl&YV z%Fz+zMay$Wb!IG9TB>nld;k|l`>;JTip9wh%rHh@+1$d_%}p$f4PpDt3Wml8F*rJk z+R6r0u-8e5jz?;0DheygkX4umQ*Q8~fq}HDVoVPYpr#}rWhWDGiB>Ta!_R5qfB_v- z46!$A)6~U`iwB0>JTM#)il@^fn5!?v<@s^krd?iLTSEt<+5#=%+T~k#^57YM{O$(= z?>-(qxsRW}{TkQj7tjzGf@+7i5I*01@vj}hKKyWqT z+rUot@;$U&x`p1G57A4wbP*`Sgx=5t;fU@dM$Z?Rdj5)$^cxXkGyhNm#Qt&p*W1&4 zu28=|CKl~a0Nw;+@+hGKBiAF3`GwbismK}KQg-a+ucMhdn+bCMvK;Fa>QrgB4UeW=# zlrm_zMM2Lm5%!6Bh^^~FM&}?-Hnk#-NhBmIlQHuo0+UnW92x}!cAAQYW{}X(#W7Va zNT_Ksmg+)|C8I7_UJ!zr25p=iVZ_vDZe<|?&(hWsy2i#hCaZ>z_R8Wngxx>yQNafX zHSmR`9u6y-;IM)*4#*f{zpN3Cs9HhJ$OTH4o{+Qf#bLt`95PSDA;(Ne_?ALBrVW~j zy^!~>#{tVz1YIf)5^{%((s9@{69-K)vEPVgkcoHNa$=Dc;*7rfVl-A1p^J^Or<)ei z(~H8Q5|mZdps#lb^Fvd(I5LCn%37=>r(-294gDSgXmWHxfxbS*1N^ZV9E1(p&qiPn z&d0>yQbsD4(h|_*=Yl$KC+t)e;QPLIY~*L6voHg7g<0sUEWuny3kK_}G272*T~~vp zfdO2bnZvcoY2064!S&^3nr9~=T!#3l1hh7^p_Ut7w{TQ2F&aYGMr2qRsu)ja 
zCq^+pHHx{OPE65IdiXh~oqREC=ZQHFUo;t;VU&^f2LkGM=Qi-)zkZ07=|SwAU&l93 zX$@=ZSYZ^p%ZC2s)&sn}`w+i-{R%&Q^8(*}_Y8NhU&8j-80teJP-AP4-Lwom%qzxk zJ9==Xx(YR3Zs-dO#CS>^22W+6EjtsVH4Qj7HjVAUVVow2=Lx47=07l^m zuZg)COxTO6>gnOAk`ngFzJcDC!oEHBzK3)TMCd76JG_COmoE(bgJDdwG>_tY#U+XR z^L~Y%Z~#Myw(dn`@Gh%@AKSp+p-0<`5DsFnvy5(PK`a3lCxDL?!}TbBKfwaU2u#3- zzqL8R?4tP`q2566&GQ84g|-l+?JbDvZbNKeC*u3M2(C_Kjt(+@u3_xX4J$TjGw>;CfKyf}%9pm$N$_=Cy@#%w_t8#!>bv&<{rB$^I*%~+KqW|E*2U?M*pnMRf8>*t@!l2QQ3FO<==FYiwrBBw`6A zO>KN}NKUl8clW8`gF`y_=$J0PkT$@V1m73ObnwX`ZG3!47oSU-V6UPz_NrRrb0u4r zC%(`O!(Pi2M%+@U#B@M2p$|$yEs$_5#Zikq95&8kW#WkWDIB)Q!9mkgIB1-McP4U@ z(VZTP`L;TAGSaoxHDIuR@XgW1fqqmrG@z!T39XE}JtZ}$qtQ$!rDG~433U$6NYc>2 zL{tRMP+@nY<1lOEgqzU`xR-GX7t+#krr;E+LcCC(6okAOZ;X~@Vz4wDH3VM;jcBy5 z7bCsB=18b_O!O5f~^zpS} zk>SWXl|=)rL@*Uv82o#2=R970`y9#PjGsP)Pe>fbPXu7x#UD3o>+qD%8H$O}4v z=69ZD$1)&5uc89w#YN&!p3JN)B&8%HEIb?mG*@4KCl_aD7#kTuMOhiLGBS{oloStQ zDAVG)2!3JUvIsmzT4A_~DI9m9a+{zC!59m|H`^#sO*~ z^xo{xGYN}=1r6FNK3RmGeR}pkK(CmlS4!wri_iLv} zGK|FWQKU>y()bo|YGDz@%j>9IIg7G|O}OP&!Y;LtAn!x##oLU8HwnBu=wN*8ynTnj zxsQQIjJgEU=;Id{e<~Q*D@;Bo#NO7_3t|7M;-oa;i3u|{9e z;NAjn;K@t$^Syf>JVn>N$LJ*Vgk{(LCvR@wBk*oMAn@*^@wx!L2XAikb2busjWoUg z9rVO#d+riy&R#${JB1QP+rou)TSIe3S++-E z^nq~j!ramv1}3yTIVF6sM-uPtRli%Y4)mPbPRU2F+#SWnt&@VDHCILdqpK`<6<$JorTTxbZoPi*yTodA*T?NDe0&T48~

-ZHbVL$vR5#)2 z!YuyBlY9728r`>-cktrzUEF?j3ojX2FJHZkTi34Q_J#9UA(;O5;4bSH{(A2k{&?de zes}94u5Zp_W~c+DX=(5^w}jd8!|+s7fG__K0h;RY)z(A;t*qvRGsYq!G0vzukyn73 z${H+lqrO06IX}>i)9r0E+%63BdpDJqqBJiTd8e|Go}MO-I}GCY@uNL^d3lKkF%0zd zp+vKjm6C?!aS0qgd<6RX1{h`J7akXest9E!4EVIMh5>C)o^kkq94${qR)pT?Z=rXX zjZ?ea4o;S{YpzJr}2p%+1@M)3V3gaauA zX#B_!lBXt+G&z9`#@*7@O%%%Eo&RON~$ghP{b_L3o&M^*N6ZemG+-7{dag(v~ zE_(0Z!yx1H@WVIY8{^*<;P)#4$A$U_sEJ_X?a^mn|Gh>CH*qNzfydVl@qPXgc-@3t z2Z7gq=ONlyt#=-X)pGj*>aX5G?d4mj*}aJdt~U{Wjn`;;1YZ-a^bPcG5PB@Z@^;=@ z9)VZ2c@7nf^To8h{MlvJ3jdFb$R1ln-q_c42&SHL%Y*6fV>`Ir=!nks|N#XJ47X=(DK}1!W1Gv&)U`+rnEd^ zsmivw@0bkU`$~#%Q^!Zg2{vgnd?;yvcaLb|pY|%_H+$9b{$722c!2jGvc`wU?eT$( zJwBFq#vVmKe4!MCJ%)sydkK^x+MpEP0eQb>D0ntN#k~Ovj+KzKDZ(+U0vr>TLRlB% zo%6FJnCxssdtDW#$Hvjs-i=1~&TXyK+RP*rR~4aSpc5V4ouZXpoS48$c{v9Be9%j) zskbmj=Ls7uh6Z3aIT6#?8P;Lc%#1`wKT&4 zVV~cXi)ia;Wh7m|)W9HGs%vob)>S-ucn8yc-MGK8ju(uqf4y@XFA1*~cdp?1wQIP3 z{sLZp|1IvndVz0#_yNzJKEcaJ_wfAcMXWYg<8PNY@t55-{7le29O=L>m)3E0VFXPj zrx51l4P$i;XvrRfnTj-QmF3{7sscZCEu1p9#auun&c~%;J~kN(S$S9}t-uV8Zlj|C z%Wbuotf|H{;V?{FYAh^55hHL;W;#xuOhz=#Q8>fI%hw04Zf>w;WYyKyfufu|WF(~^ z$!IHaOoFg;5=~Etw!%;q0ea>F{0Kc=_M|duYS=3+&A1>VLhmyLB{AL}VQ>78(9`x5 zhO30aG&~xXF$u6vNcsCdJqLDNj^feVLfW1%T%|&cxPgRtP-QiOt7;HN@I}zfBkLN) z64t}{y#yc%j|Z^?=>69K<~AgDbs(9hozmTfWPXQ4M*cYAU7FTh7^G(p`{~2PfA-Cl5GD1^jrgi;TswbV=HU% zh^?uGnK(?v%-k3zW+spllE(oFe6UvzA05`g`$vuN-f?5RdrS}i?SLBoX`d>gr$Oi$ z()4I|hwSm*F(Nqv$l#=_S{p$Be;I!GVVRRiLv2sTs^yi zU!J|dFE3u;#lu^8_3S=gKYoITckbiqt5F6i$xa9r9 zQW(TsG{20db35V?jJ(Ntqz#M_to?)@jYY6h8fGXhGFXVtU7d(zTaKqKCesu%rxuVi zwL}PP;biYLqM8O7ZwF|eBWS&J2hEoVkPDa5e*FeIZd^yt?OPbQdmlpt(g@)+Mo^8i z#t5zvp$NEPp%_=+?4u*h2)ceo;(kI>*jLy4_!)YH0~NggO}#+xqvz=53WRyQ_ifPL6Hw~V{j&*Rax%XqkZ2Y>$N7d*dv z4=*3y!w=7&<4-^S7Qg@QCp>4PTi@8iC7RziY|u@KahRoT{=77fU#_p`^t<)S)0R z|7K7!El-9JOJ{UXVuUa?HG2a*ma&;BEbZ-}rmrXN&-;RA_=U6_K9k|PyebZ<>qA1< z42ousP_l4_y1gf~T?3%!6-L{Ofq6tSY~nIulb8j&lR0om%Y|b)BW`9M+!=9w3QOTv 
zQVxGwc|dvP-)%3nit+Vrg;h5oqNWjHl>}c!JtC`{5Zlm!#Ad>-tq*CP1IQq#GW&k)cFMbP#lX?0N@gdZ!it(*6E^M0YaxZWv3Qi!}Y>l-vkL5$oXg7GmnhX|*?jgBC8a2Tn4e)_}$@)>Wl$5si;8DtKv zB71n1U^s_?E3~b%moa?(4kqv2!`$OXnBa2i!6S?_c23jG#_v7F(CtT%3hyN=woOQ^hX3-!BwQ>iGPC zEIv7)C_?Z3y^8qF9wod>yL|9r?0|9sdC?;W)!@a*tDRy{W5(#4*%`Q*nTF>jCAgZFh8Awv?Jlm^;`=Y1%*Lc| z7`|l#|Bi8Xj)vC7=rTAqfS!R$#@ki=&wqJL(4E71_HK`FJ;9%U`Yj#`5WKpJ*Z1$? zuiyQE%V*Ew#p_oz!+Y4-SivT(_P1wN@SpcC4LWQ@!sn8b2-(PVzv0GT{y(_>=l{U@AOC<`|MG8m{(k}Y{^NgP>xW+m!WS67`y7)Gzs8UNKblarWWuS*+TIx8$PF_+|mmRDcjwxtBP0tj*!4Yuw@Dj)2nK9A|gOJUI18nBj(9$u+ z!J`WJaE~mbts*|yqk#9nlE=GW%D*|FaYz#%A2Sf+?r)Bm;-8LM;N9aiJxP0fB<+aL z}LS(&wN5suuHHcm^pDXRM={Vhgqi7SanSMP)F@^aXL0W0 zdECE#6ZbD)!`t z^{g~J%3zh3;P*FoaB+JDvkMcLm~O|_-Ddpt_fz;^|Gti2eq5ztHlnw;3s*00VvBIR zxwVRgzD@$|BJNzdh?VoJxc>YWZa%t(7f)~FcTex)=O?#u@9qWMy1k9l6O8pk?da*M zLq>KI3d(7n8>`q|SwmTTGW^&Of*3QyOpOr729c+$gElh@EVz1L(ajUf!6BF-%*KO* zF-)*E1_q*r;42Yki3tiuOh6#Qf`Z`BrI&ETRsi0K6L7G#g+2kMB(K0oD=R|pFl|p$ zTNm{-7khh0S{@q}Y`zfN~ld*e5NGPx#!=BxUfKlyFwY8|X=Dn?c^lp0;-a zYBn@ICz_sn81%iPU>uYv4pp#<$%I|f8|a-#&xdPf0X$9>!8@)RsW-d1o^D}oqVh1eP*p8wl`WTumnxdno619{^!NbVj)SY4|K zJzoOZucjV;b#?HsuV>5^4wf)ZvvW^iJ5L-OMau9bvS>MJBd1Yw<_2b;{|Rf~|2yWN z|Ahv57gftwP`_~v@aShR+QMjT9#pFemgZtc4kWgm!5SLlaBLD{0}=uO#r^7t(l- zpnLx-1uhlv8`gU?y^oLRvAvq%6DdoYofZE1s1>1ShYzHj@QIu&zEJSMmrB0)Qk~G# zPllvfx>ypXX^^(cfs#ul)P38a?Ar+i?+&Q>_rfG<1a`4QaE$DPb7&*pnJ$RJg|<_e zC?`;AvI)F8^mevlu(Ja_+^7bc+R;(dg!aZx^s|9>bkh9V+HkXX0J{ZcxR87btI;V~ zOv}VhUoTb&w5j^|UzjYhWZV-Tskh^WIjIlYGkBh;Xj8u$7hN9cs3+rLAxJ&EUijBe!BHDq6P`aD#DD&64!{4Q54UbKqpB_)Wdv1MV+Bf6qLJn2fszn^OjH!( z-kH-F>}|l@(g@D&tl{pxD|mGO23|gYgnQ5K;r7!zSYDgM$XE~KemzR-iqJLGgL@C| z;V~O#D>wcyXBPz7SR>rp9C5}5$kWq9GaJV;fw$!ofK@+#%=`IafH3P~bZrg|MJYj; z=It%M)|)Zb&Dj~wv{+#}E@6U8CBju?-)#JcXjoRjJAvqY*2#f zy*UMqjf()3R#nAbDJgt>^cX&skp3^|sWRSbnL^gUmf&-QiiI0A9sHo<8UlT>5N;XCGn(izCcJ8=6^+ zqK!f|BbX&D#dSUwBox;J2)ZDqf?zHKYikfe<2%LVP{$3tWNwKOcN~#To$#l53efYZ zs)cV&4SZ{B#n%ZSjqIddGR?%&_|nFv2)-%g5hP#!~R1tEO1m3Sze)R^buG~Pw(} 
zBp7yLTQ_1F+Ywk=jrgW6gjY4fCO#FqexcCzqUE{K@+cZ|R(9f4WKxW$$Mg(vK%Hi% zqm4rEr%322;XT&7U#Z}|{Tl3~jqrtxHNKE{z{fHUcu&F}zd7!J_obZ0eR*Gs(bgY( zwIgxRFcHU1Qz0QhPq01f0z$7AY69&1dZFx1)AQ+sdE_8Gl1C7nHh}2#ZoCs|uY>=R#y1LNJp0T~V3%x^q7#Z$ES4{(nUh=;_a)r{@@OtKYxzD z{QL*}^~c}gx6fbV^~3vkd6n^#QRONd@tK@lETyNR%F`9?K>@f~SdNFq)i|4;j@|q` z+$b!?V_L$6sVPj3j_~i(OnV5-^)B4LIf%2{wPLFHJ1G&bA zsIaxdQceaQOwv*(2Jqm_GS1FT;r5N|c+AiC{M+yF?DaReaPcDM=cWnNehiKFV&lSD z+q@IeJtCpPRdfIC$LB z(wde>;IWYjrYFo=p=V+O2_+?bB_Sc2o&dd1CFMov9a7Q5aZM9Q>smub-wsNqjJP&F z&~gfbu6qQG{S#mjnhLwPQ#f&wc9&HO@0@a08GLff2)lCl<(I*q9iD%FDFO>C5L!}= z2*NI^vJo+r&0OEdf`@;%&j&{Ay}>yAA<0)d(cegh>v{PA`i)>a%Dx zQA{ZT)s66}sD)QWHM}dU;LA4a&({SAN0^1Nf$U&~BfCjMLx^EIX}fkGQ?GtO&H0Cn zgV#{F@&M`6H*j*|I?|?YA#3&lBk^sdP27i1^=Y`5OrU7~7UsYH?^yluf1sZcwnv!n z?%rLr-@SvjyZ6v~_aRzuKSt|K0`JCCG~M{Gadr*Y1=zi*+G}@EeT}fYO4zXk$O%<; zg=WV@RKlvbd>yTPUB~qYXxY7o>a#acx^@*6XBcZQ&?+xKL;sbhgx(_zT)Kmv?JH>8 z*hU@WZ1MOMvigURM!QRB>p_SxXJt+iy!czjH+Qm=%Y#91B(!~lpyBQVRkmM6TSvx6 zD@dDI(DF=hSW5@{RMl{lY2%=ZaGIkcjw`9b!rloX5mB(Ru@mRZGq)h{EX>3SE_;dN-Y@s^`daVAQneWk|1eL;92ED&aM=S&J9rU?1ZNOAdRmNS_1S!yWpGL zkCR2?D6XEsJFbRmu+@`?qqPoFQlgMkl#g;YqQ=%Xaq6SK{vLD?g6%zwcEi2sXY~wp zqPDRTMfq7+85zRrzyMYT2T)U(hsLH_jLl47eq{+Q9i5n(oW=Imd2DWNjOEl6JYmCsd1ez&*--C3e2hnrUgFu4Z*b$v zEv#{4TAZ20`s!&ub|2U7+(uh_JN*27VaNAyb+Ut>gEfMT^^s|0gnC-ow38dw*mzdF ze6bo3jA<`#w7a<@%jpEd?d;)UW)3?;BbXT&!hoftuMcfnq88(^x`sNmG}WObOd2XB z0Z9pAW|gCmkdze9*hoxDfgRsN*r#V>V=K-;YeH~p8W`Y+oE$zU@IF2&Ku<=5-j}i} zgq|*rX&6IN%M#K$g6*B4?Ri4OJ^;FI;V|(_fMr-3920WkmPVt?sUqB}S;Dds0R>g8 zN(2{Gv1oX2ffrNVjJO&CjTKkjDnc*3qydoxo=_3RjR-H|eXIzYUkEF-f)Tcg(5Y=j zcwHN#XbXWyzzJac6>e`UfYCIN6;RE1S|h+tFt-{JVt&=tEW)g^LOc&Rb7TyKvx`U{ znm|-5p~s}+U0wsvvPyWBSHQQ5--Xcgt*RzWX{P)Q!&;kZd|gOqG8udJJ!&u9MRe~H z!aG-yvv?o*E6OUqJcn8I(?) 
zM&bB04Q~)98R;V%S`fkL@19%2_?Jld@&8O)Hw=x2Za^rsy!@f+;sGW56SO>AND1@g z36q2B8RC$JHcd|x`&BjAo;0vmMjo=f9Tpvj6RvLJAbD|uP?j)+%9I_mmVr4ANvY!f zFBxsWl)-O4m&3cCE7I~*XnAV*%^pp>cR&vx95Kbm64uzG(wwsu-oEHK?cl7t-EJ>&ns6+lrRXW^{G7 zp|hhE1ET|IZEwcH_!!nECUI?Z6SKp^7#khJ;?e@vHr6pRGKz`eaV*cC#_emj@Pa1y z;K^g$es~Yx|L!Ne{NXj8fBzbP`j@}pcR&7!AD+L!w+|lT^|c##b!Hn688@yrb>M1C z2j|dYARxLKKihP4&`&cu>mP_YHxDdR&wtCK2iV|xdRiLn9qnP`Xv?JyoNdhEVW@{_f~(HS8I!(& znDq3;psNSE7-L&pTu{L18fIn&H)9iaIQlSW8R;89M@JXxTG~)0j8zyPg!60_S#q*c zkdi!(tRy4fWnpP2dv4X)>5@dqul?oA8C<3mC(8?n`b1LDVTg6Dsjf{^4 zmNA}I5`u(ga7{D8)<{SRM_UOvmcLL0oG|RfpBe03aB^&ebskyM zH<2}SkA`;-r%pdZ;rcg(-`B`qe1*KFZ;-d}68UpaaccY)@+PmN=jwOZ`osT;j%$xZ z8*9ArW|&DWmo+!;F|OW6H4Uxm>Ky`(CP&bf?Fx{it+7f7yP}bmeg;e)8~BxGh3pn_JFU66~M;v7`em5Gp>Vr1><>LCEzP+ebxiIEY^ z(YQv~VCKgraAA8J8|!PBCxk`@hs2|%7YK{Bxzo6OVF#~Yzs94dkMYfSukjPX_w%2A zkKg|CGw$5JiRTaQ<97t(_iQA8U@!P$X#+R=M{%`p2p4FR56-ON;e{<+*<8i?@)Aae zN6}g<46JTIPg5hI*NK7VUX-MlAkZTip?;xAjEzCCn;Sw+Y>;lixXB-N%i9kNPA+Jt zruPyyD}>Q5;WVgggnlz?+++-TwRHxU=7jTqX?3U1;?~wBTse0Mx31s7>u1mK;^u8k zclYw|c*2}f)Y`#@8qcMLle+`N2vdHaYlLYQGi zJea{aZAJ4_*4DwExP3kd0_ZY?N zFIcaTzw{hAvkwWAiwLPX4Zo6cr1YG@?6be3{qj>ZUlGonxQm+WcTh#>m2+8sgC@ph z8NpRb=#>z9MTDJDg#=zcVJB39uq5R22)+D^TwdT(748BC&6Xng$t z(AW(^qM;uY0Sym7(ef1R97W5MW4k&|;2mQ7JglyPgR1Hx@D8YIVvno>qhK3p4_Hp&vd9t-xMFyaT4e?Qn{0ML=Q)(h5fLPMV21e3VpRLq#=LRD+eS8Jz9h z5f&PYyu4yGHP)l4p&G4?b?6~fx;p#OPZ0Idwu*`hk)M-;@xft?_6%T*a2gvN!Sdn~ zR+m?>wzP~(XST4lAdI8CfZLa^;=6CY#g9M!fZzZ2C;a&BYkc?WC4T{Ndv|Fb>+|EdaPbVLRu)mxz!=`#h_Rsow6OuV z)YhWCu><+p1&9dngSU$f;$!_0ALfT>M^6;l`(wz~8<+edF=clGRSL@JH8#bHyBE&5 zd0~O~joR3w#@HC~I_gN{57+Dyg!M8;>Yd9pyi0iS_z}J*xWB)46YCv42yt|RJ*~~k zN;n0~6jlVD1Allg_IN?;>B6}TH8n+`3GX8a9gPiPZKw}R#%nWLrV#<9r>PB1buFkc zF32k>LsmfnGV*U`Q<0I866^T!qd0J2KbI0>3@#`vg5wD%*x1>M`|_*^J!@Ng7?_wr zo;~qr1m4GokK)6_$ME413EG|vzLHVKAtfywS0nUjdU6DwBIB*HnJd(+JfU+U2vVg%Z|%3Tog-)AJ(?{rG-<1fL&YFM!Ff>do*H;h=+f;DNvsOSnzgc@ta$ z@I1Nn6qo#4v^T-_gz~2Oc~?}41C~=6Z!<<_kk~!U$lMHH+MYkb>CadjP)h(-mLs@< 
zz-wuzK{g_5^fXH5F2eRCt+sLk(M>Bz>$`-U(Q7z0b_H2uJII^8isGePC=_b(7UDYB z;9EKh_fvgvPVYcO%?xT*?xXeMGc;U$h??EIsJL<)<<|%)wwWTPm_nhh-b4Z0cOD^j zig3#&+=OKgVJB49xm~fcS*Nylk;BS8yNf*GER5}I$X>XB#NiEi*H6NyX#qii2g1dYpfM8n-yPA=6c1bommWB%qQ&H(gah(QusFda6%!2;wtahh z8U&zz&6lr{{y@cLDTHX;uym!rv+TLuN;ZQOShqOTmqz%I%XBrCyi)1KSr9;&&2O0#PhHDwrJ*uGYT@Q8t zR%irvLO*f{I-z~g7S;n=VG+~<=g1BOCJ*49L^U-8awBt7R)(3p0?k7g4t7om4~s%h zMizQ%tI%0efYRi68dMruo7>Svzzq!!h*$9{CHPv|vrdkTV~UZYyS*FZG{2dd8JgZ8 zHcp?$S;Ft?)>+)Ua+RR_7B8MY!BYb6^@9ia`N>l}zi|_f1?WAvi*Fu3!K>T%@Z+6_ z_?NH0!;2fYvAep4@%A=M^>m=8xti}ijK!@rls7kEbZQLUw9XdBl-9a>loVGYKfef} zfnMJuW+;%6L#d1c2DrXSou9O{#k`w4Dz!D? zzvnaL8R(+l!yAj~S)ye=zI%hv{u%g@$HT-BzStj*^(-6W(;Ej&CJ*c7Dh&} zGBAK0ZO)NpPn)!1Sr{6@fVcGw^kJy014C_X=xJ#|n?0yHdrc)aCRuq!NHe}l%gN%n zBn?lPJMZv89AmUSbm)*crIZt6vtW7-PL6M&Crlbj=vgu9>Kd6qh7I#m5qbpPp|{XG zCWAeM-eE;TPt^cYn&yzzwI%eNpknR{4O=hhIS0bjHyT#KiExNYhig(UyfaG}M=KGS zUxg5wTNr^SnjQfcT=u4fBY(aH<#5j_hI3|tIH1|}R53jB86^ovA;N}MwIYnsH-sCW zV0Yd$Kd}f?Z@#BbnJ}z`C4i5n=fjloEBJ&c>s=`vh^P@^=TS<#W4SY~x|de4%4vJ$ zZ{b%ZK0ms%52?db$emt7=Ey9fTYC{uMY|;U0tit7c!3T4{0$9^d!2lI0I9w6Na)+HX+%~}Bf5TykIf@*d;_Hm=lH%GeBTv>)egZWqY;kD{2tlea3lC) z8yC>H^#nC%A25F2M%m6STHQ?)U%i3+%Y@s`4sv%1QUP$ZyF5O2>hg7DT)d35b2~^q zdkLB6cag#M%|Oy+T20L+IeK0K8Lu06~b%=wz-`!%VK=1okln4Qw0j*01hiG^Ql~r+ok#?_~D)tG3k;w~>;9xkpyTX*6zNN4&5PrfzISX?e z=;~SExQrG)+0Qsj;QeNgGX6=hyuC`aJVI}uI^H|1N8lOby4Hm&2aER!|JJAZV2vJo+fT1ooPAxb%SRpVh5HYclC^(sbx$08P zfPW~gA?Ew3!YAOSQpHi+iNS`-%*ppL-l?;k{8-vA~ir!YJ=ilLDq#^Eu{ zjgR8W)+TP8KZjdaF5~*{B|N@)6HjPQ|Ks6v{KOyq{_YN5K7E95X?{0$ckz@F_oo-H zaPj;xt!M$z8Zih=GPbT9_BGVW9}B9(H3Ny|t@;>j3f z7N#(wr=fxyM`w8xw%EAm((};k<%42I$r4F<%vw2O&cYtO+6Fii6pU&u4I~}fj}B`~ zECh#OyD%TGcQ*0EliT>7aK5v=j4eWVq%aRr?oM#DG=`apK1>Kb3)-A;P{NqeRVX7J zUBXS55mpy^ItI|xHGqbW9<)W+sY8>cuBHwZ#sx(MHX>Ph2$y%1l$OFViQ}T}?cKkZ zvGxezcbM>#Ktx0o($X_v&!}u|Ya>F>+J^9BX%no+3C>Rr9$~zt>Ai*Cr^jXg4!vV4 z`jF5tg|w~}6pbCAYT*J+J5NH-A12=6w7nQYFBPtdr{JAYMB6JP^n`;K)ncT5V{|mC zf>O9<=fgfV8`eqbuu4paO|r0$C>ze%MR3nAgHK5fH!xx10pVap2V+$`g1GKU0~E@u 
zq@KkkZQ8qn@T2MZ(Dc0d8w%w`fO!&j9_6eu0eIDLFCp+4$HXG|-1ytN3E-pUc?$da z__~nhPDa!zn%OGyrdN>CH_pG!7+1lVUPt)V@w>3o6rycdV;kbzhmqd9fTFQ;h$TP* z^7;{+KgeP%EbK*CNhe}!y2XLJX-qr$6Vq=7Kr=RmmhgQ~G1`|7!#TMTAvC|*)%&PA za}O2UcTjrq7K$%jXMDYi{N2mQ-J!7&a)RAu2ykPio!>#q*^4;2eF^Coc5!lx>zfyl z%1Yn7h^#HbZ}Sq;RMC zw)7*Qv zR>u2ZGTMI0Si4t61mFAnl<}TW2Q_JV`grd+El{SCJzd9HNHWJ*eFplbmNnAh75{6(D(+|C{Zs-Pg z(PFz{5YPpqfF8V)psI{uHC05|S|Y^b1ft`@a56g?rNueuD$U2;fp$FWX~bM|I11gY z(N>m^mYQ-b_jhByy#?Jg!3Or4Ia%3gY-qxWV0prc|BLe&pPj_i1d2SmoZ{EVoOPBGV_nzWk?>@$p%e%C|ukrNxV_dm*89V1L@b-5&w{sDl!rz*k z!1n1GT$&ugLUSF4Yf8m4Lz*heG1=3LK1Q3;Dh>lix;njA)IS#VPz>o&&<*a>U#P(BqNKD4;;pa2mb+jG`+8+m2g-= zlhD&+FG?fSvxJg~Jyb1EK;6cb@zxtgp237(B_=oKI!w+sRK8 zqjhQ)Y?ISr9-jW~@HuXTo;Shd#~3W^vkR)GR*nja0%qmq%A zukos8M6Pc`T+b-7CQc)FY6Tg?v^@eVh@GNvK5l4pGfl5f9Q+*K(8eg+hrHnx%-;P8 zxjk#hY+XW9%_O3V`w(8(PN20QzPcHi?VTtb8bs;j7@F7DFmT}t+E*_ky=e?ysdcc9 zC`NEGQ_zJMXub3t^_L!^Y@3EhOUvE5ggn|?9wC==?lOzkcb;|ODzf-K>5Q^zv^v4` zQV6@`jdMs^-$u&%Iizl!N80*%q^@owm8Kcjw+^?W0Ya|^I#Go%KUvP`--gioF2vJn zh12|sMyF6j+dbLL?@-!GkT=2SR5Ls?>*1GMi}bc3@hH1dKom6G{h@rq1ImtW1RgC* z0G^=%UyTi@)BugFnA`hM&KCiObh_@c89ZTxM_hl$5R*;4AoNS~e2-om! 
z!_d$ORten87q+pzx`?5kCaey1pgk%WEhijMt*eVF_FzrQ8W?80EhnflH8jxT?t#mV zjd-y zjE@e#f!-bwdKx&YtOIEsQ^*+-dS-S|wxI1V+!$xw2sT$f?kc>NaasVR2SF*8 z@O;4nX@qZzkJA>t38=8vJ|xq4GDjB)y*VUw3?rhxQ{0LFW}tH;VmrDK+uDW1)_!F7 zP2=1*e?|4wHVS(gi<>5pUO$Zd&IyzbOrm0B9L)=J7(Bm;>04K@c>fNTA3VbHy=NHQ zx`FtL9=Ilz!z!u()(IsD=WEMXub}nX6Es}DPxxI!(dI6SwyvOH^D+w0UPT_4nQIr2 z#@L#+aS^F!E+TpD98xw2Jwh&t>q!J%;>uYhE^i^>^d^$#&mv}U4dET9VV_UiJ6Qp% z^a?m-S0bvu8%2!sV>>sobo(*Z?mWZ7m4{fo`V8ae9wNJW8Ww@+2rXzp7NKV!od#`> zAZWPyLeb6zO7dS2*Xo^Be%kV3*l(3D%+meXb}hxpA;mWtU_UFFAB=~5uerw&#+S1`d7frhtTt_fmu)^ z%){DX8Ql$=_yO1_kHR5!g1^BCEMkXX9y1KH=wX;d471%1;hk3RKr}=pprfc1oed4B zudP9SO%)BQ3`13=xX@aOR}<~{X}$;Nn@b31PBGqMB>G4Cu`oS`)zKlWcXs08+#&&Y8$Vsz#kZ`Vu3pFAZr{V7Zr#Rz{`eif zeR?0)ZtUXaw=ePH#S1(Vt~ok9f+COHI!_U4E()P|d` z8f>*x;GnGqM-3ggY8t{`jj@(Srb&xaqDd>Msz6;y73%UTP?hCv!cAUL9=R8eBq&0hl}4Cy%2GJRhtLyD&&t*o z#)RG*(~}dS_dcQbK4b2OhmPZuqtf_FQivq#gq}8`X98J63qsEp%9gY}Ye#51xwGda zOhY2!M39|`Pr-@AR5);HEnJzNMrRrk%NP*_{g7}%FAAo-&w_?0K+iJqBr6pbESr;A zu%q!gWfzLpcS6|TmscW|Bi9{rO5sR@bfgVB6;{Bxq!KPPHdg_1v^!Up3t{HUav}6w z2s~Fp(~ZD$C+vO&rW@~fE9L9nzV^-Q2}Az&{&m77CC8A`KZC@saU^z(h;c8phR~~T zK~x*9sIv!gt)0jePKWCp!u*}*7~j2*N=C%W;nQfI-Nf+rRZQ+&!`Q`3n7+D$)kk-* z`Qj0-{`>=O{O)(y{qDEedh#8bX1CBVcb+D=jO5lagj99GC9e)njF$d&JxJmE~? z!RbpVT;4(P8UeUY>s!8ntd;Y~TBk7*fXR%o@k^UX5Fkg`#d8_Q+cAq9h*?-i%-jZI zCpQqvIPTXv4xff$##=`A${v(WpGE1&3c~a2VeKCeJ$rwcyGO#zEeg8!A+Yj2iSSd6 zh#>Tm8E*}J!lCKn2Mt=jyp;04>i9hGs@0^bF062|#OT(()u^wejg*1-$>cyf_e9IO{?@^J4GY2|^Fj@($DT zj$7aZNjrQj){iti0KyG0-LgH)S5LCfLQVkRT z8kh$+!78#7HZgs$PZ-9DlnFQ{Prxo=6gF`q?4*Zb$)x~3qew!pKQab=d4=eyZ$M9H z7n&GRTbf$Li9~zbn=#f|jitdlT$t&=@<=DV+FxF^p zmxtT%sJ{+BE)C%7f^d#(69)O*%Ax`^Gs0|dZsE$sYk2kK6`tI_jpeo$4EXq=SV@PpxK!lp~>6~8;t2Mrx)1m00$Ox`i^z{UQ92UrKinN`e8tq>L*jX?iU@iaBzG#@eE3j6eU zTY#Px86${G<+FWP^41>cM4+y%D21W8N>637=;Y8T-cgViMl~F#aT? 
z1tCZ)v`Nht%Qo$l2tivKqjfqfBOf-|MX(hBSU~Ue zESD0&3~S+9Boyx}tb%h9Ut3(x$16ohx>YbDGe&z?36qivWA%pNU(tr}x^5)24kD7a z7tur@vSSk_8cn3_We)it-3M5?cMs>kevX@e z_z8FZ{0H3r(=WL3^Y3xvr(baUcYnk7vmbEjn?K>~*MGp|&1V?7@dRz>Zy{%V6%ma? z@Ga|vcX1bdO1co+Hc8+zdJZvu&u=4dbSP=_r zh?rkR_}mI2=T;HM=pH=00H3xA1h!10c;O}rM|WToQN~Uz651?H^B^ecdq6?g1uDiK zP%-s{ys-;xLXzMTmj!*#5UAMmemhslumhAdw!|@g6C4(Sr!8JKbPs{|6+7xZGKz$r zq8M#;jM&ZtLtyLZ1Y<^8BW^!N=Eg8I7rhHOb+UXaVKfGcb3xU#;CXIIbT^70JMFHB)| zxEE)~`fzuC1g{r|@cVP~xV14sV=Y2qX0mt@^KJrnk)P@Er5m_TZsG+Y`Siwp+`MuP>(f){%{YmkNPmpSd!xz47suknzyhYq>QGl!f{Zc&rz`_$MV1`FCPT=vj!PfMG09_u z*)j2Y7srkebYdM7>&W55*uQ^24iI>IxfF)7xO@7-*FS`IXD@sGna#ZqEQFXm#hGe}ik^{fyf`{gKO`aOveM?7Vt~+duz=M}PY>p8os4 z;?|%3MDYC$_x}2CxbpL#vHJ8|oPP2xw!Z%pwtxFqoc-OuVExCxVD`n&7`ponI(MI< zX8jflr_a-Vt6`B+0CSd8K^;7+JK)>YkEo$}WG-Js&Y3GnB?#l@HW4|qfr#mKgwL!a zjLXoeRRoPMAezP(N+XRNTt(UX10;8ELft(J3YKwDHIIghQ3#X`{Gn##2NgrYPWJ?) zH0>avWs5`VmQXNvgU*RS$QV07-pqkZ8ywd&#Sv{`OrEYd4sWl53W2A9uVm!0M_L~H z2)zS@o`Z`g0@x0W&CFn6Vgw6YOX!;z!ql3UM`#|G)5I4CXm_8=iqLysgq|`jPnp2e zVEWL(2Zs&tp)f>6$`&6=zk%LfC3hTE^Q7td5~g8ra*jtpKo;WSE0B@ZhV)aNh)M5& zZD=)&18dkmYsCF|){)(?6QCDA3YX+5I44aId=qd^o`g&46i%F+Wb7S-ZPF;-8LOJb;n05%l%$6XkD+gL z9CJ&{*gkg-XI7VKd&7idKQ;)d+vm4%wyzuOrNwyE)s3GQm+;fdI{tY13VykD1K(fY z70-8les&E%U)#kGtZ!~y#n$2+#`=4)!-#qF{0<&ozJa?tSH&Z{D|2%=Gdqv%**Prr z_uePTs*&v`|PQONk-=;hcG8x_07o@_56Vw ztnDxq8j6jA0!*Z*W3H?e_f9Y3@%AOy(&3j~+z4 zx+>yndY+7;c6$%Ni?$c6XABP=0~p9FKwU69Mp!9%8k`KxPD+BY^%x|M3!o$LjuLK+ zuENysLJ4Lk9I!Y*I{s!j-F}2|YCnTj)8u!N|o6 zMsD8FclClEV~duH2ZW1&YrFeE+tW`xElG#))1?XO`Ui`rD(TVqgnfHP;W02`>@|*z zgK<=XSSHa)Fo{WqY3xau#WC{6r@<;Q1J+3d-pL$TrscpYGY__mzV>;ga3HiCiYnny zTn$G7aE!7}jIoZiItK#IzNiXzg_UC23(qYqg;P-(PSEmPiWqB)8F@<@;8D>eMq97i zX1=}x!NRO9jIR#DOfl)jNUG~YYReE33B6=mURM7IvU&vIt)j4RjFEN*mtOpchXmG* z*FR(T$v3$D%};nqxV`+-U-073f5x}}{_lADZ~uvFfBY-1{^?(E{lEVQ&iwo*bl-e{ z^36*q+uB9l&Mh=v5vG`bB+jhT|L7ZxzW51K-~58v@BWB7cB&cEiwJD$BE)N9l9~^V z=yd2O74Z8t!@qeP;XS7jH%jxHxrFr7H;}b<7wIdvku-k=5hDa4VHeZCj>Nt*Xx?}R 
zecwDt5p*gRkx(=Wg0hi6WOdviqkRGj`mS6*f#Yhn1fK&ADO%#Vx*ZNFTd)JP6DOaL z&}OXFHo+k^TAFZr8G-kOw5&KM<9=Cre93n7)iEikvVF(KC&SL(Q9MUnXop5-CXBTf zFlIYc)-u9ZhZXVh9t8sL*E27K{dr2XJY{@*SOXs))yIcNjq#y`1wNLx#V2x3_>9qZ zkBSF@>jNpB0BBjnz|1KP4qkb153YcBcpU;`+Yp>OOsnf<(&%RLXoG232Z7fQr}#m! zP9%=NDRGpB$4VZDL&_xVl4yMdpL6OA-f76qMMq;Dx_a8tJ=h~oD9}g48yFz;Iy%rM z3`p+nrSS>(4`FC<1Ox0HI~!VPbc7wXd*<{yW)~J2Z)dT7dJfwg%UE9F?YU{3oo6E& z8N}u0M%-;_!E?rl@7SaM_S|`Vd+scrudm?2+!P)$2H))M!8QJ^=QP9%3$uh?55|Xw zu(`T{^|@tSzjzgED;t=gF>cJvp|7eMRVm46&C0-1dkYpjT5)B08E57eaP`tv?C#t^ zb8|Pp%NTYq?Gk=lSez4vy|iGVFdwtYDVSuP&dXu(!R4LUNiP-Tpj zlNIbv0G=d{9T#HjQCi&*965BD^%iz4Vc*>Wf^Pr*eb~2euL!*Td-p<;K-JLHWOOx$ z8LiIP)*1!`u(>Tk$HpeitRl>^V#UU0W#b^8*e%Cc@);wtV0!NzJWS|G;7bW<9FUfS z1fz-s8?!9qi@c#Jjlc$4HjdD?bA}d6)7}~C4lGAkXwd#N*rN+GZD@M(J&O?7a0#DTAonGF#`BfMDUpte3l6ruu35Gl25@hmBpxQlU)eA z+)~X*SHqf!@ZQiD{mH| z=U!D0UtyAhre^WfHIJN9_@1gnTEj3Jrq7~ybRKy_lgRBKN7dwM^qtv7!{iD!@4dp6 zmp|hc;q>5#KjXpozu@ZA*SPZJC0_jUC;a>${}W&T?Z4wLVYfrjo%#M}%zyozc=7Gx z%?rq0*+PzRlyzYPX^WdkTi8VU!WpD5p5g5c!s`q&mo`v*_B_hAFN#yU3zu_jxpp79 zi)RtrH;0JM8N~E0Bdl`~UUd_&Dd>TDZWruIhvD8Z58t*Gg!gVCb@CeGySI@&cm*kq z>(Ft@Cb$A1rymG;Jzql47m5b%gq;&_I}>~^1fD&P5O~LwY#^!TAg)WO+dxLg7Dv@h za74`j`;|1rNGnV}D_m6SD?;xRwxj)$(l{t70}ooir?($0EUjQ{YAhbS5H9a%Y+=cy zVa`rf6QAr8Vyzth`BQnk`^B3{1>V~yT;frMz|&$G5_)F%=(r`JXGiFrz^9D1U#WRx zzh(fW3?raqkpL}+3>doR!^EQm#vWy`32cO`u-`6y7B(lRVIDIG({M)H=sq~b4Y2xg zB4H5r2}7{q^Q;Ly%cOBwB#hGXXn%~p@5K0dqaZI6O`T2X80tpPa3A`GtA%lc@1>db zGVTry3}R$x7~>9Nnd*U0d70 z8sBHJv{alV;K|?wUeB)J8IAAe=qPT{7QY^yz@OOby=?Et4kOLgjvkyH9>Ltmus8^M zmS!jHW1QmiI-1+iT3wC&_-LH;bVk0HD{>i4dwF}hBp)}H=CO5p9wXypSUGzRJ&fZc z)5g1Xb+Nf}JM{iIR8cukk zz}S)z*aAs}SupJ^h~V-&b_5}etHA`DC*!H>aY=Y6D8reSWhzI@lahvlloX_d%QXrU z1qerBg&9(W0~3c1;K06pI6%M&XIbn&xF7qt+{@d7+3or2D;(In4-))aDvC)Np^~GNK1L0r>8|$b1Y+p&q;2=Mjgpvv* z`5EQ)j6~=u@q4M5SwoqnWM;#LW(!3tJE+*5fQo|)RGr+QF2Ii#_$&Cdy$L?X>|epB z7aRe-5W-IYUql=XX?kJ_@T2XS#1edQ!hXJVf-e)6$=NVJnGK6{+8!gW^{Ham()R2K zHah}N1Rd>7DC>esSml?q%3z)MrfdpIVN2lImCz7NDn&CqQCbg|vPQVm^gJsY;ak%r 
z4sZ@*=bTNrl~1f9yKNE~4P&StIfKD-cX0moFIao@9d^F?BUbLb#Q3?JxbWl$0`HG_ z^80_q!=L_&mw)_MeEp|?#na#a89)5%f8r1S@Bf9Tw7y$^_zN!l_Xds{@p#4Js28i^D0NS#?m z76DnzSX;C65Un>~qyEZE6mHx_{Mb48w=cn?aSrZH^YCn1ViewhL*5j$f*T-Zn+*k< zY^YjiLd7(J9a<2i)IG&vA9C8xkkYWl5hcb})^Sw}NT^uihytxn*&2uBOvG!<9#=QR z5mf^mRMDp4sfxh+oRHf?)BEzc1U_L~+IL(EDr{TPi7BwOv4a`YhY4XPP6baXFd_t% zwGFW6h`cyS=(}IYA=!QYcJOT1Hsce7Q>~ssE>RbQ?2imYlCCmvpm)s?|6|BM`Z3>pr{jd|DN85Ac z@&tir%d$)!Wty0Qb@m*r(x+jUHbc{!#XB}yY6$dkLqSO{n!8)kEnEv@upa}xgdc%7 z*w>Fy0!XZ(A@N+>DMsL#nGuYQ^D#niVPOFqjI}EZGgz7$!^YeUF3`wsZe76b^)0Nl zcVH+x6H_^-uwC7N+dacrC@#lJX(g`IHREo58y;4-;%<2(uCl>i>+HkDiAgLHTmu9B zXzlDkZCxF@+uPAtR*8JtSbC5*;_OV3V{48wT2GCm9Tu`PaCvM1y$zL&>2 z9y&^!J91czuR?S^M7SN?zn9S4OW5tj9xlHU?2ZPxhjzD@bzHcZH{VxNRZX0XKsdkG z($XBl#Z|2NeT8)^#tBju5tsFdr&m<(p9{ZqzA|&{J zlBybz*3yHFt|4UjTv>f%$QqbJ*3caC!bQL>Y{YBgD%m+fnbs$OPmRW>{&(;NK!>&` zo}5nb=>~;E_bvGJX?q5=J$;se2)-l{e8vQy32o0jJ_F{7nJ`Pqf?3)rSY+hElD21+ zQvxd*pLJfj2s+FB3b8DB+no1X5{g2J@GBtripyYIQV!eFD%h9Sz@e-T&gBh+UL(BN zhI}d-5ZBs=Izp$9Fe@Dvj%cw{2Lzp_-DNS%YWbx{|E5nzyF{3`Y(jvFMq|YAAX0+G`!VkUt{PRW9-=r zC|f^^!u4%JZ%eefH;`LL^6VNCXI8~Zm|jNQ^b+gz-{F_Y^@OQK#E#D+dUO_1BXg`p z#Eq}g7B3+0%qwDNe0FMAq^{B#zvGbHOxDWQ{??s|-5sCD5|ZhOBNl zlng?lr0-9Ya}XpOiih$S{UVXtOlgHuM;E8EX}2d2&kP zK0G1Te$JHetK}VFT5xdpf=^(GI3`q>;6m7^XJq=u^7Ks2Aw^64biXv-|5Ap)lfyrK zu7uxwrAFWhlL`n!R5WRLdiacn_t`N!d@9LUE9=H+>y0l|1F=sh9EXi!A!V5YCC5`x zcQ1m5XDO6iOCaq~0D0FkXfe{-WX{5~Y!mLqYp_WjXMz}nH7(Dc@zy1M3a)vl;aa*5 z&&EseY~F=?-37RopM`VI3f?i7mVm9ECZa+EP*PfemgWW;UKe`WJJHj@>h4BAqpo0o zy#u{m@5RWlF#Y>5<|oIoFg=H*dB)kr)0m%`!oti1&MYnB(#jd!K64&77MC$sUyHu% zWQ@?NCUY_{pPPsI%sgzfCx6jBf@kdmxKdt^8;u>fJ3N6a<5Soe8^_G(h&ZobNqHH{ zN=wA6ob@oO6i3G(nsyhcr-3*%6*N0Kp_!J}FI;fEtPmYV*_fLd#>&bJx+p+2*v}C5BA~piy)@Jb2*TD%w%!vRyLD0C+icT;d2FNQSKuQ*VM)wQ#J55SBf7zWdF_V#l275+XMpy}&-C+XnfDl~|9>l=| z1m3>=tbHQ*4(uo7_I-sf_I%0uN(A42f>9WvA{^0GR#1Y0jxH=s%?LDt&B_9n7N!K? 
zn-jV%g&8f__(bTL(fSBHGsasTg6uFq_or;EAF)1Xti(~XRSUKHU++cO~e3<*9X#$4mLG?*l2z&JS*CbT_M+MX%F zXPQ|6v#dgx(fZ67MTIhbTLSbf{ti4sv9O%bW0h3ErmTup1H1A%I1zMi1fDm+C!GCJ zGrx)cts5xnnTK~WjWD7R?g=&UO0GvtQ7`gZ=df|-CroZXz~J&#EMI<#yWjp5FaGc! zcu3%V`?vpzAOG!t;pLzI6%T&u>FDLj+&aTkyrx=3)uGn5ol<8ec#RV|C2rBBE(}5o5Cm8<|GP z@Fao;gqlV`-xPehCgIz~^`1q946Gq~>^woai@2e)NFBY1h^7UYM3mEp(xJ~%Fb-l= zb!W8ofSj%ajoy|^D@dprvDnFJnL%2^oQ9{5gEIQ~;+Q%~5UJ zEjEJf{vmXyr=l?~7$c_=Fkg^?<-&YiY3{&oeFvWL2Yxm@jjNsgxHdG33nOEgYHP;` zOK4( z0{r&M$>V^$F!_Kwj;d+lxNt>!#u6z#W5^hpLyop5&zLK3X(vt~sbudA<+tDyVy*x_ zO@dGJE%>wuJ}qGmHiA!^wx=B$4Q(2qPDCtpXneXfKB4poKLZ+{0gcZfF&zfU88AGV z1;f-+FiOvb5#eV<_!(yv!8p4ZCbU0O0?&-VvtX>X6rfjF@urFhzv2p5(e$hcJ)81c zI8@YgLl9=jak9D}PT{$52+fBFf#;FZ1iy?9MCA1#Ccg(6HRC8~p2ozP zJGk@B-|+I6|A~j+)A0WM@A&yY{x5>>KXCuYUvT@o-{IExzr&4he~at1yvyJHg!Lyc zFn0YmIxg;_>dZFsme!HAxJDD(6irWnUBdJV;-?6^sU<`cauGDT@UeMBO)iTS$@@ab z<`B%78^8|VZ*UYo{Uh+`9)w5d5NiZpT@&!`oQ8YLB;1-N;odlbfVOG))DFTfs~*0^ z{4UjFP_hk&w2?O?b)0cT%?grQ*5av#5(J&3mMN5s?4WLTfwHARVSe`HtS(qEZpO2KY$A6rW1ju-;tg=Q9OD zPg#hzK{#v}11XDSC^}?8%cB_Dgq{kct)dglxfqJBKxUe{B6_cwFh7zKc5PZljp0N~)5IRm@CcW{V}umL*xTWoBk( zW@XUywkC1^D0(auUg*4T)xD^}vvu06_9 z@iz&dtJkmM^3AKba+}fe(rH{ge-aO_U&7nBuH${i-8UXR#gnJ6;?=j`#2X*d_P+iO zKKS^3eEZ{X;NwTH;mwn0ac9GN+*rK=57v(3-5ne8<)!oZ$&*)Ubg$vZufB!vK6-+8 zX>_mOxrN6!uj2thd+*#OTswXi7mlCCspBVb`0yc|Jaz<^Xnos;N3gA>4SN~G4uq1p zoUF%Z%ck*KYcC%64&kkJ>+vHR>6foQ#Cta{;FYt7@%YdVeEsx6e0pRTUS2baqy7Cj zJ2j27(=#|RF^RRUZD_XJ&`@bbN>_$tnF>`by^cyO_qs9d_hDl=gc;h>IKi}>5q6Nr z`odzQ3kp$B{ZG>poisHAjZVj-bW8~FifDDkv^y1VA~kR3o zUy$Gn(fmTI)*`fe143&yBD|J{LhwZizUZc%h;P}0_||<$()gszLGLiC2)t@qUNxg_ zEsyp4j-h@(fp_SH0=+syuaTvL(Ca&V4l9pd#rWO}XrI`Fn*PlM-$ArbpG5omix}Jg zkWu#zcAO)8b}{N+d>wBvLjIaD_h-NR@A%EX|L^$c|KtC|&;JMh;&=Z{82&T?rlu6 zOt7qY{07EeeuJQUhoE~KqYvN22xILqEpOn#WAxp7sFVQ;K-cxVXuWU~E$42bA5t2DxUA7J$Q>*zgw4_ybZp?l92EI)h?+C)E$sR8H+JzcOFYHu}Y=nDpb~QfWLhwqD%K9+Or>lwmtB+u(a~yuA>MKoG4}31VD?-81Dn71 zzhhp%#f(;(OSj#DAy&5O##Zd+&3a+)0bD(O8dtAe#Kp_!aO(6C9NxDBXZG#J({rcs 
zjjI>&G0UshuHfr*n#(!j^JT!E3Rhh@N{$x4_8m)>t|2k zjnf3)@qKuBU>je@PQ1Nu3+}9)!s%60xU^*}&TZU;y~|f%tfm?X8eTokH&dab?Uf;v zvjoZf9JG|GvC`$l#z>g4)`=;bot9@tFO|QV5R2p&Ak9b{CR9v>o0df*cE?gk;7M$i zXLTiFcm$rRpa>fNOc~8iZ!*H{u*2c^!xxJq#v7`tv4t(C>}d6Sk9dRdGnvf_$>FPl+3M^E?>*0K&-iIukJ z^rFn|hgSB(4Mbp|C7R+HmtXSZO~T8_>!tbmXnuae&rkRTrq>}bvmU|OO$gEWB8<9`jl9t}??#+4 zH?f_z_h-;MqJU3JLQm$Ho{XKsCyZ?co{QbWZc>iDg;eY$T z2*5w!Cx7=31m7R<gx){l!(aXl9(?fwoRhs|-}^fDzVkI~ zCGgfUzRo^<7c);7V_#-;CFI6gRxrXYBix4Xze?D>jiLK*(d=Hw;DguDfBy-3Za+Zh zjoU1@(01h-S}$H@xq*&LchGU+9@@^*{w_Yo$j!Gf#?p4=2I_X4Lv(5%O?xZMjpNW~ z26?SUXy`-GCuP;+F0@Q;!qomV7}|IM^&@N0Ff@Z$Qy-En;hGL*K6b9wOye_SiCTv| zEgMJPGx=pYWHFhP@ZPCzXu{ILVbnG@vq4TPmN$1#)*~Xrq-+3~+U=XV4a_Fm9!``$RfsS2> zbniu^Yac=#`w(p34`1s6gu725z3eK|EAJq^;Z?L>{63aG{Tnnt{%f?o^0ydx^>H&f5o(TpjMl5)$M`$H!`jdO8|Jm^%h5p#=rowoSLwoJG=mMzoj5Qt zjgvce;M%!UIDhg8_HJB@HGNC5t+yYq?%IW)Up$4MFe<)rbPt}MI*d>6-N0v$?&9Ou zA2L$DjyK<8qRO;lsDz!Q1b=qih5D=+Q&m zd;9=*9^bFnBH*VnKjca)I^;hx5*S?O=pFF|Y@e!=?J8-x< ziQ8QrcztLP=Tj*hjYe^$s}tK91^4xKVc&{j99chwv%46v_iV!X-5dG7*5km|jo3mk zP7U^BprH;ep#W+KyefSKYIQo)YBY#1$wGYLB6O%V7nX%cyTuGNH8+Q(=^gee)b*Vw?Ht3`}ArHN6y zlr451O)rP9Kc`HK95Fw>ZW+RoM_?D4DxtD5)(H5X+g>;hLo@|LEUjz;V`kL-Q}D_1 z;f@voUk{w^y>NB)!zra_08U!Avws9G#$6ZT=VIJ-kFJD^@N?1p+-yt*e%_Tc@X`4E zjJko<8xdT)h2YzU@P-|TY}$<&BW|4WHnHO%l8m^iXW)BoeA2f!4r(P zjJOA7aPuiFJ9Q0%yU!vsvKckQ+t9r7AUfB|+J8^5=K6pNYX1}8{`>zE zAOGs#@zKwIN0|LPe)B*6PyGIWW2F7Ze$b)pc}dO1VeWpqyILI z?)IyUV^Us0?`;C_=0mh!xrO$tH)wb_3A-z3IDH$B>#L6oxm$w{#j&f-S-HQ$4vF?He|uae5ZP?mqZw>GAGC zgc~{uy`_lOcR=qBBTr{$F|mP^bWnyxgq>JkKJOos&4HeQA+&XMBas%^F?mS5rRgQu zD0_l&lxQtjl&`~rd;?y{H8R%PmASRq6$unrYf(hNsXSfK_?AKw?1v^IkQ+jAbOgn* zF{qOh&{eO2rD+TAt6d0m@8|11h;YwggnEv^-+37Ru49M}TtH^|EmV!)N89!fFm&~2 z=zsVdG~E76)ZP3A+OPj5x-b3(TK9d1hRyFHz4j^68=o>Ayo>fT-^AdpA7fs()r9`a zN(?dXEVH^W?Tcc4x*oF)E!e$coKbfh_N<-8L`N(7BN0r56FAee6hB~0{PCH+ci&poj2aW z(^p@|qo+@C^WJTozi}C-_%}|SKZWbG$9q?<6DdSUPCgRQ$Cc7o5rxa(xpb+9;xSHMa0n*-k@JdC?u#$AD)Z`C^Z zXEq`*yZIUPb}G<|ZrO|2w*3nDk~!Hk`xA{ySK65OK3ZF6^%zPqV>deEIof8z2{#> 
z&HhV>Y(4?cs=e?`>_(XHHM9BvS~ed=@9vZ6+{TYLZbkLXH0oz(P(8I05jOCVfj-3h z`cboV7&V=Ph&2fGLWtsA069J)@N7GxBU#O&oNHUGy!J)bPl=tY|vpg2wr;#L6 zfy0lS;tIT&Rf?}>>F||ogR)TQi$yNX*8~`C6UZ^wp~%^evOqs{kx^8nSJLXHVW?jV zQ`07xTeiX0u?Mc6gG>&`5MOp4;i1#;_n(Bfm!(W9rfUcl-qI9X^OV`}XmBNAb?Z+jz`!nHTl^ zsk1movpaI?DE1uMg#!n6;mY~bczWwLUcGl8PaZwOoA11Zx8HvkAH4exK6&j;d~o$T zcD1))TPlXr4fVL*(uP|!kju3VxY*Q;8H)wW39qfO7}zhpg;kTdwRZ;&Y+i%@ z?iSQkr4S7T;bXK65_U0`RHX$enqHCst18u?j`6dRKx<-Yq5ZXs5fgYFw7o{g*&12J zn1J*!###tDZ84*(K#rxDkQ2*Ov9&Dc&sDNvkrh{Bd`%hNFv_BzEHZ^mW>v^Gc`^=9 zHdB$F+3Hjh)igk*(A-k80I9@Z*&&TBY??n`MZl+JG<+>>knpP_{4%uG1pkKgs8u@M zC=%Eadf6(Cvc{e)EGod0V{W-Y>7^^Fbo>!~5_6w}PajGC(f9;@veC1prU91PCYbA6 zVQy%Lh48bqbivx*11l|}lHjv3^4b}9Z7hzV<*>6j2tU^-ZEu-?Z<-Z#HM~=6;hWw7 z|C&v-y{!nX-+}PP-2~qr#J221e5-V5(DVd+f9%wF4!xsjrs*~ACH!Qz27%WotG67z zfRz^?VBPgM&@g)hHB$#MwEq@XUw#W)Uj76--~KN4fA}R1zyCd4{njsV_QUVt_!}SN z=?{K^@Bh<(#ZUk3zvE~B@?Y=+0o~vJ7C)rvefgW;;(NdTJ0;%!@VEbj@BGd0@%B%C zg`1y#fs=244f|ew7h4Fs^$%ai8p3XpkQ=}C6w7Zu!RYlz7{2y01_`$Qs}HdB%3XA` zbX>Yiz};tTy^Ge8%8;cZ^khB8!V>7%a1Jh8j;6MDfVj z-Y1%hN-WGR!&erm@s%YyyqIUf3xwVag-*Pv@?m~y7>ml&1Ye`F;g8xk2yJX7Dl)UM zHf@KaV?R87#}OPkkNC>#$jsbB-G;|Vt-g=Q)NO>O?jS;VB{si_>fK-Gb^Z)JC%!=c z*&kxy(l5|=<*(3l_1EaQ_$$W$UlN3Wjb#^pioQdipkv!>=-Bi!IyOE)*QN(d0Iy-* zL@0#yi6l0JVp!|)VZGCbO+Ffu+lvv#xps#YRYo1E^*VIf>=0@OamDT-dz>mk7Gk8#iL#*fMPG?Zjr9)ltTtOKY~`=$dWV zI=K#ex9-5H6Gw6E*naHay&0$X?Z7LSFW{ql_werByLfW%4&Hz7ZG7~#_wnTA2Y5t? zK0b8@dwJmwG&bQ3VRt-PgA=g~&Q>?zECILPYHGn%zJ{yoX0UOj z2g#5VKCeqz%TDIU1{^j7EhYpj$`K~;VuV zV>BY5)1p#T2@11wQIcN-8~=Zpjg@p_2sD!fWvZ6o zBn0IsgGB?8C=6B`3J5;Affp62u!J{rwzfig2pdh=U2^vRbv z`}xoD;4goVC%^nhT&Lk({q#$`_S3({`@j4RzW%G<;K%>`?+W;S{x3ZK^S|N8zh|WV z?LR8d^1e;Odyl}o``Pz#=ABQl|LHr}{_-1GfA3YSChVpNyA`y!FhsL z-b3GoJLtJ^8(kM}qT~Dxw4A=Clr~23PC{?s!ac0K@e~^#zmGk#GV>eXz>e2G#>BO! 
zXxx1onXNK6`7+uLUqNEaA-HDN!^z0%WaCjeyb{Kq5g2K1?tv8q$7*!$+(YQ?rs=Ff zWOM|co*sC+3B3LxM0kBHgq_wARF)fCqA_5xiiW3|gI>0pjbo7-MQj*l?A?l~X*RO* zh^3bCHpRwS8r2M9{GOG($SKm{r6n3duZ*Twf%ydlUf!JPy~KzsDZI*7S7X?#vo|@7M`^arPA6psiiswH1d~O<-qN2evfVV|8r|8yYh>F*txz z%O|mOY#OVER$$}WS?t}l1-m!SVDGBsxVm92K0I+8U);KdkFQ_HgY#$b+RKd0Z@-4y zx31yr@uN7rX%jZKbzpN<9rk1zvBek1{zMIq)wf_1O=`kr$Ihw*PPR7E1Z#1wxe+%; z2bDGSHZ5&NbAGfpFc#{rIbOOGF?nru5(ZN=%fwf2jeO@uj^NV4tFe1(C zS0~`hF#ggItDc#ow8$P==9@p0U985E5)BqHy1Z1PR*nKbu|0XVC$?8&b3rXJm)5WK zhoKEbp$o=P9!V%GZ&c9w1b%vgucE2}71d49*R?=T>oX91MuN}O)&p~AA1qR_dIkx; z5mYkn&Vi3nmlf1CItAB?8Ms!iq3x|xY%j2SGlH`caS6SRdlcx!w(M7+r`VoAkH#m1 zm239Q_1-q_KgJ^gxSs%&0m=tt&h9z%p1g_mFMkco&eOKezk=@Crfp_Rs$YpZ)DW;*-Dnt+Lp_5B}j_@WVg+ z8_U1r`~3X#-~9`|_N%|cqtAbg3-2=`zw$nIFv@OXjGdv?t)$VdxcV|i3A$n0<{+bJ z|Jl3fJ#&lJb_1ORT>GgjXgx`I@z}^H-gf*FMreG~w`hsC-c)ujo4xirw%z{_d!K%a ztvrsMd4SFXm(jZK0;;wgLtth*Y|B@na@jbnJg1hvF&Nv2P~OlBT}>BDCu3tb0{!EN zkFG*=a033$WpLE@!IbQT%9Tc;#fK$jvhJb*3rpAts>Ss5e}vxRf)W_m5Dt&8L{mox zs+mZl$pqmiWAf67S7qRj#AtaIyp&afuPjvKtBcF=mBo6zkZV!K=)EYLc&ofvq>U)^ zY_s$=$Tc+~&)SIs#{i1FV^D`zp)|3UuX!8O!eInQFC#g94^=XKOlDE+eh+O&K1Bxs z*Kz&_=(zkB=)Ca@)L;D)wU@t#np2;n{`mLNcKR>ScIId3x%Mkeyz)CtJp2X5E`Eyf zV{c;9skd?9!u!~N<}GYL`V{l}Ty9K6qgWMToFxRNtyauhtk^){ts~qf?3GyVv=Mej z#=KH=R2Z<-T8WJDys(rmZIM~vH1B_j3 z(;;kbNZ}lzcWH74N7rw`s?k?CWU4RBZyaUK`vl2h4nbX1>=-Mpn08k3dBQVl=rlKTa{WCJ4Y}nHCvB zFT&5AMTKyb6vI#R^DzE8S8Cv3f+=?*mVYFS^65bz_PirObe2QMP(s^NIVIP!;Mm-1u90_e4`nQWjI2aP66Ly znqO{(X>Jt_Sru10HrzgFJOOCDA(Z(e$}&S`JjdGb9Q<^&>he?_beRT})A-73o9DpS z)D8oqu7Th)(fG`aP?nw{SeMfH`Ut*$f{!NR7+ML(vQ=>W8TdBBzj_NoYj-f_{s;Kx zZ0~>qJsGfEMbnes+ggH78K<|O&^vextui?I@EMl#=sJD_)A!!TWV*!7*h%@{`59leV7qnFWg_yU>^pF;y*XPqphaO51Jcmpf=J7Z_=p<~Bc zq*m@mcyKeSR_;Z^%wcqFIg61acQAhL5qb_?MSLxz;P?jEM^?hpKLSJ7AkX&zjLicu z)b&D}?tmuV40WslWr;@Q`%+j^8Nw2C5V__kiYimQ2g1myaAJ|hh{dHcbH{)z+FrIs zkA-Ydmz0#EfcIOhss_tfO(Vv{B#%>)ytf2+b0Y}Slm;L2O0;-kaWTHSs1z?OslZp4 z81U5`v$E-rtdK078H-B!|8)uEC>#E?pwQNdVpl&(e4|i@R-!aM!`HomiD4HUT}R*> 
zI*ah~%ZN|iL}vXX)NX&1joSxkKK2>9Fa8Mq_kM-G*Z&4RulyC3-uWfEF8&NHCx48F z<6olf@=ujvD$`GYf$1CH#Kv>);MnC4aQ)6FxOL}aT)6fw=Cv^1^$`lAu>i)y9?be& z*y?p-uh)xxEE@^G%|0LFZwSjO?HHsL3^FpE9vQ;r^)ooKVFt&yti_|ld+_$@19o{O4&vL_&*0-H*Kp;^A)LE%824qB zOD2I+gFQIEVh|T6hjC)03p-n?vBF!45r+jULq3d!0_gR5QNx(-X9c(My;x{LR$T>S zWI4QqS(wm^)A}NWSg=e3A7idOv$K^H!H}N^eL=qR{LVrcS=1_2R_I`_G{env7Knxs zrT)v5oSM2u+8s+xJ!5S(WIs^Fz8F0dwY+(1Q%F=r5l{LN4?7U|7?H4*p~_T(+KNIn zX$#R%l8eT|e6$p+(8L?Dwy+d+D#q(_8``XXMrSVn8o@eO3!Wn zUBlOvfR*-E;qsuA1}ZyR71HP>-U{@VlvN;CZ$h5YiULMm0bdD?uf!$$n+Bm~+||(h zN&^HS;a3_?qAXsGGJ>xxQx9!*BU@_1kEUHgHoDP<>!- zkaF|klZ1*u?*cjrnx4}S(0ld~#&5ra6W{p-?)~x~aP5ab$GLBQ2Z!GM0EgfI2JZj# zZ}9rB{}HeL+V7Yfv1f(p)^v90&fh3z6f#|Kl7b_6gm9JH@lIi zck!BAkfpU^A)%MW`)467Z(*^HmZu~1N|CM7!OBLjvwsjxon6W^;#fLK+pANSkIK|l zBN&fCYqDT*o(eB6Q4xG)critJiyu1q@%4ID#s_!JUj=aHGZhN{)Kk)FMW>djBlcHkp)o&PQd zp8gUeZ~ZM+zx^+mdF7wcf90SvJ~r=EybO^oAK$j)A;!6F}!hfJDzUafE&ZZc*KbFuwyB1_AbTsfdO2T z&)03iY* zhjEvdcX#`4Y_D&|7M}F&$tc#xLzqoPu&J>MCuxl5r;QZl;!0>OW^4U zI|J>`Q9;`?8xe9lk%)zr=W-bWO~%rtX?4=Auk^kWLUS9Xh}FsFDM?1q5JC|Lf&mMH ze!?zfLPOMw=AePEtq4`6i)q#xw5oE^khKU+S=ne|lx^e<)>v4ECbbcD8Z&CjDv>U; zBEr@>WH2M{cB3jDSBx*6k@0(V%4R{ys#-XFVU+QGmlF8KU z!E+3*$A-;Hwny^)T-CE`x1w9F@Zpit#y@&%nKMEy1?|{+TTV-*y_`PK4L( zLTuw+BnZAF!Ivf|GS6aeN`|i-IF6cwER4Bz2T!7I|0y&cx`_7Ux6pCoKDtgmz}WRS zarTS9#^c}oD=vQV6YO~PO^jc-f)ii=Hm-l~=eYZ$U*R&X@5I}mDAU$1fAk&P`__-~ z_9{=!Hxc~iM;Kp}T?a?sI+(g=VD4Fkisn9=br19n-6*ear`0t< z6RCqLSVfp5QRE4ukk>7j5jEfLL>{mG5+;J&ax-$YW`fSZ=xN}!rF>?X;_ll#RTLz1=7Y(oAE^V9ZRRvYw4z`%Xlb9zt^P1gb{Q zpmxPY)K6YT?bLOoCTO{n*O8gMkD48?qUGEt7<}+kZ2j7Q!LG0WuNc4o_vkqHW92zt z%gJw`@5Bd~IsG0EUHAYuZ+?Q;AAc9`z5WGWedU{&R~3q)j)u}4_M$i7#B9)ugRuy% z&=gM4@bac2QBW}>xzIWyrzC3;aZ_iHR z&S)>LFiyV3IQ#zKFg{#1hR;^7!R?MN+@F}lTNlpa0pFe@OuV2Jl zm(Swfk$t#vXg4k&+J)1E;UU`O;nkBkvT_`I8Ev=JH)1pOdK)3LraFaX$uK4xtFdcr z5IctZF_uZ8Nrs;o^+>a17|oR2MLoWtVKD&S5E8G%X7Y(W*8*yHnhXk)CF@(56o>#VG;N-j#+z#74TX6rDvDm z8<~P@Y&Efl`g4V~f 
z_Y^YwP9wAb92$=@PM&^9i@Jx+uYQgTpZ^y3fAMd4<9GiP2S50(vHd;blkthq<>?#{vJY4;$P*D%LyL$<-nYiv^XOIgGi`%*~-P0yqp z=a-Z#9U1f4sOD?R;o|je?d(C_vpKdhIDT&S1(O3C)KEMDt=qOxWU+)Y~$?%1nr z_ES4}77Zg8(Yo>mS|)FycKjOBjJxRxLT~yOQfqFZddq#Z9(oHiPkxELpZzab|IWXn z|LV`sdgR+^+4D8@?0Fw!2j0gz!f*eTZ{Xs?&vEnV_i*>sFEB42iXjt?pg!zFcff_| zfERli9nbNNo%9EBh~V2F4q;Q!udE$6YOlmH#>w$a5-VHlG2BsumF?BIuzVQb-m?oo z*tG-iPb|lU=4u?H(Y)TMm>_I=Wv>c? zZ?K^Ty)sqH=|tK}0O<8fk8Fq$G(fOLH5ycxX;DkFinc7WI@h?Q8^!>|Pck>=LO20Y;dy)Ux%ftER~jc-4F@8J4sFkguhtk)=M3 zM8c1t*95=43=vl;s(g&9o&wZ(a**+4quQ6tMyD7ZQ8ik81*o@Xp~aMgE@J_@3BKmS zY}Dmtp{^hg^@T;KFVWESbm*@1qC@XSrlux+^guRcIw7Idfw=jJAItEW(!O|n=2(UBgVpNr5|B1_3cHtI{?;)mdKEdkS zuVTfe`~10!XxVujb(;^OYTX`$R&Ij3e>HraEX}J>nI3^5Ok?x2fso>BMTM&odQXe; zH6tNtPLD#LUIul11SJU`)5FlD+fd4eMjfw4aU_MJU|fl_`RAJGb+%N6;-?x zrBZ_fi}K6yLKY2=z?+vv>m&4}yi`DwB=i;)JCRN3FHT*{YBaZGIQVeO_!O?Dq2pKk1L)SRjJE9zWJFzxNdtqqkUv6|69$j7ZC9GsA7Q zm|!{7-HwOLM)BI{2p;rw;$R|(y&*TA5?CJ)bRQ9V?+=gQ4gSpS;UOF!ps{tdV?%2z zHg|UuY&}@p)rnQ@tr%}=zzAEE@p@WhVEQ&wB4uBo5v zKw(Q=Lo2AQZAORDAp+8697ud)|**jtjMJ${3-a9sOuy6 zXaj0Cn56=~XbReB2IT}_Ipgkg@EOGT2tF}BV{<2rO59xvQ+uBRK1=ryDwmGJ*0=o6 zz~>!jC75K?o!W>H<8GADi?7?MtS+3{ycbopzWVJ)_%Y3I>p|4+J%Q?dr;w)oHPQ5j zuDp&NZ~q9p-ue+Xz5H!j+1D{}@*$QUzl{w~zlPoKehcH5A7B}y>F~*`N<^K$bWhnW zZ7rd-o(8v$5L`{zP0B_>XRc%T#1-@rVBLq#DXTOq`*Z9)j^>?*QMY9usy6Os*@HUT zXyc9}Xx?>9DQ$aBpliQub$JdQdrm4##7CBIf~RXWZ1v-)Nc5vD*oiWK3(7nVD0S8H zn$)1oU58SZGJi94kq(qax@dA8$PaX&Fw{=7tA{F_R_v~Tc9+jMn@f|=wRw?FdBW*ZA z{_bC4o{xYFg#t)0);2J<4tl*<;|*XJTa9C(6b=Pq*hv7c=Z(LtREH57#wxcDlkpUW zX*vUqRan*8gxxKTI6?@VVI{lNSdTN!{FtWmN`Eij9~s35Bg^m>!S@QGc!R(@+S!Fo zwe?s{fUV&Po@T2uL5mxsnT!xFqbwsyiLPm^5XdDn7$@*nvP?x27-j4p_IWTI4PmKl zzer1sIIRdcEHo+$+-Am5Z7J+(6&!_yaOLJ9oL`7kQ3;X-CGafHg)6%d-XaYGtmMve zJq)xZ9g8g9>2^2~@OYK|YpeMhWF?bY78w_(j7qO-K%7OHI`*t*E5Vo>OGFSQgu?`I zz-@)kR*s;x3~5Ih>YRjKWg%+Jix^|)p*{E&bcbFY5s z(AXfo$84q3wWz6WM3RPH>2aXgU_v3GR_^vI$Cn5_*)>gyKu_Xsmhy~m4t)8%{zZ&S 
z#S(KB@P(mbqo@fIe4zx&!bt^uI$Gaz@KsdNSO~rf8P-DZ%{}Mqg0ZzndDb@Qkl+i?Y(`}D7M9J3uGxb4>=x8)+M|Fbvw07yzB}f^b$r-$woo9Fh!f2 zk|WDFLAU($RSX~J&m1|Ao&#sl!5H1L`#73*9zg@))HRqHcP2Zs479t7Ip8fS)hobL zV8_OO5#hFopnHk;{Q`cxu(%Y9_^~|4lf`Fc>ZA1di`k`_7@nJ+95X!1%phgGMqMRd z%AwuS@V-LeeI=&?FXWr>)jURK0&jkiP1(>}pqEYPw`ulyXHmw$$alV70a^ye75`a`UJ`RCaG&aZLy8-I&y-}wz5{P6Ga=CA${-};yT ziFt0HAKri;A;#LY%(~#`V{FMcP?xu|1zFFNxt=Y^cD~V#w7%5@-mHL!CNYpmp|^$+ zv$YXBnwqeWC+R3p&IQ`rrQR-F>+i;$?oNWQpJf0~1_p7nryExpcMmi+V@swMYZDo) z<=Ma4dl_LUxp9G3*av$QQtHP-AS@hH!s2VO$gX(@-( zREm(joKet?IpBX z7NB1DA{x!P=pZl~bVangJXBSbp~YrHeVKvK)1g{rL4)3bDndKLxE`x8BgNPow+CqN z2{hH!p`ON^N>?kbw`|B9N`zr>x=_OCqa(=VxP*<_5}KY=4tee)C7Y2bhsGz*`0`~Z zEUOE}cE((Suh_$A%&05iQxmWn-Y@?dd@`&>z*j-T6!29vv=V$Wp05k0|3BcfGwM2r z2)@xNc*bVnTd@X#m9qrNT0%sYq}qfefhP-)CRT4muZJ{y7Q^@yz9iq2yfvHa>2 zEW7j+!)Ko|dfr3Z?we@Xc9}8r0cP)ijHx^Cq4yMH(ZTb|67xMYuz{l&G0MoQl;f8% zLKqIv==x*{`h7II-N(?n^CA zrr;PDhn+yT_b!L4e;k34RR}Lzi`v!uQ8l?2p~3YqH4LMeP|IP7^{b#MeR!?-w{KeJ%U1K7zK_X@@;;aIPK2FC~2^ zK)AsP9BF95@s?H`Y;M5G&Nf_VYsEvtPRiZBew<`9Jl)ZOJ+%$kNE4YM zOea~y0R+5ucdKUyPw>5&9!}SQ^Mew{H=;f{W1~%O&9RFdX97ol6*N z8GnuZzFmWMlZJpRKt`h?4e0c<40S!{X zSHuWaB*P$hkCX`bXnjhr?lbU}MbZlR)n;8 zYwKO6jN@|-tyJLWS+)wnm9iG!T12MSJzI}&9f7w2nKheGy_R-0xkj1Vmf5%yqvvj6 z^4b#&ow|pn9T#a@`w5|asNQ&%#`gwh?tFxfBe&78=bX~HAlBBt_ax)zX~xg9=sUzh z(Dm~39{IX_f7em8Y(GS+JAm3vjIkSNm$bIbY66c27pKWZR;)#EYz_Pbom)C)`WV}~ zhGFX&g|lxA?tWUV*lgzztnCA+?BMr0`5t>#!PPSfOUp2{RUJ^p8CfIMP&2mbV%4aK zSHqfVfTy7YuDUjuVzp4aqbMZk@~vLvSv<%!x!5Qo@ zN0Vn+Sg67RHvTd@N2Vs0lpA1ny5VI*A7WjT6;$M2&#)m+Gi}L|@bdYeLN82FQCBz_ zZ)tbg1YWkD&@;_^<#khK-5%ffY7)Zx4)=-Dxng$FIa$T(ay>)~jEn*`< zxP!ph)zEt)>SV8w0r&uzHV$k*HC_MpaDi4=`adS+#Hl(fl&h+Yp*Iibf$ zYg23BVpR20>3y_0w>-n63AxI(%7Eg^QhwcJL7V{1L}RGt|Ciw@y~%V&fwM6(bnddq;Y<&VgTG8+X zyo8>A#WVurGs-cza)#iWMS^CRq4fnuCgJQKLwI~OTDKoW?}4-E+6 zjmo}_1mI;%Tzwlu=boTu=Q(7y9!JgA!>HeONO}I&BG2Af+NJRGw(W-*Mfag$BTaJM zE=JQGiq*xZ8Lt^zBQ(0`_&ORJK{qr7?*IWku##{iuzSZ~?-_xkdqe@8xwRLj<{lUt zyP&Tp>9!5S+rJWlLH<79zqOebT;BpK&ykhqO7<#=boL_F*#~z` 
zGs^uj6jc&-yyiK2J8~*$cRCAUH)nT?6xcm`c1Pezd|gMK#=<-!Ud++c^bCZa ziO{S3e?V_ZX@K{SJkv`P()DaC+Zk_{()6SQXC-WDT5;26ggbX2+Or#xrF#(S*@Fnp zFFt$}DMsC@@e8P(yo$DU4=}jvEi60s876Q33M+2?8bcRtck)E;udm0^?k;TS&rH)mR-_3%9>?nWHwe>y9=mI*(8aGw>EPdQj>S;N z*qw1WS&=IdGnx=GmLtHZ7u9Kz(3P`P@Z$=22s2Yb0qiAeW!hGliXR{#T)g1k3Vz>i zfvwVnpwG()D=QOclyP)2gF%_T#V9C{OG*a?i}JjVuP0qA-S&)z(J;JT8@!cj0g@Swuos|-uxug>Tm7Y2 z8r5Pbtin(rA0y#>jHik*8qUQse-1|MOIY}EKtuZ`__SGQEXqO)Tj`ck6HU*8SV=iT zB`U-S`8u5)9hL}UMOLKr9weM|&n0C+Qkfv0$ix-ft75|x=jS@v3qf6me2tdyD@B1; zmKM@tiHe2b%NE-s_;Pe6eaxJ{jSbgc#^#$(F>&%H>Sy=CxpW%3hGm55B!71sx)0ny|A~93+HwM^%}14a ztmzGVQBAPbY~GK$&GLNiAR0F9S3p;{emAPtY)6&Y-1HVCrZyoyIS0ES<7;qiH9{kV zFoEYM?7TEP_fo=I?5=|**VYFsVP|Zh&C0MHg3in$&o_0MdX%Q>p-DAEm1yGkyAdPY z>lxS64a42ltE^cl>leD3+m$UjlRX3Q)we>&h9%D+u%p?Pi`hMovIYfs3whlZ zl$2tzm^<(L1-$PUGVLrB(6LTr$-)yVH8d0ii`}lsjz7diAhU%Cw`4M@Ku-if0Uzsz zZ2psEBPq}Hs_Uf_uL)%aFBa#U@DgLKK<}%0W(9g*A@p8&20a;^yqM5iqK(i(Q^+;f zD(mUVz~oZj5OhIaquAW8XTIi5&%n1=Sv=I!z7w9Vz3}uL;<;rbHguL|eH&eS-oxnW zFED-QZ!mu2*BChUWAq&PKDv+p0If&AK<(jgq50(J=swG@9sMTSx4nnfb#I_~_I2JH zZ=hk@2bkyJO>eW>VX1V$O8D6bIy+5ImMiypdM^%C#D|@0;hZJLK zn2>XoXkcM0W+^CwpI=LvY>4SC1f&5$TO}gV0Ax3IWjYOCyG*Z9j)dCt$@&TDlwgrT zn6i{OyDxg`b{NplrDxxw*)rT3-FaMguh}DBGz2OrU+F&f+=A_ zYr=^3xE@VGHR{}XXbTiEmgl33$F6V*I{o?Rb!B6~m4zW+4u<`VxxPHcel^BDg_!WE zu$<6a9xOw*J`3&YEDRWo=;4juQsF?7&`XwTQLWP}9UH9`epIO(NNe3lSUre&gGewc zrD3Ko-xLlw5)@o0TR{Q;uATxU5JD{8*`EP^*A1mr*KT z_ENC8P)O@2{C`*<MK7TaF>Vb}!=V_aer4nwZ^zY23HrYu44%HwtiH$EQPhHn}F-2t+gw| zc4UsSJloVJYET-hf;yT)2~D^tkVK(B2@PYcqqPU=u}Rv@8pMW|!`a>gTXP32buF+r zw8F!4;~+3gz0tWSTVDAeW~abThN4t3(#kVg+MRS*uq11MVM>F2z86HIdWTl@p>z;fpD{~9@!~oM(b2EIz^z@bha!V=|=)IUH z9T;YOMOIfSwByAh2j&+$2|n7OT4rQ~A=A*Z4b{kF#4WJ56MRc~j|@Q{TY<{VG(7d| z5NO$oNC$1NYd`!Q`{3(7%=mu{ww@Dk3|~ZS_7U0+eTtDwKf>q*M%z>0MK3>Z-~ADq zcD|2>-5;R-z}L`l@N1~s{Vu9EzJj_L{?3|L(Yodh)J?yJ>UD2po=tC7rej$x0zM}! 
z4iBuf0~^8TWD)QMXnP@^pb)_qjPe)@AkN4s^p6t^(sS8D_3dwH!a!pa`WO-W36o*A z7!%au87Zt}GpUp^#%=?@KhED7s!E|Zoj^|}iJn^8U0t;Tyas-)HW5QrID~}9!%DQQOVWSmEyBoE^2pZ+0 z)}#@o?Ii2O=H>tw=LHvd%R&IMAV3Y_SH;ic@tCq2i`Qv`Ri>L6%3;%$!lqTjsa3(N zQ_(E*5iFarxmPV&TcR+>ykQ%cMcC4$nV|BF^ z({UrFgL+K+^_U6pW4{q=f>vyZI$Gff#)FboIz;ZVWo1G z3iO_VPebd|MCRb9OV%jxlc=k$V_ajUE^ngowGtMc1V_&hD!Year#ZQ4QSI9gVf+2p zvG2)ym^gI}b&Q$8;TbqPC*kg#fv0m7tzZ(Z)BCXI_#KQgzNSYvA~Cv|*0mC`@vUgx zaSDkwyWyW8$kuLGdV`~ks?w_}B~F`*Pw_apNdcWaqm$?w6q_4n#2%c4kFUi;qjT|f zIT>9YjIIuX&eqb4O2$`fLl+@MtE*{+v8oCBbUn(GG-;Y{Dc`#$oPa7AL$N=MB3~HA z-U#xY0cA{`eBD&t2vpO+;a83dmaZS6EZ+kZe$797w7u~-{Jw!R9v zmImb8+ED23Mv1QvWuXxm6XUSc_PmXo5o+5-<2!(0_Yrt|j>FY=8m{3>2&}q~^!9hr zc;s8?JVlc`^s%zzS<9B!(X{C`v}}JHt$W@_%l`M#wD(=qY?N(5*%zOr(7Xds><#?%58Tv_?><`f`Qs6w2TJz*de4JKuJ zm`Phk8#5wAVAk+|)${+?C-@o(Y1u(cpr)7{E5A$|ld)`d(n(Q881Qw*80S4U8|+4& zn{q~D!p>2qhDWR6>yX}F4T6M3jQ=mr$egUuGFleUwg|n_T+|SPbu2A3!Oltzx++!Z zv#T-Y)nSy8cBQ8btK2%Qc2;1vQjgXAKhwMh)4ZXlHFivu*sx0D!mJ^HS!)nej5AZT z)mfg4X^R0v>S8RVmG!GD&_v)>Wf!BCH+mDzthvOB>TEN*bWt?xBB(0odF0=T+LK6z z>QJ3-nA>-Q?^lN2NMuQ*tCZ#C^JxYzF3QFWi*xY8l03YaU5Eud*NX^08L=;u(sQ*I zS8AFPY>fd`B!~4%6GqD+_#(w@@2P$g%P+mO(ef4reW*I6H zeX!S#pn7m4S|)a(V`e`#oP8PnJ5Cdldl^x8AUMuwIXSnI{e+F`D4fhj}SCF`J1 z5bAut@_bY6kD|m&$a#V&a{Ez0p(tQ|DrCKq@sHU|A~J|L&(8BmOP1+lMFe7@JEROl zS;BL?NN>jitraiH3<{MI3)DtsoZU+TI!0Il-b<_(^NUoBv%Fq}od9n!EiZ?Se+lc4 z-r<0aiOtJ~JVc9=grh`Sd90%pHu@BgDj9D|;K|3*ILh}FD6(z{^y0}3!qGG;oqUZN z>A@{mIy1hSWBg;p75K@@$#dZI5X=GEUKCm7DdZUIkW1UkcXptF;8Xc!QPDAF=!&y$ z6THnk;cMHEAb}U?KZn4wtF-dRsN3@qnvZ@Pb%b8+zOSQ>psSgAfXviA)UJ7qwry`K z`{1?_gpJz>w^@FD`ZgL@-9_8XL$t4Xg61^@AptngK$w`d2AIo@FtcKr7CuSQN10alkzqnXg_Nv6@yDBMep z?j=wL2|X!Yp%B_>a%}`$8!to?OCA4SN}@FXho8{#S*`Gy%nJ1Uvb2&6BqroMc_r}Y z79&zriZ~-|tipt#(FCu>0-wvF7-^cXp-N_F@PbwIwbZ0CsAc6AuvHU$vJ#1ON+@#2$}^7(`CRodWSi|91yQAx--$~6d>%HikHt1g7MI1k>Ud;+)_J~iXDst}QqJS0kT zP@~C1wJHnMMT<~Vun_e{i_xmerjZq*Ut54-9goFJuv}Y2h-tB=LXQolCTw7fxwgcK zwZ%ScDhgq9VHlfBqS#m(#Vl=Ywa$fAC3tVG%~y!nbD?qa{a$HUg_n 
zMUyObqA|~o*5UvfS*l9BjQ@UukI-|~A|0XarDZ>jhCklS)r4R;mOzDx(UlcvNq#YH zuRwv`i#bJ%FC{E0yqM3pR9sGrFd;|JifU%%t@P9M0w}JWD{2Q#&lRQhg`svul-@=S z0jTlCP)Y#mp7rdOGv?}OdwSZQu~BAi5PJ1pJodugF@nU{TBMe5Ky+vgOmzb&3^YI; zX@Q}-7p7Xq$(9+$#|hZ#Ct$5&oQ-rtAL@XI@M{{|hJ82Q#^IMg!pMP(jH8UDqpJ~_ zTo3o?gz}8gKT3lenJYfAxgj10CTVlhp|Bj@r7H+J!jAFP)iJ18ot?0&Y!ujaz+Br3 zGb5*|x(P-$BnHB+B2J49r=b#bN+VZnlO3+B_P7nEDEkY#?E1uqhEFQ^Oz9-}1hgBN)XUMv)=|AHXLSDPY zybqMkph{_WyuZqMKihnM_$d=nHuMSJ^J%6TG5zPy>*j1vo@)S3>yCW>M<_}s z22(*YRfPx*)MWLcxRf!Q$znl1U!T&M!DBunF2N_mR^~J2E-0q$l`!I#K8v{eYUG$3 zkX_k~Jc3VIbRf*@mzaXNY8`A1+Zp%w(eh3pJbDEgM%re^*3P5fLhG?_A-(5Agx5Vq zeBuu8ksC-XzokI0bLV^L=J#5+zJdC6FQahv%f%rG%ZT37)j_IizW1dId;;TPZq^U?S`G(ImQub1%ihJy&iqKHOg zbA=~2PIF7}Se;0qnWa6MM61{w;n)+6qK9y63kJ~~2%<3%Km!3-%aWqq1)UDK2|Js? z02?jJ!Qw6_@U%2Nbs52@fu9%K%ZeCg^okiRi15M&_19xcw>;;Pm zy(|RPIS7>Ts9l1nb`cU~3(#oDMwhh!J(dD=XtU6x%g2yjjb(;1Oc*OL!FasZY{C@J z_beNjb%e$Sof}(J9&9P{U}J%YunS<1DvCp8NgOd|u-_QNcE-~KG=QB(BM#V%*kRFN zvq_Dqk|K;Vf({URBSqyHAOJh^)aWXzL`$w6jahEgWqD|QZlv@5NT_*DjJ#>RjI>cg zPlm8Oi#D=ky$nV02E!=lbuZNDkk67sh)Tp2(_2Whm|w_`OBjPo%~)7!!Qyh6)9Iw~ z$&UD})Mi!$f=^`+DMdpY(6|UY4-3Jk^%HuuK5dBBC%w8Uu|5KWmZYz0Wod)8u@BzP z<)~i13C^ZbM#DC!1I>&<{C=Fksp^M?mS=5T1xv#;>eC5KTFATUFWn2|2Hss_dh}C^om?_5mz0;5^WsW6;-@v>X2h; zqVW=X-ae>+GEni=f~K7$QjUMv^QPkMUoE;l?houOUJ3CQ`J`8eXVp)4`PO5vAp-mmXOel*=L) zO*&s_pJ9U9Pt$a{Trd-`Cc?NJTyprXqxl zIY`qL7s^uT+nMax)4m2t&OM#YQL8Y~>5gjKfyOHA1d{$9%#$ z*Wl(&&B(@5XrTdG1%P3~FRFk~=TFkcGSG*sP{F8M!5C=DG_r+nVQE8UO&7eaBZ&19 zP%X>(Go2`L)9^e^C~@-W;?KsG!c{*GTk~qzT4v#HUITMt7DG`of_R7!-c zq1nlt;#3{2t`_CeIU~<0Lvdw{qV!xVR>w#xLrP>kV<90YV;l2~Hl^gUE-7=HE6jwQ zO!w0(i#;qVt5BZLNr!|i^R$SNTR@vzM9@iDP^2Y5Gz5uit`w9g zN<)yEDJ4#S2*FrV*;Gna+L1<0uJN3)&6S#GUw{5R5rZVHPwI_yc1Z$~F{$#5Tfiqn zR_3OqNj<7#olio?1~03id~Rk2WA2OjCIx!(d~be%4GRhx%ZnIsiy3iC7;!aGf-lW; zP)pNg#I?04^Kc8j1IQ07LrHQ5`j%a&>^p`SqiGcZ)wrGJw&hhMX6_&`b_t%LGjR8v zgtv<(*nb{r#@nW?Z=sc@SHJ0XR8K!Z)ymtbpSZj3?^!^5rp^a5BzDjRwn&6BRbYWS!xRQ296EYh3e`!TpMmV*s 
zdZH>qnWh-!ni3dG)wC}i0v0pUGK3`*MoOYJVV7=T#ch;l6og)i@tQ50bh$?vbEDMt zI4`Vhx)X`UX{dfUd>&XBg=OJSWdTsW@5*u=oU}40o1s>8YZQk=+?;C!P2$CDaD zuo(MoMc7fU##-L|8;UBhzSxL~{89|%Xwb1(kLE>{=+2LzGbf6UtT0-#W2nvLOdRa8cT*6PFS7fk5Wp<;KaaI7P z_0!k_X;cI=sPNNv+;OOEArzW?$|99ga`lXJgrPvOh;Wqnt6}67@R<^Iu%sJd&9uN) z#c0~lk8t~P#Ji>#S34Mu8&N_?s~wFfbG5)2l%4yBU`vlOUQR+^GeH=wgdsYBa&H@S z&PM3?^QJ%>jNuL#6STe-n%&YV*xHw|jKbBu9JUVr4MtWemF)wtO3!QS04iH(k&V4H z$fdB*Bq_HoeRWB@{(Xq&L)%p2Fcqy-d z9~TmI#aKev<&>1MVX8nWW3q|Y%*EFePNb2P7|Jvwy{*c;Sw>SCh9Zw~l&O?FN+)E5 z7AU*%xx8MAj1!fz6{*_=H$_I~Xv@;@%J!lxN=F8vrwmfg$mjL^S(<#g9ofA9UX*8h zd5qu%W**IWDZla=_#6bbi{SIq_`-azNo47&c~3Mb3yKyvx>4XAL}7F#O6xYk*mDrJ zk<$pTxsBM`dx)*Rjo|oY_y$kGv-Ajj-G|`sIs{+mVT1?IBD4D592mE~i8P~dV*ECe z%Wj}XV(*HZs3!cXSKUF?%spi(`r6emV_rdVk+L7Fx|oGFr6yQZWg5O2f{)fyA#+j* zKD~i(Fk4Y(t3)}A&0>XzvC?gu1EAaHB=qKHcUXDiT=LMA(KW!&y*!~V55MnqAt=w> z>^8JlV$|Brd zk%xQZd3Z2UgzJNaxZGKYo9$&doyf!e9tX~)wKyFv#a`OuA)4@3f^KWE8EXnmSe9Fk zv64y*<~z}s=S6p(AMG@}j_d?lvIw=TDui>=@TiiE>~Ta0sZ^3Cz{o$hY|z}fk}-<@ zD9we}J*SuzF1LvBrG%AGqLfxyT{l-}AgLwP3T2oY>CXgrL)fht;DGb)oEuq8VAcdKYhDgJ&28iK6M z$w=wzgd;x0___j)rq$3_uVlO&LYc1vS_0DKYet2$mf)*_+Exv%uSu~*ORYSU8-ulZ z7$(MND^1SI*lKAe+!$Sz(%1ttL1(V(B2YVEAczg=W=7Zs{tcR4gs_vIRGD(;i_q)@ zc0Lr^`@5iA`)yrpeEuhsE(CYFkc%Ep<*@Rsd8xEP_qO5YF(f&tK&2>b~=lPYg zP*y!*S&*;BOS#3E&vU=9pqQl)3-hH{y#RSC6^cuB$~FKd);l*-Q#g@QU?)q7rFh@V zGrKC<?8*lG*-(rh%+b&NS|KpbrCH3Z+HoC3_xD&S4c%9k(3 z!HP)3l$a$`)@06Z4kKD#g$YHbN~kMcDCIdVrTrK}{9lAcMKFbO7A@^i<%%+@5*j=f z+JY=JLu-&RFaV7`2CXZJVoQi63{_=}@JgV>nj-jWm8F&{XP+J$Ha5kX#+GnFJF^Py_31evd$nuCo4_o@qNpx)45DUxs0y4LmuE747bQOij#XG1^_q$}5>_)+r{X#s+?z$|xnu7;WSAF47p; zKoWYnVyhB!WlCBlUMR5Rr6M!_jpdv57m%!ULK0uOPbW0isj)5udn+sB~~F zyM~l(3cmars>W|9k+*u~U7FtmnqCo4CR>oa`~nmZd_~2wjvS$ip^$Ij#ha_$FIxxv1LYZ2R&)ljqunoAI>EQZHX zh+t4fD3qa%k+iKb2dz!CxL!5Jr|noVWy0i)9&0yfv3`RJ>t>6wZcQoHOc}6ZScj_k zJR0ddY@D%T-AWCPZn5CTK`)N2F2R*8WjHbQRa}`}jQi`gxUsSjwp*0Q#B?m4Oo#M z!gxUtJp~>#7lasV*)Zg^A)M6$`(lD4FAY6y&!SBrK#-(jExdUfl_(??CR0BLzDP2r 
zjMy(I(GYyJ43-7Cg-XPlPb-l5n+u=quqM-+q}MP@wwo-IttE{pFj)yQS){)22 zK=BbY?%H;^YT98hh4Oswt89?JO50Pmf)wx-dl}<{SXdTEmbD3ao~2OB za`DxxVQkz8L*o``>o!AEy$R~fCX{A2^Lty_AZqw7XMP}?8j|AU}JIIXRBlKu_&r3exCr4!)Lq=VJ zk$^~DN&vD5ph~r63Yc^{0+6M=yc`uYHUZgl07{XMpF{3B=;UYRceI44{4M$00z2io zB0twNYT9Uo(m5gzPX%`J@HL)HAdyON2(p6D9dUM*WO+4(<|h`}EUA{-7Q z8H*ECEm*pADTbCU!|;j~=p7tJXIC#{xQCI|16?Hz(@hvNej5ouv$h10$}*%JI;6ZB z)YX*nBIl#6(}00tGcWc$boXXq#k2u^%kr^$Z6Vgq=3>LF3L9tD*i2*FwYe0V)~GSs zw*)l=UPt3XtXihW_8BksOb2mha~1ZksKoBk3hZBD!iiNj+}jbs#VH-G%@pFw#6p}N zoQG4rFW`KC0j>>c@nE?gZ%;e%q_+Z(nk~2)bmP3qjnm~W9Mw3ntDq7a3BGj|ajYs! zU`3V>Q$-m@+#0myH=sVh9?6^r_!qarvZw*YOQOhK;(>+{xKbBI#9N1SoF>3VL?W&j zf|!DAUL*rp%Z;qiMYND?!Y(JD(92h#H=h7pKnszwkXG>gp}JJ+#VW00HZtiwn^CyT z6^4^wbyu}Ax;C&h!o~P#=f`&bZ%4cdu6Q$oRb6mJ8lf`yQC#6ck=}zsy$=O?!p2Of z(cUTuIq9^}hkF@o+fhPrt7v(pjyPj&jMp`aQbJN=B&;m5&0!S#=7yV;%eIIzXP9PJDrVOEMT-%vpinGm|Ftlt{~;%TZGZXxkN%q}+sE zD8OhMSKt;W*rHWz;CWO!?4;3Wy^uu$WTA|3B#uBh1~cQgmT=avh-tpSzni7EqC8rM zN?Nb8sgKZaMTtKJ1FuJ*wHJ2Uu7)yDYWJdoW>`+hm+`$iJbps98i}O*pSdDQOIGg* z#nMbqUINmnth4u0{@mJne+s_&GIXUZf^0(ug|2oK`}?4ck3pB3Kv{AMs@NooB9kZ# zPog9?gVOkFUZ+`9R;^=WxeCaxhca2l2EjKH185E#6G zI1Mi|ep4|#0bhmyOfmK*m)}BS`5ny5%gbZ5EkscfV=kd5MfqI7M;HnCo)-mx8p4m! zSDu4O5rF+?_$ekQ&jbIY$j`{n%Sy!3WBlCg#P*D4lM;vh!GJPKT*eNG)jdBpwKOZH zC-9S^Y^PYq&)MQg$B8rGM=%~$;%#edE4n*6F+4bg;o)Hgdi|rL=pWqr%W=5oR|SF)>?#nT=)G zur(WN*S&~U6N|BI@I|Z`nul%c3vg&pIks<9WA#J{W+#kT*>A-1P76lrO&F~(tihWn>hSS}6h6O}#3v^`_~xlF-r4QJqxEJyT33OaqlLK9k%ybr zC3qNd;vy^DAy%qGWhU$`FklC7tex5jHmKs5%8FrCK`Vyx`_Nv{k4#}3{Dhuk5u@9J z1{BN>V(|-BsI%NK76;%lv*Jk1%``lFz9+G0ZW9>k&6OKnqcdP}9xEuJC(nHrFcpaP zEu@hwVr5-KD^X@}6wR&VAq!H

MXQiN@xb>-b=#RaY|38VNI#mmddEW)H&XO`(#; z;f^*Tl<9&q)PVd_CyLAYb1a2~pMbB($Y^OzG6K^?9Cb92dPdA@6w?4zwm7XMj&c^Q zO`s=*ao1eM|3R=Z78}AHG{Y7`tpVDYKu&sKWw?oeE~5;ikwMfli$u*dpmH-}i|JH) z7-3xmogKN)dSG*u9@sh9En$3JEYVegyiD|eJ`GHaPV*nIBj}z3?nRzUS=LFStMW06 zG#*O?bZj*82+I;(g_1@bKA8a(RaWa(#-3)9iq*;BdYKdurVKpa5r<22ie7X-V?OGd?Ss|tSmqA2 zr9qy*6dRCw6uJ06+#M+Lbfb{Qm*?(7u6qFa-Vqe~$Dj$X7*+?EmLCV%7`QRuXAML~S9o2Yn zAdLrmoOrlHi@WQxaC_APJYHXnJF5zCcUpt1Vtq|{xRofxE5UMH(-+{FJ`cyuWjIoz z!+|^#4ly?DE~&zX+;&Xm3}dWl2CYR?@DY4^M%(fQjFSs!GYccoF7UvR6NDkp2dgHC z08J^yn_dyW1bVcR#f07xLQjrj zEYk5Iwo+7XrERgI8Tk7ZP88@Ij8=?=IyTeS-6BhjC5}ScUy(V?LL0NNVp@Yx%d|{!0 zwe+y_<22!yWL=4+Qiv0F$!gi5teRGyfWsZ;?|S(+oKRIb*!a-43Uv5taXIGmb<0ZE z?zT>ZmJZSGS{V1)7^Iq^X4EZoMPZ`#`kK1fK%^A7mk@j^zW-ta8%Y*<25z*ulqssx zxgo<{WaX6v8^>f?dW`Gf@yGc89mwVXUQ%RaBUnk}b@6-!u%tYK9AgUk)>`D4tC3}{ z!4gvq7MtoBcUzES>q3F6k6<1`sec3&p;6dqg2C#QsBT$}o`J1cF}WYp>rP-0X<<&RN%^Z-g$@c`10!tYiD667$6Ldo>$1u2pVB^QbLjxG2;R*Pbu?&rlV06WD zemsWu#uivrI{2m6IckG7;eeUYbi~=hS1^(-dI7q^#R$1_v3$G&9eoS1e8Pl{n?hK( z#)@6rT{v_&fYs}Y2|G15ZZE`|b@`Z?F2ttw1{~b$!l@GuoIB&d-d&~Gz0HK}vp&p@ z`mwq{h{0+rwhq=|*Jur6t{-PMMR9c2i{tC;xVFoN*N)rq#q9uoaMO)%oYdnR$1He% zw-s-$G2>oaF5by;e zG3YIo$Wzf;m<(RZ&c*y}T3;58FPp~4N-M@E(37J;Pxf0VCiIFcY{+Kh$u2R`;B0fH ztP;5zLQh?Rf>H|#)RnC84wRcBFjx}=tefC-qOgob>m)E~9TjfmNilj+XbD2a7+YeI z;W2&`7(B?=^JjQ0G%|`BX_b~VG}daAyP9DMb-@^HMVY%AYFAPjW>e;iqtq8uK&N)| z=Us%IBS6^6Gduo0Sx`YtNl&wr9$11-hMY(*>=K%jEZroaTOje3kW*GOd1iG=M~|5C zA7NL(h9Vy?X6NCBtXv-R@dD4=OSwE}Y%B^0yHc8?#qNYJ7*g(iW#Ye#nd4D9@a77^ zS0z2}b3;;OmE%yHHtyqjBc!#oENNi!*|?}HLs(c^iLa_m_-d&c3uxT3he>$pAR@iP zD2-Gj*Ad2Iix2rc=2(5mwR(6y*r@RR3+Od6HO|QE&e&dRaX?2~R5Rx4EH-#TVg7%@ zSlMlluO%h*wuVWqN&%ln$7`>Z>rF#u1D34|Vv#n41!ZB(XOZIqZ4?W2aV)CHAj{N% zyvjBd+Pi6_y}TZMDEIYKP6iRG8bf2pYAhSyjm=w6;lQCA*nj9QCfA%oWA_#WYGz=s zS_5nKMp){$!_u%DmCXm>Xg>;f_bK@L&ci==8Nre3h_1MYc~+~LFj5mt@?0*x!_>?)p*4%1 z<~F!_(+15pnxYrZc#z;r!p@6T;r7s4mY`IV4Zk-JD+s%Z869S4J(!trV%Nqn_H7Md z?+!aQtt-RSSRt18WuvQM9(o$*W3pe5)gx9+4r(#8q7WC)+i>B6fyIg)Tg$L@oeM8t zpTg;V-MDnP3wO^p9F=e0S_NaS`R 
zn%jl&;wHEkrD4t`^opWztJ0{<3&UCzgwK>h%v*zah&OMXmdBVh2R?ZiJ~yPr!^)^L zn~|$#lw~9K5-YHPPZlSbpCi4yMTDNL)22oifhV&$3K+k0O7#4m?3iDHtYR9HwEjhU zWYJIsjCm>p@=DAsR;aYnV;n$kk&Rzx(Xi66;uKV{;*_({APQKq)s7I#Juybw2rK5? z(3ldL4rdLZz!XHDAw>I2qKpyP;BRGNtaaBg7AN_;jJ~uQ4Iw8BBnaqA?9v-c*h!=l zv-{KWbqeU_p3}){GqMV?vQFJIqk9fH>4jAaAt$}93wTtD*quBJWkVrl0pYihU&|sO z3pM4?8LhB7-0;YF)_7b2n~X87VI7d^lNlOaN(v*S%$i9kuw&ztu7=AUK&cp9MI|a+ zVVFW01?a`@I2`pojI8y@bw-hGlR3Rn<+-K3rUmYX4qBBTFY>%DG_YZDhG|_cEGFRR z23Nb;K=8Z}a`N6~(Fp-x4&khpSz56aOtSE3B&y8zP@u=xDsfk)sU>Ni4tEfGLbRAx zpHu2!BjLhJDmUg8u@t&!pKgBc#X?zur96o&eGPI=O|-yvHauNWJC{Po*lP~-!JisI zUHddfSMI{rUFUH0jK~!cn zwrt#l!L}~M*@7o5ZlwHCnr0GCwoH}0flRSD6}%AT)&hhXBUi0;VQRGrn>K_nGw#Cb zWe)VU=3x0q7N#c(F*#g{bt`PxJn6)tElFHG+Kmeb+Hh)54Q`yR#ii3$y!%!Z4<8tD z>$(B2J*vgqPy2A|LKE&^X~TnS4S4HO7hb*FigzA$;@UAc-nbsdy`#nW1=Fb@2|QV~t40B<9X76)rci zTq7BsFMFTbyfgu$f%YX3D^e=2!0$!ci&Sg@-y%jjvAwL~GJ>z1$8v&@M}lr~Apw{# zSx!rq){;su_=X7&Mr}S)#)$v@2)ycV$vOvxaVYh_mERS_Z z#@NX`XIa$4ABrfe70Z-JWf~akfW%kGLma48tC|Lt{Bn6u<3^gHKDCf9{ z=TF(r(MV%*pv0esrM3$ZnMd3?in2r_mQ)5YpWpkcnzqV@Lk5P+LIz7%4{~L>d7cZ2 zzcT+>CK%hh0?2A zP)PWR@y)?a#`sB*adpo>mR?>N)*?erMp}!^l*6(d3Mw&Xz>noKn;ls)iBi3&$$EK|Ybhc%qF1-jFro-60#*ZU=!?<~&8jo)_;P&|_ zj_uIm>M=duec6GxA6DR#H=OwBsTbdPwHB{jPvf-*ZTRfNalCq~8jsEg@aUuyPmWdM z?COPh<4_sCcg2dIT+rityL0jB$`|qT$#Q%>lZ%i1YP`kP_K~U@57omsw|E#QRjaX9 zy&QdIU09JGz*1i;ma4lks2V|2Q8zOA)u>ekkyf+e(eYffB@LCEASSKWr1>PFfHuZKL>V?~tlUos>p&g-Cm;g zEfDxI>drmeW9%y+;8^D8mGS#!JeCp?&w#=2FUXa@BZHx>$StuElor~ck%qvC#L8Lf z3_$A-ql`rtW-N~;VU%Mq3JX7W5Yp1)D8pb%eQ8FeAS04YcWZ$m+)S{=k!zsM$+Wmi z#yOf?5sj|UB(o+6R#tWySND9li9GnvR$xc7dj`5c&U2;7N~eSj*Lg9wfH9R|<56Ju zV$QSZ&*O_(Im*WZyT$p1d|#9VogR8d!%ByXX(xeHLe{X$FxAwuZb+v<95SOw2H1<; z$(9A_Of8}b!Y-7Cj#gV>a`T)p=JT~>%hWVR+R+QK5grR;POw9*T$%yM@#1-)Q1}-2pa)r<%Xrz^q zg#uh27b*f%`CPtNSI798oI61RJF!3+*C$pe;A?JfMn^{n`uh45$c@tA1b9P3gdK}OZ*+7R z%a)H~#rPN|R*qx!)D$+(%wo&zdTigc1v_`{#NLAkux0x;96E3i12nzfR6Saq3A9?O z(d=#@>?3gVMk|f@ppF<|tIEf+^?t107RI`*VeHyhh3RQ8dV37$A1KG< 
zVbAVboI2TwvuB!c@@4Q+RL=QOF`T^Zo{?xw867}+}xqV)!7_;a)h>b(u^N%Da4OxmOmtjKS|p0 zhAo1JrA@e1IEveAY+Om34ozsfeiZ-+vlSpga zh-q1w7$HLHYJ@d4i0T{Quc(5vJcimNO}c+2Iy?Fl=*=x6DfZ1$!-~if<@NJ~!!X*N zC^eW^`ATVW#fteY;!&PmePv0u0zX-$<%Mi{rbRQ#XT{9b5DXf;kfX*6x%?i>S96Lm zZ!yclVti$>hCfRSR9O_@iQ#D&w+x{;tc*-%Mkjqdjq*sGwikrfB;CHrkk58Qi_3%!qlp#RP~7 zi_6(q$etPYF!BX%?i5tvdK7pwSYY%c%MwJ6*$|BbP-T}m$+o5xM2w4{l9xt?Z2g+;?7`%RXcnz8dhS1VKgj7=tj4rn_yD1_I zjMP>WbP)yGGIU0wuAgbjXb&K-tWp8rygUQGDub8vO+23#%$NDOB_1p)rR|kxkY%W6 zqt(o~-G-dXPDbw@lm-T2ubxDtVIV5 zW;-hC_rTD&AJ*0*uy-7XOL?|;4#B~Tn3s~)Yj($w^G#UR5_=6Yi1 z#@NZw6Pc+oH`Y$Z?a4r68DA&Q-6WF!sntC%&wFM6*dxmeE8r8r$>3!v(&}}0cPr(& z)k$Z>bKuPx-rU!vmv(%`7$zrIVn%spw+5TnZNScrTd{k~4(#5!8@u-I#h$~5aN_(q zoIQ6A-OX(nY3e|&IfPby26d(k{M7kUw*@6ZD=MlCXk1~zn!Qo#>w*`+H_q$#;5Z}go}y0dw%21{bt^X1wxYYT z3SDJ=Xf7N=V`(26b)BdwPNPP{#)p9OlrY+>GYFKV;Vx!lSDHX&X&5!}PHbMk7dvK-)F_L#A{$l)I-lot@b38sci*}b=i6svdZl=94nzAz0ic0DVu^@s;6PfOu`UfPQxFAHM$(e$T%Ci38=%zzS0HFJWkE_`aQHD7aXp+>0>g( zLW-T#cQ~9d6H;Pz5@Dq$^?5I>08PsC&!0n1>`saRP{!E_>|~aPjOml*oqBtF|7db@ z92^|{qt!hx^7ZA*mt!TZZfbG@Gt*O8yLJ|vHf_ZAEnBf?`z{>Zy&s479m3ILCvfcS zS?oV^3Ma2!#_3BJm9cxfW;de2l|YB987WIf+1W`KcA|D=H+r`)5^nIJXQB+dPc-uP z`p`OBh^FB}bQ6Yc%M56r)L`403?9F?39r7l9=D$M;Qkx!c<_2Nw(Tjw+6^z@)QMs| zx@X2)Pwn{TM=^ZolNi4B4Ie)Jh7I5PZY93;9Vgy+V!->ahVkHX5Zk7Uapgz?7j}hk zYSxU;&bHwvM?3KOstmr#kNx2q z9BydGx@ZUbjRc==5Ur&{Xf+O^!#aREO*5kTN%%?<@DYA-T^*dcVdxjR73jsBP3UMC z#qRCLaO3)Yw6%52JzuI$@p=jL7+H8z%QExIGaTA^l=kkAMwGeMIl{dsF7KEBMQtI}jA`nxa>FH${iZ6i@M@U)kZxLgaEYGy4)c9 zRR-zG?1&{Ykh)l-462s$XI4kp{pa!=deU(t5ms#OrL1hiE(`N{Tujgl?8NLWtS2t7 z58+sh6`$u%2JtGB_E|qzWbH`-pR7hC_8ev1^Yi<5#Bx0dFnV;t8KVloUTkCA<$ zp2q5kqbw(+~}~osC2lbnyn5YiV$) zMz~s+qJ7OK3~btgrWKR$H@4CMwJWdJ77QUkfzewn2t>om7*C10^7v}*nQ*Q0JklFZq0H#SlHzhK%rEEp)GN@N zPw2@KRP#%`cuB?gpiN*Q!MD)ZK{epcs#RD+t6RT*odUb9+h}&XcVqwF{WyB)C{7+d z#mIRUXD?sD;S1++^6pKXesCLCA3Z=X|NfaH$I%e0MciEnpRW#uRQMbpEw9dpNS_~r zYdko3wh8NY#4xqlkG?e(XjoQ+>OK{Gb&Ij~P(98*oWkWN8}anx12}nS5J#@pW9u;! 
zj$X3j&ch%+`Faz+_h~CWeKUb?zn#E0UJc-zZ-(*VD`tH0nFT-jo(tdp$c@LB^muY2 zfKw|BxIE^8X=}s05!px8e1aMp6=j zj46{%X7n__GFG0VGTNU?qVL?m<;8{kn6S%H>5)eW6&bB_56WdZrD7dTuLScJ$C))%I=1m@wbd$(>Q`Cw)&9Iij_^PQ&tu( zk|oroXq1@|GFbTefMJ>5CC}w#I-L~hh>+e^WgH#js@UA07G;=8mOQJQdwwZqC%}^^ zyCgrKF;)dFZAf6}lwf35)~@p=@+>yRHzAvQ3E<-7J{+16czUJKL?7yoA^0c%zk6sfRT zVfDI|V?mjQ;8W4|nDqF$UPOn`wnw6H66$eEjGkg7wilx5$)hs4HKt; zp`~jLmB=pC@w}DOkj41SP?n%7nd{AcsWe1*Cun-r%0i=so*t-!Ln!tSF}e@X{Q6Pg z8bG0=7bX0>#77AFSD-L3@&BXit>d#g*6wfWEv3a>f+QhA+}#~w#NFKpNpN>}cc)Nl zKuf7Yaf`b<4yAs2&ROre_D$M8&-4ENxITN|nmc>;d}pnhYi2OWGzCK}GBDga2czt! zVXR{T)Lmvli)v4YHdoJ=&{#3!p0AdVEqS&v2@o~ z1jR&QITfIdwG(vB+%VGA8SnGv8#9IHE=Dl+Gr+7BQ?Y78AOge3z(0BnLbEjxk~aP;h4tlk-g#5^rTB=^DE%?7w|&L0;)^1}D0J@EZeCtNw= zjGHIiapQz1u6^K)E2quy{YfL7-))Fb)>&bDiXk=!nPHQgDb7cSn=p=k>LeOeOVbcgttWonq;b2STkF`Ldi5s$58Db2lAXY6BNfS~L zI5q?^=82eU5($U#Ua;r;t;cu~dXAXb-wI|UX{E-w!d}}OM&le{Jkbd*c7e#vnu`k; zzDHtGGMrtd+^v+bu~RM1s#5b@68_Cg>Ev@m!|U z$%t#usOw+(bX0M#(X#d&C0-@uUg`aw%GNU8WW3Aw52OkjN!V%9{TP~3wGn7GGFw5Z zC%jhDpH0xoHIi$lY%@w=*b)NL-!(={AA>bywuGTF^-U6{-`6n28^fr8hGQeEV z5{5Pys&7Y|=MCe)SUAQ{gIP$bvPt-B<4o|LmNoilTB7#^8SL?Zjw|cev_*)Wy9RdA znHXs9f_~=C7;5jWY(eRokdANyFEnR5j9fk0V6aRyC7E0u!^e+hS&fAHBrPS>wB)%l z7&vAGRp?m4SRHxm1O9I&{NH3LdJWd^i3FmoQ>j{;kJo_zvkkc-+Y(3~pB@$a z@KNe)C=* zjN-a^xWhj%0KqR|C(5oz-Tm{iD7zlm{jKb>k)N*`oR~d(Hs&puhlPs?yA{i@a?NV2 zU%vsHH*LlC?YppT_daYoZ~&W69>x59+mXI<5qwkQ5tI~~)Zm zf$_1C&5iBhG7j$kgW()H5U!C!F(p(v_5%_AKJMJD2z}@{*aBI5{igyL#`W8Q2+Y*R#OWbj6 zx*bmCyW?brFLs9nV56%)HdzH=r%@trsIqO6`r;i&R7Rx zw}uTrrf}-}U*s8Mv*z+E6WZ5fQ2a1(r#kRDyR~EB0w6KOQTltClW~>;dP}iec zBg952b319HXypb;S_&<#R8XJ4dePd6`)v{g3}UUZZUl$o=n-RFroJ%=f4g z2NA|&#!sNS)mLCA`^EgV1QY9sRCc9q@EY|r404pN98)t3j8iA**Z{mgT7xQTu zAhd@QdZQ<5V$4K!jGUkbwTa`E&@tA-L_=K+Q5%UN<3#OgU=l@*G0WP{hu`7q0v$Ry zQG3$Wp=(52&(~!=J^`N{uZ84%%EF~Gb##(GAwF6U?-O)y^_{>*hmF-h>Dy!Z4xfZ~ z#u%f&rX3qOabh0mt3fq8&K-T&c=y%tRifWSKlCFA`)LQFpKb_ka1;g^#bYRK?r`%| z+W8EOwaJ0HLjg2hW>f7ggpSX0OcLmgnqb14(HbUJP73&}C`U!@^(Z~*lvITj=-;78 
z_J2eV@Wk>;062S8gWUGm~K#6$(RlXQWXjnz_0|!*43oLL)IeFbIA5|F~zw zVf%+WP%t|bY3X)I&2xm?R5e)p^oQ-_LGTRL#HVUZmcmjX*yxm zSa&Q`bHO|v2h29}L85LTLdFLnStkMUdWo1~5CJm3$!M@Buh$5gSKl8<0MP#S9fn^02*m!Q@y5VW1ma-4(nn@Ls9^V6pTPtZ zmE8OMtXNVRU8MzmlOPk|NhtG&88YhH12_pef8%`_oDenFhatzmL;CP!lv-^2aQ;SZ zT|L6fT!EWvI+mzlNyj4WMB$0zvmx|sMDek1@qd=7V(QxZN-}YO+307`SoBplfT2$? zi~}NR&7APAt~uV*vA`=Mv{m5iI~H#Zl^xLdc9zX;(9J@_}ttv zTDm3(EDb|FjMLFnU^iB4JXQKAmeE8^GS$a;U0Pdh4UC#N0fu%qFr(88WPSAr3dBSc zS(rcrCanLmK<^}Df{*u)^aCqw;I-*VQIjca>awzq^gj+D@MH}>IS$gKve!i)Nl+he zh5i~2ygp8B?CkOOCxTEodE#B(`~BFEzpu$4{NAVf>!%xvfdKbF-VS%MQU;a(lU~fm6wj`Gx9LKFb@mn&%xq_gdKsmW{r%*uEVBHo3MHFX6)Lv z3w!tO!~VSou>bf6SbF#nV%M#PPGTJNQj=gzTRhP}09up1*)qCf91C@lwI$S9Nhdhk zLz9JMLa2Xx{ReE`u?j(97Vw;^4YxQ-^blRR`e`CM!44k&#yomq$A;P1vtkBv{oPSG z)dhtSme?@E9~%q9ux&;tRuF2d;&iYrc@lOf>El?YJq{*YVOyj&4rl1&!eT32U+0b= z7I@-^X+ikZKNO!hB;lNC9=e^$yZ6NXqu%rnANS%~~(6Lbn)=Gfm_^EIw_-rS5!F0GS%m$ew)FK#D zOs2p=*B!be2{NM~IJ@%((V}fKVcB{IV#D5pIC%0D%i39KnI$n<`t5AxhLy(zm@Kz% zZAsfpu*rh0&eH#DV@c>)!ATZ#wK3;SF3TOU;@gQz zcl+Vh-ZHBCH{f~_sC$yAWw8Y5qf`3pdhD#gt@pclo$=Z`@9@|gZ@k+Z@AQ2i{bg|s zSy^ZTm7Tr;%q*;w#W-E~Kgq^ZQb&}LZsnZdTR?&8>E0}n1>}`ysNJ=KrzC~8NFG~y@rfMul}R(8e#W3<5fcPjiD3p z)<|8dK6|{bW`$vve$eoVfd-Y`SpO)Dql&Z5EJVu2eaPH=5Dv-H(O(u>aQ1^CdxSeUciy)c%bnrLba9dl!-8)#$PBn^zyRmX%$nwX%gLEvdX-@*tc zcIMDA<27J8+j_ZUYHSoHhXp~;(i8?(W>kceV90YFBOdu(mUeUk3iw1Asl<`YEHSqs z_;gG#aD)aMld*W4fPZ^175oT&^c!n|KBFxNdJEcJOR9Vu^d9YiciBk4JBAH|nv6rrpAmn1aa{<)d=Ri}LrLmmXb-W|gw4L}P zU0`hC%$v_ixjCi7)mc;>;U}_O`tJmK)?#^G_&k5SjKE6rus}~17?7D6@|-{}I5_yP z5!W8*$x$AMg-0SJJeJBY5iyL!m?WgfCnGyG4S6}4$e*5r0s?Q|!dY0bkm`=mTeE5< zwr<*tjq5imHg?y}J=nYN0QMa@g8e5?VAqKgn73^!{ALtFFD@2(X{neH8;j9_Q(?rT zbzBr2qe3vwSciqH3j=!#7L*ap&5W>q#ZoMrKNtRfF7OUAfnBf;w1XzXDa#R_aTbV3 zb3s^=8>WU?VMewO;=K)!;c15pTUO%9olo)P?it)Uy#;qpuEW)X)A7yLIQ+0B0XMg% z;@VmQZI(YyX4&D>A{$)a;Ec~_>fmtL2<&zqh2zdPIO`UIk1a#-txFotTSw!Fwj;iA z@xV1tZ+xI(hNBa#ut(hjtJEf8tCc<0S=(Zzu?1F}5xVLQD9{c-wpKKv$HgGVAPc@a ziSRLqgU$HKFduG*Df+(f*Pns_BR^P-v4Jtm+=9R}G7n)3&kAAUg7LPla0-vYqRm^7 
zoSRE1xv=c1UZ`B`SjKh)p4d0Bw#pJl3oz*0e^l$&{(7IrI&Tm6;3Ur~*fhRmYG~ zV_8u~(_*R?ICzsPPzD!NA?YKkSrD(j?0@^;(F4EN-t9|Z^7!(xq@O6cbyR%s_U((l zvS{a^ffzDkIMgPLr;VMY?Ea_BH3uXTX+xbbz{Cx`^&K$M(O*$wx`DA6>k$b3pcq(3r$WmskapJ&?@Ta2 z@9_qBcY-0_Q#YZVv!Ft=!$>n)JUtH#FrNZ7?^En9$*jHs|+fs$-0i0c@xE(ET`Jf{`B7jj8sGbqPK##zaie z)4&+5u}WyM?igAd!@$}ER&Fdy4;KY^ei6a2^K^!;xgm5+^kHOWLhu=5l8GMl`PiH; zL9UOC>`ExAn2o0a;X6is5(W&Xg_qn+s=W6Iz23tOm2D&67-EdqhnV8cVHS9Mgf$^( zi+8E|dXI5H?{RDx#=8-G-n=dZpJo8B4Hd6`1cn&JV3>I#8xPuCyDW@#%4dTz0~(&9 z_LfpHkF|rE7Hy7&C*`OcZ&q0-P_<^As^wCtK4ri{K*w`|pE8)iR!ydt$#gTxvy+5k z8GI19Nxz)LKUJQbON53~?L`rOajd8*iAcyyMOt1K@}^J2teG<~Z}x1gTDBAm=gq^q zwQI0p!v?Hdw;o$JZ^fSNdvNObhuF98D2|=^6noBmgoOw8AZqm@n5D*2zI$WhWIq;i z2>R1Pjq&$|QG5iPQ=*{hKr3Z57Q-eCfTsRfsu)WYw_?PmLn2odmt^#8U?9Fn3HOPEek_%`Sc3ZezgyIVjZ87m)DH8EJurhZzep6!@$*GPh&UK$)GM#5P)gsLtSkv8E-aEyePwj1mTJCo70T!#KIFq?{rmVp?kLzQRE zO5rvcu7qCVw0t;vcq$E_Kus2GRg@mVWh3fLmgIMFQr1+nrkXJ_HiLnoF=XDTp@}iH zsnlfMHvxc>*T?%uRs@j_ehaaC4s;W;G?4@EF?$*0;&<5!40(U?|Fh>~2R`pWD=e$N z$ltKAvV)0*wX&>{u7MG>^b9ag>!lr5n}7i$#^C)S!>JgCD5)}U%b*0+o8-?4%zEVu(WkhS&vSxOFhp zUE}$B5_E$z;9jr{nR`zmY1?5KC*(qFY61+y(lEw35W172U_3P*o*9M6-MkkI4jhF~ zLINhz@r;#ytSMrg{3m0g8KFg38Q4;>Dd3w3HJ$MoqdktwPaRq&x`dk}rpB=@C&$4r zG8Fz%p$Lo#gKv1CA|eb4L{nRHSUOt6h~K4WtVc1`qxfDn400`HM3?1kU}}xAng+@o z-TqXteTK=>^4iM!d#?`E!)t>K2|g1lT(g((v!d#=!`ox*@y=Lhyfe<74T3k7p9*}U z_6F)lQticIxJ431*jNGYyw2Z(C@MIwYeSLjps+bHu2k*3+y1{_}MS9<%2VrMkspa zq(LjnAA@}8`>oCKKEcqRl}0n1ty@kuENGQ&l1ryC(L%>WDYO>pac5w4$~hp&$p z;?kiR_$P{BFV?EtW22=j7MPi% z$jlP6Y^NYY+Y@;jQ<1M7hb$_*c+*^X8zjP+g&yh5>*|$;WCAaMiokZ94P1?UVP)VC zePe%&F!aDkD{qXmpRDWyVd)zL2mh&X^qx#85s*}Q%DhkMld~6f&wIh$0p|ARY{4gC z{6uwS+48}|hENqz84c=-{=@oX@Tfrqk{U*iACB>?EP7^otORCCMKHCsfEg8@xt%Rc ztqBW(nu9BB92qPx*;-Kgza#{La#S{$H5X<0*YSO&}Os!a6me$ZWvw$|CtFEi3 zj1&(iY=+Pl4;(QPvMS8`gNINR4pxqR2{cs<@_tzwfAn~DWmzXlDAtjU z?P=i!aJIDiR{Xn`3=3WpGpbb^DqjoM4-@_`MzrU8ww6#gF~Fd4YItJ+ZS#PE=p}mx z4;+C$W3;GB^zqtQeY~wJ(DT82W}bLO#{s>xZ1JjrE8eps%sir?PJrn}PlrwVLKsFB 
z(s5*Cf^RBDy0a3yQ4R83Yic$|I>kcGErAVtFb3EKK_@H|Anmzdhrl!@{LE>EO{w}s z?HQRHv7BY2Aqpp|Jxex%Ql^r_r*C4d%sLq`LJRK>*2EkAHSqcXExa~B2d@v(r?n^e z1{*Pq@#aujJ;oAm$&&NZ*GH?{d#neQpC~>*^wkbR|4CsOWEhR1W(hCVUN+TU0VcT4 zhNkC2^ctz*h!M0;qt(q6=uPntrF>T{5~vz=5=AHdFTD9*hJepn5)#LbRlp~Z6PWen z+zI&pX=i&Zt{er72?=pX%Sb|IZYpw%a#1+92(uT=#p1<_v1a*dY+SVgo7Zf{zFh~f zW7l5nI&c7ojvvQiTGtFH9paQjGl&&%#aCio!J0 zEToubAV@D3k&cCYHPzMU6?{867LP`jXs0kMBkxrLql&gCK?bL z6Z&C40Rfkbu|C-t?Vf?5 z_6Znd6@wu*u{@8%xXIa=7(5;0{4*J87$1-f>-0H@U4H<9i#8#V@SAh;bEItC5A&#W z7zf3|DJ}z1GZ(>nY8W)EEMY+9W#C`~Q`r}Vz_amkf!ox{@Qn;&Oodm7FQ0RQsk04i z2{s=-mq6gnU9%LKv!)?1HdFzgw&^5Z2Ll+|7-O6+mEpuO7*B<$$LEa*P+g-*&@s?q z`Op@#(Nk=$PEIgSSUJ;+-)r1fM6~9WR3$f#|0rqqz}e0&OnU-e}uQs=aAcdo!WoHV?f< zX*y!qI1Bn}9m-G_1ct=GXG$QIoNAClCJ@VHXld=m=15raIjxrq0(f|MDEW2*xgJaV z|5AEleI?&6D>DPra5o6bkq=JdgO5(*!o>^Ndh{@yr{!VLWN*A(~5-nxMAyxW$0{fM&7KsYz19lkQWR6rD+&EJrE=2PKD<3 zU}!Dzhuh8!80NSleqJ!X_-YG2I8cal`wMV(dlpWuj>U!D+4$jT0nV%m#wV+M@y#Yb zTv+XhV>2}|!@C#uWLx3u^-=hK^JIKH&m2c+jgO@m<7Ad5Zf|u&=@B;^o~DLfVOluu zM;mNrkF$n;_=2s}DMD-?!M4pP2FnbRvCuLe3ur?#tsM}k>5UlU6y(}ZN2+-mqS>;= zm_;GME(S9KauH?h3ts}ybDS;Qb=_cL=nZuvZ}c}J>>L8omx{hWt=ec-UTg0# zuMgCX`(muYyBMbRI)+Sq6>3KBVuE>pj5d6i(FdcA-p5d#K6r1;yBMr76k`p?W0H+N zOxX}wI@3lvTTw-^Vi2-6w2QVZTS=0aJt36fy{F{9{R4m!{{*7|Q|7Ol({h`bo6~Na zDq9`O(b&uk#^x4Eh>{b)$$KT)T0%Z6BhCUmOItf-moZUxR)n36iwA9+s{%V+OWAnH z0{zENQkJ_Kp>NLr%?d+wOweZ>E!s#;hL$oV>}>+?ZA~k@u4#>4MAH}>s2fP)7Q;mDDr_~fHcaQ5su ze0=sZod5PKtlhH>rv6^&ZNZz%%>`qlqhL@}2m>lP{SE72ymuF@jva;J_8qX^x*g7| z*1>Jo0vPxO!*OajCWl2Mzo-Z;wWT=o(Qz32`9eE62@zj?42QD^pm%U31}}=mfIK^l z&N9QqRBIF*n1hPWn|SuD0lz=*#EbT7G~GOlr`6w}?cO=mTtA88%R6xGt0ma8$rE$) zM`8a4BkWyfgmastaA<}f76<9#NWK?N7W!d-t`!c=pMdWUnxOh~cYL|X45vI?amF+X zpIBz%lu0~3whqD}V^{1lpNu62e#p=YMutfk@*Uh!=D8b>UqM#XfmydFGkwR7Wnbvo z^@oo2d(gJ&11-i_lh-ib;!SAU^ycv`OtgCslUxR3qJ4j8*baf3`9KUMCHj{!Ek?kvkUO}+7!i6`FBvO#ZM zJG^h?4lTbFSSHScMbcu7@y(&lO(*nnF^JF`I|qyFN9X&QfMSBf_mb7 
zXe7?T1X^LO*aA$9$cB0945aNmjujWKAaVOaBy8P>%)N)(sLs8CSsELB$%?SES$`tYpM+m{RvQ~>YJ#q34PjPLo)-Vf6u_wfPgb+S&}DY zi-;;^aE#7ER+G`1WR9Vub*nMH8=DbPn#=A%-RH>UML2O+A~kWNUIDbS$VwE4D_04;08@SXAD;}q6IL5 zyLT}1rq6~OFOsU|6xa#;B)xz)4S%4Kv5BGrWfc@rZxRB$9>9sJ>w%se71eM*nbLVjG)3f-LarT>U zFmwHCjAMcHu`^}CN5g!{ z+6H1jE6qY}S1hE0E40c+xs4jv= z7$T63nY*zMtgNRJKHeB=F&RT#rm~^%#oJUYgFM}#=HUjHNH!X5`E6*|oT&0-OFvOL z)=o~aB4G54^e}wnU`9V^Xpg{H?Lp`_suzZgV-S3LRs&#QGZ0$lqVnFt(22b;O0O3z zyoO_v13~EA4~A|7q2ur#bR7Fa*Qp=W2*NRz??BDE560QNkMY+1Fy3+ie`i08))PC- z_s|ln>k)tkV_{%C5vFGP{B0(%wvr9rEos4}Ar=*=YLHcKMD9}pOTeY9ULwz{x@LNw zmr#`A%aG6Xe6A-1esb^29w7ofBVH3tePfJN*T(SivWz-yzJ)vb8@u2=eFwa6;D7-p zRBRR=7-H>>;Q~7cf7(z#40D=_F`iKv@0&=)mIO7wWK8f+f{tIbGL6hIBniGVRw8cQ zAp|Yk3+%|KLcI)VtlfW#;y=cmIvcrxrf?J?14 zA`Gn!*eK}1E6@Y>9@5{cPuNl28EZ3i6zCaS7!i8<(9zR^B^9Ovt*#@>QI@|~fzO%B z+@ANDDf$mpr{Wujw`Gp5GEY~vK!LI{%Ycc>)U~$tXv9uW(87kn-o{fd`7h)CF-TDZPtM@$ufD=p7r(`+&pt=yl4TfAJFA`$1>M=xVZLPp z40rE<_QCxyJpKV}4t@yhEr(&SXakI9ErVI!3|JC)E=dV+i3vkMZZga#J7fQ${b;JK zz`ot9;TGn?meU_j+ZQ3}+G%)vwh5kJY=zt5Ik3ug#)5tMIQr#o+^zW@O%2zm4zHoE z@;XXye2L~;pQGdEIdtB+fZ_`Wv36EC^5QL#9jA{i%O~UD27jz6v_jrgbu3G;(GPUu+b{~APF$CvQLvY+Z5T8s*$H(3oI6_G6HuJ+yyI8Cx{ATDTp+GMe z(+rA`VvvGV#{i^x`Xa_=Dzd$jQ9#&bItL;~%MziZ^bk$^6shkDPYc4_CIF-D0x*D| zdEe0&eH^{Z4pFQcaxws7W6CmX%U1bB022TU|!<iFp3{!g(BgenZduIT~>%50i8gCI)1lgo_pkpx*+Li-h=B5S%$6?U0 z=nDhq5is){3mv<`RDFFg((FwPH+!AOH=t(O8|pUwp>015lk7)9-%bsNcH^OMF%Ehr z(jCIqe61z&!~ zBr*e5v6+aVkceBh9-#}C!7Yc@*ux7W2$XRw)5(bmm>d?h=OTW3Ho`KK6{{N*PZyUE21jp47+C4U*3B9& zlbzt^>w=)D0NBy$8dCkK>x_ezo+cX)HdanHFtd_eOhVF7M`^$u9at8&*2+v3D{EWG za$2Sq4j89lfWad)@s6y@^4>_i`Ti)vZwy|4UqVgk_j`AU4&EX7-W@KRh*R;6vQu*S z-dFd+yOPSH<%j-y!5C~5P4Fcs;8U~BL9a zgq7KzSe|Q(&GS95ex5h-A|_!?nj?0^dg5e21P(if;Ipty9PtUmF&|$Xo*aU$o>^FE zlaEa8Jml!lLB8R9WSV3m(>)kjzEd#Gdn)qW0+4Luh8Pn&B$_!RjS3^m$`{T?9xx{G z#t?Y#JNl!yE5n61fs-qSxe$1?(x#JrXklC+8~-@cS~%I*!_k(ih*r-pEEu~_9>&J~ zJF#~A2CUn%2Aj65#KuiauwnB;Y}m38>o+gL#;r@SYu{Sz*h|H}YbmzwU53rO7h}hv 
z71*?QF?JnWg{_B{V&j2@*mZK15*rUL!uo^rvHH*~tUOeNRflF^&EeVDaBKlKA76s4 z$CqQ<(G}QvYz@DMv40JA?&aUuwHDiU@VH|gcJJPdt=l$W%hnCpyk$Lu=UX;n`}VDT zZ725a+k^f4_cHb|_Ve*R?A^Op2|1VNZE0(R_f<D+;#+WHtfgDwR=#shOurRW^6owg7y1RxalBf z?>vb`2R_Dv{Su#G?WwPE;L960d+i=RzH|)-zxW2*&t1gUuP$Tbx7V@e+nZSZ-CeBt zz65J8m0{hba;%s5wiIhF-o=W~Z(!NEt62HvO)Niu73;pZh8^{5?I}Yr|mOb0B zW$!lZJ-#2uJ~^qx@w2CK^7D`I;l;B!^Y!OA_0@Tt{PHYLvVQJ6c@W1wI*Of#wqwIi zmdT+VICx?o4u5a}XU?6%-orbwar;`VSi2Of)~~>}U0bl1e{aX`?bx_^9oDYnJ+g5P zwzBNDZQaE36CP~Erp?>1VbgZ3Ubhtsm##(8oMo8K1|omfQcR7EI^Pg3abl zu-LO3W(NV+&d5SoQaU1&GLV#=kMx2<_=g1{ zH7OD`rI*lLcNyz8q#=nGGk&fuyk|JVKGzA>@wNyk48iOpD{%b!hd6y0H(>!2_~ z7t_OC5NN55tf`jRH7^0%r>7#DE$^O?cx?Cb$3}Nse3~4J(;-3F?e2zU_Qu#96^7-L zlTm1&gLKnEWO?ODGK?9FRGB^K5$VVGbW zfRRqY=u3sy+ldO#Z8G|~xzpY{Let$D7CxS^_hjpCOABYg7TVqsP6VFJpOyV?qf--5 z)>4P!#wrxom7~0_3{|zosI0z?s_Gl4sk_7TyNr9NZ!AY`Ln&(NOHk8Tf=a$t-EbGB zH8)VvcpIhlH&EVq6SW=pP~LnK_v)^oy!9rETd(78>s8!oxr)1t;?^4|k!WT#-ax4Y zKU-0Mi{Htpy@iUZoBaE?P%VGEsst64B`B{bMp?Nk%FFqo)< z@o}YcR6SS4^9nv!{xamef}g8EX;~R=-@AvKcZ+fD_C4IVSB`sC^|)P8hb#9gaQRL- zez;qS>m{|gSz3=Q-kmC*5SK*^|(~pgsW9;xKh!IAIh3>r}jST zI)6m_;}>Y|euT2dHe4;Q!}kQ$cNI;zRNaBA4Uh0$RTqA!dxUFE&vCQ;XWZ)i1-I`1 zhU@o##nt=2;D?T%aH;hf3O=x)E0^s)})^ zx$rC924B00Yd7xT`yX!N zt8cI3^NT;=?1jtt^o#G6`0PtQzIX*6eSR6Ae|Hb3KfeWs$w}xv!W?f6w^WjsC5Nx~ z1P{EgH3j{20tvow3^k6$D2o*IGPm)jjQ7GMBR48NH<%EDaY=bdBQWe}ePkDPnJVNU z{d%+oh6cuPa&#r&RA3Xp{ZrlbsJ|XtE32Xu6%}Fr{P~K)Teofrubm8w;Oh>T!gObn`mpef(=`9 zuw+XlHlNDDzAu-vWln{6s1=Ilr=XlR_SgSB#1B`_;@QJy;MZS(Uw#Mv{1f=?7qr~F z4gCHS{`|2UKRvC%j;$G(TjYvmGkvgYc@z$>iN}#u(I|}8Lsp;x0!$6yG*%l0zCl%8@`(N`b4r?_B$U@*3a#bRS*CJJ0)kY%5UEQc%@Nj`KuZ2CWt*mP6Kr3NyMbOd8NvE=d6IH!~H3B1pQQlO;3Rz8XRilQLp|-IS zwe_W_t1m`FLm6rb!?o+PCyfQB_;PzR>tqEC|3YiTU(2|x;iClYHF0{YHC#A`)5>DRr2%YC}yK}r??om z2#f1?igB~JLV@0`@>;@PDR z{9IKNZdEkmR#^j~(8BA}iRQNZXl(94MRgPI5_)&}`t9;Y0;qvNX~x~kcEYC{#f;lk z9k^cEhMRSrxYhgsw_6|MPU};Ycm0gAj%TQQ_!GK*`Zu~?{0mLpPjS19Zie4^yR;fN z?h@d|YyfKb8~MN8xpND5@7>~W=aJy6sH;LvQyr>U&ZUH1aV3Gpkn?huaYa2F0hV(q 
zk8(eh5P0`^&F@yQVXL}_iW*+$8os~sjsm*d#n*7{)+Jt->%4~7aqG^tmm%jj2s~9> zCv2|}d_P?M9@nn_pg`~D%^SFJgRkGbjq5jVDe${|t_Hc7rT*==f_{O{7T@6p#+$rd<8-&3I5g1__FYD>~ zQr-E$&{PzkCrmB95fz_y^XUipgckMU#jo(q<*WGM%Wtsq^ci?8 zSp=6I8)0>17i>Q{4zm-7;QZlf_<#5*0`?z3*yiQ%DoBJ!N+=R$W+Egl0twl%RF*NA zmJx^8AaBG6Psa4L7#!QZ5p}mNqvrNyG_Xawe)A&EetQC+d~*cHPH*S$S%G86wxN{@ z1%Et1J1bdRYZdU{KY$lM<6qqm@cWbd__4blRo6bjk4@j9@`pXRczh-ftPH~ZWK*ol z@x-akStv>|MQ(x-;sXs4=WPpLZA(OIPsUu^Y^-(5#qpRdoa65+U73mxldZ5LXd({B zJ2431z)UP~oR1>Ag~+l^L7sawW(7oIHtpt&IDgEDb-^@0ZKT+bLW;FMt*$##{DU!h zN(i()r(%HnRJ={a_x6-Xyh|HA&^-{tT_-DPDe7*nF!1q!xtBYvotI(4TnVbb9>~=;5IO=oMtxHS z>YDk!1{L(G8te}_D$0FSV{eTRzV z?mYrb1wB!Ew*(+1mAHJnl+a_`y-U@17dMIt$kGZFSJW^F8UpY--+QI3iYl)XKU7px zaq+o|THGkF;p;UhspNe`5LOU$ckWf;`t1sSe=VU(#mM8$@&+~#52ysG2p|58`nJb} zP#f-03Er+GKr5SZv!Vs}YCEaIT2a;d0QZ0T9glwg18onVvcYLVDSu08DgPdS?`;CQ zqOuz0gnBU>gW?J{F11v>4K*yQdMb1RQ9xG4YgxgvA>=9uy=sE4nhLO-jY3M(CAR+{V4q8@P7!d)&Tzjg7!9mJu5uMln_1?b|mMuw5tMt`Kxr zZd@k#E)#5*aOKKnD!pq8^i+zEaa~m34JyKG`1;$+xNz}1e172@CC-x*=P!Ow@G&l2 zqjQNwpW%jhbBH0{8fk&I$JpZSagOM%E{lr{`>Ggo_e?w1Hp4i&l+}!_RYh^mw&Ye4P z;J`i{J$?|UK0K}rSbX;R7x?1)t2lG%23CCd86vmsgztgMe%TIXl`~hY8P29b439U6{xcuE0Xkx{BNF{RV{4tc3TtN*hRV!`IW7?t) zTB-Wt%eeFX30(PnGp?OqiG|rVi1QkT-HXC-Vc!yLpO=FCczeuBn~YFLdnCJrAju>Y z%ROgcv*#Qf@Gr#IxzlhVHwXt~CgDJ=4t9hZ;c#*Q)`x^+_0$5|*!jpXnt^=hcr0V% zu_-$gOVb<)Q)A@v{V6_bh;!0J>SP~8O$mdCZx{?F2V*FsH{tgdANQUTiuc|8F^not z&BY57oZX;HK$?1a!Q9mi<~H`Uw61U<_>{S#&N7Y43IgrT=FMMOOV!s@sYGLIjS{tu6+Bj>zNM1tp$v5`w=G6^&FURE(8WmlbtXjI{*j%cx{U6QxWZ17Lc7J+O6pn&yeD|{;t#y|?Z3GH_-9_{E`pyht8PXyfq9SM zE3R%KAgM6xn^E1~N%*y)s-=zUtceX#E9&c8QC-!@>%ypSWEnK0vbh1Z9nGj~BXn7= zVsj+~dKGd#c^%7nO)IK+oe8>1K3_%13H)kU{*^5MQocu^SSo6dQN}1&g6fRWtq;VYcKa1j^z_}s-yw7XZ}a$_j6( z*-`D$=Bm5ny@@```g=nR!qLmh&P%bhMrK|xwvdc`Z-hl>ATd4n|3FXH1k;~nfS^Fh z&HJCw>p6<$O-!UM%gVy^>C+XJC$@I&+O^oab*s`@eC+5Ee0t_Xocr`Md?CcXd;`ZX z-@=N|zeViP58$+8H>~&W!=#NHV7_S+qCWTlX@`#@aqR|pB%~l~<{V_CB%vrf4b!qx zFry$Fd6`s}$x+Cr;#;{iAKTW?#=#wnaq+`l=&ZVq=WW&a?S32Z{73xp=rR8F=P&p# 
zXj}jI2|rQAUA}$^7cQN{H#q;t)T!+_c3=~}K7A0y7d}Jz#jjC!?HZb^DdX>a zi7gun;A}Srjy9uk>hKzDS~?f0LBUuvEg#F`^O0edfhC?Ru)=jAb_UMJzVK8WO9;nj zdBHf7<$=SAcGwYNkG+Wz*c_dN&HT(PuLLae&BN-DY%B}%!_FLk?3w9~C8=5{iXDlp z@Ue*Wo`mSYDe&@+g0X81#(0KdSYR{;1Vy6vTC^NJ0pzI8;kzK-iM>36`ENgngt-Vgtc{4NDY5S z4FM>suc1X{dmCG8m5_6JPGD9?c-0eLjqTNFXruaSAq3khXmbflg0HcyoSz{G3B$VP zQU#3FJl3`nlx>wNl}F%JQazT7g`$P7lwc(+Qy?esBLw+aDwZmEUP*XXso+;%Uq=8o z@K{d>inZl$XQhwBleamtfZ!$5b7f2+Hj+`jo@p=HG=AfD710{Q9wrvd!@JzH>z53i|@O|_uQ&! z!7Z`61ZO#c)&Afqo>95|^y|OT!Sf1&@NQKDZu9k<1m``zSKxTBp&6xZRDsQnxKdq# zyNwN~YHuf8Yk1unQPz`NUAhkG@I7vEn|%Laz3ul(h873fJ< zK?Rju6|ZkC8@LAA+9rOcPRgAPi}D`n)=(N9mEtZfyRAgo%cvCPO?XzbOsWaJ@-kXy zUejBI*tKg{si3Liq;KyQ?J&W2opJ5PRYFc+M~i!dc36&7d)KdCBlxc2haaxs+i$-k z^uEN|^XGByi!X8R%WswbJqNEayfHunZw=MQ8^et7+6Ys;KFShriq(B-bKjph1p_Aq zp_iGJ8-eFemFEpJYd@IVPC-O$7828PRPzoUWU&D|#ojtOIV#Yb8rZXfN{_u2g{J~w zOiVP=(o(4Ma+C!>7cX9d)vMQH)21!hwQDyHA32OuADqP5k3PZqa~E*_5-sgD0`L4q zMD0BY$4y&dy=@27<}HB1vXuy=>Wf;t5xJ|^DU;!&3A@y^L@X%C#gd|2ES{B*{LDmT zCPZL>g)uGhyVWjcl<%{fyaOU z`8$66*RN=M-i0T>x8dRQT6A?+qmz~Qw}-9xrM(&d>3$5}h4}r~7wCM{iq#vIK+i}U zAz=|XarhG|u_Z|N&O^Fo92UDw!6UiwH-)x$G$jU z?2nJe_PBJcjm*GWUh`EkDOeaZ6-Ay_*qP4vf~ zmP-`Id8cCN)MWIX8iU?`LFnT(8N)ohF^-|;a zUr~cS(5s^5t)_~U@9eQz5|y;DO66o_lTg7=1-)v%R!3MjHr6Zf6X>Svi$wRSi`Pfkw4=yM*?&vW_4jsAzGo5qj5XU2l|C;|}fY z9V)cjCDnvqE$&sdpoGvXX?jQiJ;1GoZo;nvH>(IDMsZaezlR{KY@q_|z;#0J8Wr2E zx;EUdZN;4$LX0r0BCOi(KSkH$7rb`egj@ryEQ2cY7LT{&x6o4Gtf7r1^lmrR<8nnQ zZr1X5H#O0QHW1V`RA$Y1@vr~j$-n-zgD7-*Sx-gjTX;E0m^lj=Tss6sB*n%Z3()PQrcMoUgbSrZ+?!du$GV%=vB~R z72mt7fJ~rw``&F_xp@^o(B|HF+0S?NI@RA*RsY^CB?Nlc2|!g}pV;0@m#FrnD~#|v z`#JeR%X{J5>#+9-#w+j19w_>FbLMsO7@^?=Uqq;~AZS>)%(Yd` zu}w^fK~`o83JYY#l$lt*Y&AA+*iMzFS_MRx?(9s=S(cCG%c+bCUYKmJYl z0r;IqK7*hC`)~aE-#_sCfBwK9fBXx|i#+fN!jT{OQ@Hm6_g_52{8g)?)*#z>HC7od!%2@tIOq|GlTmK?tiT6niX8CiOe=gg*A*Wxiov$zFsuuR#DTc! 
z*c6eD%?W8(n-GN!>A{#EW{DN?CRmrKgDq*gD3Tr6ri?&Rh#_V#%YkQZCWcH7!I-Hj z7&SEwLjvP5z&`?m{ev;gmzI_`c8HxVM%dUulSI1q)XbR5j85HP)e?0BLNlM+=o)Ya64rj^|QActaCNbxI#z z1!`YLjr8Y_?WbCsg*%2~0hs47HVs0L7YE`3hb1fSR{Ryt7@Qo-t|T570ZMD0m5H#gzYqld~M zMo+(<612V&GJqii9{29i&eGPFmdW??Z#OidxV(xguZ;Guny`}ox@z1K&{6eCAD;|x zhzh(*#doia!S~;-Bm`+G%bFjcyp77L^%-v0KfsOZ4nnO1ck9}fez@C&oy5&*Rfyud zUCY<%I&iO^K?oMtx1pp#lw&LI5_WfJoo`X?-Bf_b-@xBg*3^d5#uh%VqSYn5YwA(T zsNr*W%V^oFYti}mF&_W=Guod&K{bI_-qOG_Xhm~JD_T0+(Aw3;V=I++0~;1z4_<4z z=F-^I@jeg*FO8cL^;FW*wZUTzkCj!t{*_dB71Doqi`VZa;dg@#l2}{mXH_*w6$El= zsa)e*1Q=~BRi4BRD!uE3pPb*4$5eXKzt^MqZcy;Y zDsyVrtd{lfc4OzxgE(~LI8J_e3?HBUkP`JGKKuSMwtoF3qIYb9_qvr(PfvwGb}n3I z&qw&GbqM7#EHw+6S-F^zmy4MNnV7$_5c5{$BWHRPrsp%VBd~BrG8WHF#=6D1gk2$a zuAPAo_O3+nr89U?djl_MtA6gP$8V3?f!|&z^!k%PqjON0CdXg-_%}Hx-vofa5OBZn zLCkmCf}Orw@v+}}oc2h^<(x=- zH`@~zS9s&|RUSA!(-NOA3&OsFIBZNz!ST%b*cwcg7aWCMX>r(|9f%!y?%0%RhE?$z zSQDp?#o-f>=cj?fL=OZ}+Kb2}dRQ+|b+{qb80=pONq3^}(;;6++L< z%@`F_ZdFYL1r!Tb`i2{RN6! zo>TQb!ri6^xYN*~va*#_Y>e9k-kq8b!mbl{sSxET;JZz0e7BzJka3UjyDRp#R%L%h z@!jU{DQRdyWosuY8bv+U;x2z*DM47y*BV;d2`O6O$|{ucf30ih?_-p)Ov|X&OL%R{ zUfNqxdNQEV*w#o8HmRb$fiP@FTX!?h>sh{3dyQqNlK#~iHdqa!0(t%UxT;3LC(yge z>nDAAH&I?8=}jybHc$dS>3fv{jgpc(Y+!ET<}HHmA93sUHQc#-QyJ`#zP)SLuJ9-W z8>&a^)JnzjQ?z*tqc3Wk@=UqdTGZ-e|lvKohU^n}|0D84!FXcypM! 
zvbALI(YAPRtONR~yP}uAsVxjGs0ge*U|~Oniq8#E@mWY9Y-AZ974%fIF=VMJEp1%{ z2C0Cj1Z`~(^kmdFH6lR5CJ%-~aPvOj`r}629Px0wj7xBS2 z-y(necDNN5!6GjoI!S48m_7^POV=Q7;W9+zfgni{{P5$`$i4Z%H9CrzavM zBN7F<30OQW8EXpCv8FH`TNdVH>(T-o*t`HYzd4TT+vo7Cr5L|<*5Q|q7X0VMkF>PE z;l-bS-~}P~lwf=EE71MqPquO|Xr-RwRz)YSS9arS^&?!Z`Vp5)e!(|C{DMzTKgO?k%&69_!(z(Cmh2f)_L z2j()b(T3$n;5j+Ua#5<4Y>muKF?ON``VJp}HwM0jR|oXMtAw7pn-Qv7*upnc5ilBQ zeVPcore-QN!l;SRYo?8DZsW0?Hn_8vR+S2n(cDcy5_&E7YtZqy9`~O$;=yyqvwA#u zT8GX@)o3FC8wkF7+S`V1g0h=dxI>^v_0S?pk>F{dnxn<7Vdbi>lhhPZ4ODZ4Tmuz{ zL?dA+QBT!TNBdPP9qv_Sv{I~?gkE)ZC0bgV@ZiCHrEl+_(34}&0EYDE-ID=~;(NGP zN<|`|ph6O*B!d_wRRoOW=@D`&=t)XQy%J?qU-hkB{C^*!sq29Ppxf0=ih8@-_z-1; zUU|pQw4gttxb+EcH}L*a`uG}OLa&uj>qJRoH;Nm&l=C|T-aQ3&?SCn~V%pSu1fD8r zZv}46?QER*dQGDOy?d21f=pG)sHDAZXlX+wVSkrUub^tLZEHq3p;sycQ+zD_vz4^7 z(x+D;{dl~F{~WokXE{q>Z3FLxdNxE21Yc8YwGs_f`Hd_eiTZkJc&O}TRGm>(b%!?h z29I}ntqDaczVfoWC@a0gvMHu2FCpBnk=hK%)nGw{FR6 zFTr>92EO_32VDH-2b{Zb3C>=@c;o%i%2YLFk%9q}@cJObzrgqQNGrTM#twbQJE513 zp&9he>|kQ!1`7vo7??XDDj`#Wo~&P{+PY4lCo69V^d=#2YB0hhL;r$aXqe>dsW$V; zNKd8$%fg)5MTFFHY*Mbs9YL zsqhx8M%1juh|McNZqW?PoG~2>=M-Y~@_AUis0i74$w)|!L}q3prllugVHSZmEf?zx z^9jA#*ueLlIJ%21#aFm}^AcLCSpiFV1Kesx!@c{seWM*eT$`(?b0794e~vFQ&*N0U zMw|~>j<4gVZ`XVcwry1*A(Q~i+UVucx@5m=d$iM@H5*qRiB)lt#3vvJrN zAB zwieLTn}`7;Wsixsc$4(Pn}bzh?O{yt5Ja>=&8=+pTd3;9auRxta%`nCkQ<&VPoURI z*tK@k0(aH2;<|{ z|4x-8Jo9fzI!q7rMA7v?PeAui=+)AaRntaQRZ>;Fgr20X2=qETJN|-R5BNm+{nejG z&8DQ#^f6;h32S4HKO5;7$+h(PGcV1|sq-mOxlj}+H5qpF$T-}w-=9rvlE zXi;lhaZ9W$)m=&JbAs-e79 zS}7k@RM75LSKs3`Cgf;!D`X_Mf@R1?sisqo=8H8S@0zIiH$|9nyP_o>RwvIlg289xOVM?D~s4|ElGc%%; z-OI8Gp`yZRShR4ivg+}sja#v2_aPiPd=e)=_z>qlKZno0x_~u@4!}Jz8E%Q0@XeZu zh@zF4O3Rl}undVgvyqoyg!!{)VBxGhtXVPxD;5?aJ0lKhw1U~$sVF2^i*m9sFQ)*@ z^5(LAhPv<&lBtibvs$FTR@_tfs_-r$S&F5pHsT6#4NXNSsyxr_@4d#}yHFMHm%UKy{aaAzn!s z;S~op?{JKE^Th}UH^R;d>V%pxA?FenhQNe)xcU1l3tgI8$r9c6gdWS1>dxBM5>|E= zm}sbpq2q_(?LqJ0wf?W-onf!yz0t4Y{ZX&Ld5Q&99)Ti#d4xs_;nyUvqxBTvHMddO z(b@{|n%W64C8z>A8C|r$1YQ$i)=FF3^+eQO1Hsq$7x=mfzNQBPJysxsuaWPq?-b~% 
zMsaHiz1pTSs!T$af2)o5s;!NmX(Aj2a0H(mdla8yajRscP>uv6lI-1q4NGODDVCerR|SV-ui&=?;_w- z0F)4AS4{gWQO4IwMd{VGqNKK&f1B2}QbvgDc-}z8)`pt)Zj?8wpm(pTR_V(t<*}ls zju2}g#OesWR@Aq(pp-VYvWfpY9Zz|E71dxdN+n-UM!X3-$-}H|ln~fepq1CWkquwX zOZ!?~OVw6Kh`kH}ohZE8+IvdW*Ow?!M>y8V`>FhDYvnn4AIqwqN|Aq8)VPf5mP$@w zsie2uAn?T6-gpT>v9n@rRU^3)J>V0S*Q54s+`NrzH*Vw7m7BQm^$&3O3BeohjiA~a zgE#xDz7`@fz@~L z*{&{}+4Kk-@~g2h`ZjiFHDP;7CDtb1#NwEXm=paGR%M>Uj>3yLz2pacz4ki3TJ{CL zUvV0@Hy=gifqkexz7h8h&Bo=eX=wgn89tg5O*N;5{9t>m%S*tTj5I8bNX5>aC72VQ zhd>igMB8{H+sPg4{M~VWdKfOu_s78uW9&;ez(<7Lheh_-7^#C<9uu%A#tXA^G7yzj z1jD%L7!x-O!(!)Pc=Q~M3Ym`KlhbHr<1vCD9qr@`H9J>~wRMD!iyLeM0^vij`bS5> zFDz79kW*PciRGqDq?gr59Bg1|Yfd$z@y2)+Zw`44eaF6uLF#W|@c1|2 z6=*|SQ-#*HMzm8IbhI}UJhf=;AnZCC(AwFEwyp-WQQe8cYwZ@L$JYou1$bSwl@F@W zMn%{8xR&6nqs48&WGLTVQQ>j3&p+QD(oB1A9>WV+k~EyAY4iCQLU-KMfgzl-Il&LLaU_t zA&>WQv$hMj>hDwGJtp{mMCF5@Q2p=)Y9IWFiZ;H7V7p1XEB$ zNEnvzy9Ij1RGXqYE2!@3_?v1wx>459ObGM$D$uK=Etft$+ExOun6|y9i49fDrh8}`L z1bShr5nNWVh=@o;N5>#JIR$xnxyr(zOBOH0#trMSed{(HI&c`LKRAOg&R@it4?jV{ ztobl^^M_|l7N%y+LF}|8NXuS~{LDotOs1-vu?WlN&B5};GqG&JG|VDUrX@!pFEs|4 zSxJaZjYr0eLgX))kD2r5Vd0Xcm@|JVRxaO+?W^}=|Ek0IaKkBF+<5`t?Y)At>n~wj z&KWFCIF608zs9lErTAn)Db5u9fYXKNaCG`XoL;mCUu->qOUIAn=DCk?^UJfi^W6p9 z`Sx>^U%r6aE9Y_V`~ehyx)q(*j^kPRIefZv5i*1Q5a#BENbex5C|rys*)y>)B@fA7 z0Z4ImLz2}btP8WpnJjmFJ(cA;@Gm>lTO z#>xSfmX^w(g|l=~uBAFC zDzYv8($3%YkSg{O&$|c?6@>2Ayc~d_Jyq8 z`xC0U$4ctTo%(KquZybh0ZKZa5PXE*BU;|hCsc!-1Y@fba$eeepPze(G9HT+=!v~$ z@NbI6Evpkd-HwLthp1>Kgk`{mpu1O9LnzXUODO*jZ9A=dr3_+F?bf$8QAIbPriBWw zy-8(fYf4c;J1Zem3F`^G=GJPow24BN)F0`uE1@cuC?*8&DpB8XkFUvKlBl{nO7uK0 z$J#nRCIsbmvAsRuYm$rrmXknFmD~3>_+<8mguqWVxIxQHtNSk- z)^-<;9zH_oeTcK4eohGPhNIJDI8Tm+cX%!Wl4l}3br$XFGUO)|U`6gcEYF{X)k_v& z*|OP~Ju3_8NuemDwJpd@L`-}rvS;TZWp+L?=FdP5)!>rVD+#`pSigJ=KH7c;m-b%3 zjh&z4^Tmg8YUW;?UGgD5TXGKP*M5hu4;1604d3JJ(u??P{>S)g^@sRw>rqsl{RAC1 zzrd4*@A0(j8ag|!;K8H2c<_*RtLj@6U-<}4x6k8Y$py4t{RG!OJA@B+Z$o@QG$JR* zBPBc=*>PDYOi4safIH$`%&{cW9ml8p;oHSQ_94_}P&@P@9d2ONC; 
zF@+6IAQhf$v=kH?3=a=CSectE+Y>t4vh1u?%P848+Q8V#h`<|zexu&QJ0o60uR(+! zqxUF6PvcDtobU<;jOhhWKXcyrw5QE=R994VtSGIl2%@~2sPr0WeVRM#UgqGjl2F~X z-j`8YNsl1xUMf4Wu>w39xR9efC(wKHYb&~5G@zMkujwJJ)T1i2Kd#~X2wHwm_agxw z&pY_NRGiICtcNz)K3NXQX(m9(?9RClUqqtzy0Xd@fi?<@IoHO-xb4x{A()mS%5>N{0R zk2dsPV>^LG1=jTh&4gO>!=DHzs=L-le7!?SR=(SEA7x$7P|ep%3BB7?bvFpNTauGPmJm^pcDr72QPe=+%qYP59k!fZKtOR%hHyO2W7KPV%kJqu7 zs!f!(s5p5~u>w1RVk2K`YA)gD%Lq*Yo%Hny7zKbY`(g=ONjFl>;7}zsOMag!#O|t~ zr|Rbu=&90JWHeWfN)F$xo49)I8sBr5agXpTMq)}9Mo-YidxO;Qc0WSzy_ewYHvzBq z)1af#BKRbg#ehmr-4GM?Y*`_EU}VY4WaOme>BS~yLfNrOfu4<$uF{jT5*!+$*xLvJ zA1hjTco^d26Of*siRse|l*I+stXYk9Ygg0S?jiIJ;r!Y2_~gUS5FQ)@8~z>dDe0IR zIg1LVfXZzarc?1PrA=L#KNA}lEWnCow5+SZ~V@dfe7 zT#&1*_ce3g0xT$;hl6W&;gfC0@#%&W_;B$dT-$aLx3_(bJB)k#E}`l;rT5_ye7@{c ze7Explzny`r5CQEyP_7qJ?aMj^&Efvb|3ifWBmDhCmyyJSQ6kr@$!g;@c}4$#5U7&9Emb)r3}v4#@_?B}>>F^8tUCI*ff zfY%4Tssi3H#*kikd(bjvuKWpAwdj2~L8s zxwD+sxdPn}_&1tZd78`k8wkWUzE0TH*RX=IGB)vCMv=wZ3h=6`?)|0oYH8bQ#NyV{ zX4Be9pI$?QDqTf@CyK8}=~WOCwGEAEXlYf_J7mPOhIX-@3aXAFxo(y9&8ncKEay?I zWp#50YTF*5w*3)m7?lKF8C76q#{+`=N8JDU4?O(o5B{J3;qhN+y8n!pwG($5+Hr?! 
z?lzTRDZy7kt6P5m1&TYK^0@~pCD`_a3h+6KsSrz=A1P@pGOAnFbYFpAX=4ZKx*nst zjjE8)Dx%w7ZojlD^=ju#>Z{#6%09N8Mg8BFJD)Ik!8k) ztEbE)WME1{rW96G-g^l^Npe<&%%>LciPgPx=cY2M+XFvQdP?8kE!DyVHwnJX+0wq*; z7j~?`2P>A~^s3$1w_q1ysK9*fJP_sOf?0{)SXSrPQxJAbPV@Q#)!!=j1Ua- zoJ@u12}56BxP*qnkFfI%no3z8iRjp9O!o04;H(KeTf)v3&UOy4=TYWMTUc8{$4D1L z#ty|hL*K#cLtjU)!M)H+pvUMv_BD()?uRi(eK1txRrrS45ZQDCoQC<#O zG<7ym)io1z1lGfPf=ogbov1vOg?&aPC*bQ*dYw;d_?jp|e&-`v*Jt$v11+;akFRym z`bsWeTXz-i^ShosClLAf+o?7MdfgrT9vM85!4z69v0h@AdiwOl;);Thj`9X7nR-_8 z`g+MNL9w|K${+zl1~w`i2}!EG z8Y;iq_J<0D>K{Hs%d?*p;63^6-}vzl-bw!gUi|(aJoxcfG(C7q%PT1?gjvU9+?7QH zx?iC3(XXg@_zTK-E`5Gw9nY00Z-0)8cEYmdF@gFBRkX`h?Svga+wg!&kPdKI|eU5oqo|BkM%Ds*;I(RWlR(aQG;D4S@F8wtJ^ z$@3)m8eR@;NWP^cHw*YAk-2A9hccCo*0-$up3=uB%1;(6C@v-hX?6b|;J8i@-ci7J z^VS_?ILBRM#H)5s62WRH@2U`H=7Ouv=;c|56XT@blSF`{(OQ~TC19tTc~Tf zjNSTl)`Kuw{~h>6+M$&IYVRWSSV1~u;6nP>6e~-+`+!P;H+?G=S=STE!)sO)oYHUi zV*@LCJDw7151-egi%L#Hz$Z3W6kQv^*UtC05{Rt?Ve5T9N2S$E*43JS=nU$f8?Wk)b?AqEH1Rq0ySJx!!wONUJ zs@*zX51C@7jK;S$p-mdHZk8(>ujY1MD>fo6JhrlNk^VeU^#VPywW8LVMbS}pw^8YJ z3h=tBSstqBQW__!yB2vL)ty*iMw>Y>1$*IWn z%R@FnS`a=5%Lu)>@rBqlo$yVYfkQKv;j_h?aqsZQDF6IR;LqRipa1-VU;q0Pez^S& z?MkuIpY;6qHWXKWhuWrVyg{y`r1(pG|Mf{6*s&7ZS1rWh&FgSx&o1m-x(K_L&BfyU zI4md#!RonwD2PhkLkesvL!&$S6z= z4Mz~wUTj<(ffocPC#pJ@QO`yzj*hZeDm!ygGOJ|{}A?V$>KlFHI3^KQFX5hS%mqC;#ERRq<- zBdW(ow9Xy>fL<4&r|Mjn%2-W+{hgewq^`V7i4f>DH>rRpgBYUr1bRKdlcPYdiq=+A zRfCzDQ0pm$Co*LrIIgK*`Un6xL4n*vbIlu!h>J` zB=mrvsPuj%@O~5U{Ruq&_1|cDN}xTUn!Ep;;8R6e_m4^%OVxv)QA^cVOVwA^^#aul znZ{N@yDR;BWi7PR51ym`5#QfNyDOHKs;-9CysDAboc~vKBkeh1UtKSAqNTsG7Rm}a zGPqM$MGMPDp;^F2g(qo1EY~I~Jy|KIg(^?dkYwbxPV(`TzB-Bm+Ejs_SX@zh-E8E_gAzw(-uqr9zWkoV0Lw|p_An1dV!wUUj7}zuco#{0iT3Q z^~q9HcL=x}EFV#RcUd;Kl@QA-qr2B{Tv2SU%-)dxJsHi-%FdxEb5`VaWCtBXXIR)%>6v>&bCNw$GG`$wF5@rg$^HXk zZDlH&Z2cJWaxFappFl4%Dh!E2#;K-?OaQxIcELgY*%a$+1YO1^$bMuflBL_LNsOV-)N5ZrmL}g_l zCMOd`%N8M{a5jR&axv967g2r%Nc1g2qE{|beE6B*eB_7ZpwK@R+fwJ?XvTcTY-~@> z!=*#VPwXPs>2@8Cu;r8DDJ`3;;o_W^e7-GJ>o 
zR^iyeE!ebt7S_$r!``(8ICWqewyqAtig{s34w;12`9?Uo&K4hUx53$+j<~SLAG>FJ zV@+x-@}n~l96t@7d8?qAvKBf;2cbP{FVs@kVVM6S40I;&9A;p+*BlJ@FT`M7e3k#j{_U@%Tj}l^|{KW0kU# zbe3i+$+icqJha8#w6t9WXeTYweI6fDLH_ux6%X&%E7tbDz?W7`W`f9cw5DcRK8m1| zj`5fFR%T@MK(B?CPJq_~Jy|f}pU{*3J(-CiQ_m!1rDRD^E-n-3m8sw-fGZ&sC2x=C zCFLq5C_t3N-;zpM__JPto-6`T($YmJJwxY<-|>{d`-#94rT53n>5Web!qyl6qN@81 zm5lQHgdY`ONgIJCwv~_TAO4KG`!5t#C!@bIe^*9vC5@%Hse>S-&Fzvj7nzP$i?TY} zZCdlPT1i)_K@~w)OJyg(lYTr&Jg%&%fUJxnA^n|=jn!ywp^B!`>w47AMog?P%Y@Kt z<9Q>&RNqAVN$5#3dIuqQzq^jgyI$F9L7>+ueRx!M0=o`rsM_TC&k$8uqO!sS<$c~W zPae??$bG}_ZjuCNg0D_;{Rmkl;az5bP`Qikl{ve`B{HS$@6lbcx&l2ps&e`M&f~j% z`xbI?vQ+|2Hi5EtfSEN}rEi9TBQ^2*dm|N_`^x)jRD6Wq_=zT%WGs~=0QzR`gq|%@ zvt}VOHcf$^y@R8YvLb^Rs%dCCgr2OfqJmy{L$00Rw9%e4siLHmez=1Pg#`O z%=s^{_t-&f-M10@j&H(|5BS+bi*Rt?Y#iA;16$TcWAS`@@-m9beSl~Ch7v7rC z3-4+6f=ieMD_;egc(b<)@E#I6GBPXucA~)exb0ybL!igX@Ygi6T0Ew*Q$ep?0pG8G zcGBu~;`wiFcu4q3|6R*{>DwddsOTO(ZNj5xO?WJdk&hogtY-yj#Ea)`JW?qVc->u% zO1@qLRh=wcT1W8pD7=66>508WZ=~JmDs5=*cuR z0iUcaEFmi=i~18KC`Va!MM+}jYZ9fjx%bLt1_V_eZLW;k-fQkcMdxEQK7N7DAAhB5 z`17xM+&}*txc|$)(e&dVsDA#Z5_M01McpIG*L#j8D!{Iv|AU95WBc>p(a`-IH3VTL z)m=Hiw}jtc+T2bx*-e$#iQBcZG!#L{qX4g@nl@aP$EqdhsP3xjWIiy@YpWF4DQn`% z$|rWZD~(y~&~ce4Du?@QML&tLZIb++?0 z+Ex|NsNg!w`MRj_^8XRtRP3S%A3dnYvnS0g&vu?S@^b_xzh4G6WWYqMbSwXE2OG8a z4we(aCw+aQ_&w!Pw5hA8WqA+72QsU-d$=XBMdHPFi`rtEs zc=;OEY~P8cE0&`mF9)+{6(E1kG~~>khV-HwWYFHmr)3~MGY@I=79nBY3WUvAgMiGX zm=aL{Kfg4jN9I%Ir6Yf8Jm!SOV#oAC%nl2|%*YTVPw~RMoD6*O!Et=>!3oS=G#5+P zF2EPxom2WlWYK{~&wy(;TXE#XMI7UAI`Y8*>^Z!G;LF8<6InR%aTX4ripQ!=9w?e? zN^9o=w_~X79E^6F2X+6Y&b+neWQsZ}vPRypEe{%4G3piIbvfDm$vY?xzh(f8K*(WN+o&BzxR}2eMlQE*0xnT+Nnkaddli9^(t%I13d}p(`%u1 zYi*&*BlP~Zw$i^xmDofp$M`4odVu$T0LXLYIX+fkSRuN^}-6(B)fSS%nc<|y^{7j`M;CuA*ztH&TCsaM6y?y>4RR8#2G!kx2PkvG? 
z@5A5zi=TN9{>pg#>wnQq3*6ZK3=LgRP~F;1_)+Dxck_F@X>(ioH>vIz75rbysagem zWs-A9=vCBIE2FkzZDj#iv9*%MBfyhT61eNxXtc>dR4dD`QyG{N@HG*5l5{SqNnQ8Z z*bq`(RA+5%ROPJ|=pgXAI%^cQ_lV{7=wS^W^S+SdLzZbz$m9Fnl~n2iJ>CzGo9L)o z2tgT;B5)c(`r}Q84@Ag1X6rXC4LqN?kA@oG;^;lfh;D$_BQ!P$#^QNS(T&KNF zC-hhsWG^4q1#5dpSlT(mz|0P66n6cFYbjIIUVU#YdTC5DhoyrLY+VDW_#82Dk`>aj zidfl_2|byS@lWVEDsyjTaRI4_kz#Eb(NrEONh!$5nueK$3$S?MDrNfFh7IenXYV$| zCq%;C-3?ip(-9t$fr5f{gvn=Ez2_8mfA$p)oWF=u7rsK#y!pt?%BIqrj%-@n^jU;l z5pC=AX^2Xr;wxH+n0c!ZvurE8@|VFYb0M6Avfwi%1x2X~X=}5PKP4Kg5;L$aGZk|Z zf)N+s2se8@1Wa*8Y-|`#5PY&h%Fd$)kT#!P zOu(E%+U=z)5xx5aJhq>M+1gJqZpKcGP2YqG!5c8jVG$;IuEa$5`55IfjkY!gWBuY_ z8JCXWoI)hz6(A-#1u^llw6!wTZz_Qosx0Ub86J(`pfFlm8#p>SDR$P>%0f|i7JO_- z@EP$qL`N56Ei9o$o2%vQ1P$H{W3A0F$Y3Jg96y-K?j3^fRrDJ523{Te2417edy^{f zZLMCg3o}DK0oU}9pt#?l8nn32R-RD0Pv|{(+DzcFQao=$H&t5qQ-bR$Ezpnkxc{7B zd{#}sHR351-?QIZcvK}MKX}oE4!%$N^d3H=B^Gc}4YYSuD$ylYimKv%M-3iOX>_x4 zw2ShP{4rJrS<qDj_1nYMMEv4hN@1w1Q>*$L<1WKSroK} z*R1t^3!eS@6iqg32 zaxVpXqV`mId;+{%FM%gBIBw$3Jp!=guCnjO)$2bXEj?M8VkywGv6Btk?Fc?PSbJB3 z&mN=I4bgi5m0q7w=%r`o32Uby*f<4Zl57^NYl)Qfd_>14(IcyX*R#TkG9!b~3l9rJ zRCpk*T>zrPf)E!Gg>-^uS|-(1;UX+uv=-~vZNa83+p%uzR)oaG!7U&d!3mkjo4+0l z*B-{o4Ig0Bu1^%;oV|1lAAk1)a^@{SeC{+>`e{fe_>zk9kyx0II6@?P=0Zd+SdY*Z z`w+hR2&T^32+z3L2#A=2xUdD77Pc4*BKfx?W?)G~I@af8Vp(o1Qlni^m@fk@zHoK7 zLI{8Fo`XkGM+MtNpQWvlBi%;DL`0OvTAPG1##o8FQ9oBWL3#WFGwt;m0q)Vatawntu=m z`TH<2W<7?u%)xNGh0yg~2kpQGQ1eQMnrAGGLsQ|KRe;#SnMmUQo|KW!MkfZ5QL=w9 zuUl{sA|t{PL+Aws1i*z5vl7tB)|^iEl=8H*wzjZxafKlprXjkMpk{50Q8xBab8^LS z9*0`kV4R~XG@RX`?e2w zT16_bHr}{hyy5RZA-o8zc3RsGD!tBUR0PlE$d;TIxBV&M^-}{L5qc8$f2KwLk+%3b zRUV<&Lg+o9N_<3sK4!?E#Y2AoBdSUzs4~RXcC}MwurkPKZF37Ng!JVx1bPBHWk7?f zMHZ`UR#t+M`MFf3o&3!$0zI*}sy@9&QGL?iRY%~l(G$2yl$D9X`)5$uF)C5zU##=yr$2o z{JK!i|D{Cc(Nf7t4xVb|W9hr2m93RQ3zemnrSqi^udbfgiq}Y4i%09{1$|)4qj#Htd?|_Lq7D%Tei;hhs^rRo}f3C8k zttCn?jEW-=Q4tI(zF69=w1k(?E1FN}t;G8E8?j^eZmin45&n_U2#QZbbk$?h4L-dkaTD{St}OX2Lr(mdb86;TNt=5NTt=3|*ye=rhrSG!`_ 
zdOu|6IUz2_3K7BP*s>}H=Z;6?vtuFn;9w%Q?#xBex;e<%y$6Yeoy+D=V72Bm=*>O} zjl}I3?YjbFTo*&ncRh@!ZiCj8r5Np*2Az;JIHycU*z`F_nK=iUg@s7TOrtW6rK*$p z%&Ohyl&%D->(Jm3_;`B2PBve+w}!K;Bc;3@ENyMra8b$=bZSN>7-eb>b!Rt>bMe4f z*$K_g6+>-YFw)iyBduLAmhe;ebU+^?O8v3D3BNb+ruN%-TmKEbLg<-A8ljeeYw2o7 zGc8cFK#!`gjd1FIB=)um?X;zB1Xnw4Y6qd$DWkUpo8;bg(B?`A>>mBrO8eGIHPeFb zpQ$EZ(0WouK7LLWNVO;GL7=DVBp2%>3Xh+q-RW%SceRNs;NNPef}_%FP&&LN7mpQ1 zEbdF_H8j@n`8vhYc69J}H~$Si<@gu$2(N!YPc{$wpP_nQpjRb%dsULpM>||qkLo(Y zjw-IUsU4L9yy{xqB^V{2PiAkF%X)XRC;$Oi-SGgGoexn>ut~pNRp(P>6t=4M2^HIq zyk@`gTK$HWCqJW>R<@c^+4WSJnpVa0`p1Oc)1UEBMuDjSU(hDEK6<1myfVVAvR)F8 zRm(zE*6_CycFHKN611}~Lw2iERyk%vQ7_l6Np-BH;(PeG8-M)yEB^KG-|+1DLnS?_ zmDgJ}-b5=&IoJ0=X&$c#r4~9?{}H z;AfQJ=ey;*S}PU!b zCe_|;LhrUfulP37SZ_=%OkrU~=W1=KKu@}VELkV4Y#d-=Z4W&oGmKDEM=xWWK-jp1 zE6~%{vxla(IkIvHnwSKBP&H!P13f!F7U+da1(eal$Z#Gbs7PXwoScT7oIK2&Nu{-9 z0oJTtitW2LV&&T92n-29Qd&Opi&kRE@`G5oWFM9+-GL*=&f(*8mvQFoCCp#48Iyfu zkQh&emso(fL;^3h2;sT&5L&PVQwvwXzhE_jW^P9OoZU#z--4vL#R&7uK`ud;ADxdB z-)Iy@CSgWg0;UrjGjh}UJrVEa%9OO;QL~edOt!y;ru8c!Ufj3g}eGrx8its2Wc)IH&JJl2WwxnS9-W<%@T!7?l z%Mp9{FsAG}3FlR(;J);8SfzakP=M z3sxa(;RPu_;H|YL; zUaW0pC6yDETyU9@E1UWV^pwF00iN{TN&E#p$$^u;v2tbLLKJc%6|?k-s-~Wm zqvL)ne*F0fp1pX4`w!Y6`IrJcSy4w8jg`Kxwi1pkV*y_mzpq>7b~melC*W)2-;5$nV<_z0 zB4Fbf1W8{J=;chC{r`rZwypv_nffI2J%gokMiW*kDae+6F=iDIXmhb)(=u$|xegmQ zuA(B0L{@qMmM_|l6^r&`*7Wt*wc{wxocSE*&R)jS#XB&?D-k~KiBydoH7w;J(b(-G;P0)Lkv1bYS|(9H)SlYJ2uI2HZ@ zfpGTmfhUzxbP=uNg1MNSpAC<^3`{A=gF}2GrczBF`1A{u@g_ccVTEVL$&f z7{(mNB+tFjvD*c+$;V+CbOK|XmSKQpE_D26!zE=Y5*Kbl){?czp(@UpF;`hEH9j#x z0bXQSD581ILV`jN5-N>K9Q^%*VapcF+?2qRjk#qIgYc6@A!G?D9cAFc0-Cmt&~R|Y zC@Tl5I%h)82}3LiKu2!^&<7(Nyr9GH8sp}KS9R2=^!lQg#`}0vuP@#<>4VqJUxj(B z2`e3MezyFLw405OsImllgkAR&Dz>Kt3ss&dysjVl+)pipoXWxq@C0%KI|X!vUnf;x z7ZqSPU%$`yJ>X}a{Me!-Axk`x0gbL2RcHCj85m-3A9NF%5BT~cv01GO^c3JRBr#b= zY@2DzYv4 z(25$F^IF@2^6F-kRW~WKG-P3druHuUL?Hc1fIa!?ceFfs%Ioqo9{l=m!tOC{Q5}}F zbh8nmoqh1r%T$#6w5`A58P(f!ey-!^KT-ck1-;U?hbU=%fO6ViQHhfD+#)vj36BpS 
zqqbFcJ5xbVSqo25crSrR@C)!{A7BCA|9~FVoj|W%`pjg?8P#*AG$xEzs%Qb;1FHJ_ zY~dF7Po;4uR$4&SIwywi!0GHO-)gI1m7)LW%(ua?v~v{&h%VZ zJJ8A5TEfV}1SV8`R&+r&viJtUCt+ph08>jF^s=P2wRKe~Jq;aejMp?)pcfe}(EHoo z%1Rx!{GdQDj35b#WQB|lL2y(E!eSzkkdlln1$sqTymSFJZd#3P+ty>v%H@a*4#)J& z8Cbh$8)gw=OJ;7snUf#k>_=y@ap_Kk`((n$E(JkTX2XB#0tCdYhELpDcqgobZ{h|_ zOD;p7v?@gm-!d{L<6l9Zh(}M#0w42aW;$$eXhOpPapjTi5Pl$NJ5fJ1q|dnaRk>h($tD zFcPvNX@?^bm=+DM#8?F8{fmR)50?_jynxa*WEDo zH~`0~C!y!D1%pfqG1|TW*5Qj0S-251*6zd1m76eq!BS)u%tU&2E>hTN#K*)SiolBu z4MkKqVHY071}2IC4}r6bCoC)}*DY+6JuVz<9jN*oVC(1tBO5!Y85(1(xfQgWJ)ll? zH^I>p;~YFOn(A((jXTwyAI7-{U<^Y)EEYzD>JSGj46-xC5XVV)%XkQ0GkOm z1as8NKn9^xPZiccE89v{*71zMd)BIiqVireqw6PCbker=s62t*qu&UwAR)QZQC+X+3{;766V zBG7ATsZ~^7YYP=;TO*&7X=kjw9c_GHivqfTMx#v2`XA7v#Z~(4{tkhigxFe9cr{dW zwe`&eUJC&ub6cAg=t-1T)Y2vsdaa#!`qOX942(xS-lt_1W%isvYkB@F%G&PZPGbj3 z39>3etxD$H5@<4QO=fVkzxYkb?-Mmx$q?w>YwlK3V2YbNmE62`s>nyb{mJY3D;m1G zQQjac!2Dn6spcK4pjSiS$xdcsZzZ>}K?Z-Mv7l0xDQ6GlnljqEcZ34tf zKCY+wqh%)aI$FvV=t+p(73ej}0tbX$v(jxrXv^GO*=s`GY3oPTNuNH%FohXoRB@CN0DG-151j9URF*aR0<)muw$jrG{@-i z`pC+gg@~wlr6aIswOtkTY@wy4t?FlzRg)t_6`LCo9Yv*=O6cXGXx1!3Z!y-cUyU8x zH)GNKc?j_F#loTmSi5ioW~9x-fi1`J(aF!Sam_AF^-hD2Ll*r1KgRw7I*u!i+J((_ z(1~Mau`Fgr%gnYc$&xM0Y?+yvnVB((!^{rzB$-JjU=%a6OiA|L`?TBRneV&*{g>9e zs=BJW)m>HRsl5-pr}t_M>a!8UNAAV&q=Oiq%ox1~aY@@THf1xWOkRW3skFY6$7AgH zc&y#B8k=`*LDINnjGZ(I17i~~cE&tRSh5CFHf%@2$_@_0DMjzI9_WJVm?_HscZzm{kg{um->Md7NTS4dD4 zU&qbT>&PVJatOV=ifn=`3%OKl51>aymtRZhF{JJUz0%uRDB|k^+F1e5q?O{aB~?l! 
znt>vIo-D5F_R6XH%BUU-3E1MIG?bTRp@LD&LRehH&oUM27U;>=O66Jdvt7Rry>u#* zbSjffqxAmYpm$xJcCY?H=vjdGtLo!4{`P(t*j-T?c`3(q!X%Byr{VXj=>(pFDv$9S z{hn89KqjDFrQ*w|yoK!QdIGPG zz$-I0SaAHKpd42;u9o&zV{~PWf6FaGA%Umq4T@@S;buV*u4m_%hKxUM@H-peJ=8VD z3cV|86i^55-wpIMP4J|yAsT5@KxL4EUmYiRZVJd?vi>b|bvtn3 z*b6wg_j$y}FGWn>#f(jqo}1CH-x2ihxd)>M9mLGI16Yu-1B;W^WAWH!STtom=Fgfz zRYWzE8jA^wCSdEC-I%s^0g@LHn$sp>#+Hp3y=(~r6XFm>7|z+b4WC{69Y3buz~VKl zFnH)-#10vZDO9ElrccD&$rF$?G=a+n*t6vTUVQdt?B2Nx^A;|^WWp$MYAQz0pO5L= zcVp77qZqsUS;TIA11X2TMdGf{5WC?c3|dG_JMLL{4nBl{5hoEf`gwTt--j-N%i&0M z7e0I;CNJKGEeDTd>;3~+xp6C2Z`g{(OO|2&+_{)ObqXe?jHRuo4JYhI3wmm}NJym3 zABFw{1{>G~1cn&ptu92tA(ZODQ3#-Qb?^>EM^9gLmB}AiTFB{$vkHUXS5U!M6MMiPD z0bW`e0Z2QWRgr<*${gBQDg=Tmmk`UXp{*t8@&vuQbQIPTXoOzboop2A^+UibR=O#5 zaAsl1rD9O1pI#{~ZaG0(sxA?PT~%c+3m73-Als{M6Pbjb>gv=`am9o(z)0a)IwMQr zu>j}g7;CF3A2T!6IoT3|-c5Dspe_94s+1m8-Tg)iYY28%uU;oC)Q|5b;c?R_IYIB* zO=VrlGmRL(-^j-AYS6fzVXUu#AC+KcVJV7fZ%c0diPAejUd>&elY4~QU8I-Sn}!P2 z*WE0uL;kJ5QFP}o+S=RPUP2(|(9-5p>1kv}h3T|xqPUXhzM62Xr-HtXd@8=2@+xGP zmYL+BSJN~PF+pvG-hHK~FrAR8@&rBAef|e}S4~m@WurD@s&lWKzH&Fy|BZ|+s1?4<3P6qJDZ7r6*f!g`QRMmGM|9J!^x8mQsBRfl%P9DCNVg zI1MFKl@{m`x(sz*R#qJ)Cr~;nwc=X4uxPqZ>$u#X3A|r^`IklQ5qfh7y`F=6BW74% ziklb&^@)IYL;%6(1OMJ;#EDO=N#0i)>aXRMAnosC0$GVLhsPuN=@X~`!Jvjux5t;fcRi?Dd|49uN1 z4b$dN#rOpiku-lC=I&UAH79mp_Kwv^Ubz6V^9etyw6O$V?DT1fr-C|q;i4HcdFbE~ zOddB0@q>qB*0>2+NZ?JU#haE$=)|XD!^)jFcH{!K?>&OKt2SZMl1-So`3UCizlfC0 zCy;vhbxb~d8OeLUMbh>k5WD;n44?ZUqT-)N)Ub=_mv9L&v1j1fdmB3Y&xKvsEDWEr z9qaa7z}_Qgu!|3Y?K`(%{iY2B-crn)I|p-T&fs~QU{QFZMsb;BZ0#tjyyVfe)BOh$ zc$TJj!A`T6se)GZcwv!!VDCm{?hydnu%74~8AH1ogLbsGt;2euGgV&a$iY;e32+&b zOjS1?E~BTyK6wfpQc~fho7o zlZ(n~g05VOFAJ3wv|fd@&$${2z`{f5DRMPFG*fkWYEjgFsyMaS@}ZWWpUcQIDla=b z`~N5Oe!oJ6L^bz^pl0H_v9p5Rb>-6|%*=Kwxf^NO$j&Q49s!g|C|#rNR71u!LhlA) zXW*Bf!{1qlD%x7r<)y3T^?uiro5eLeXLo3IZxduSxKU1U@t86xH-aytvWDu85GyXn zwUP>46X?oojWsT#MJ`vXZAA?-xUabG4$5x-iTp|eF*CztP|-A<0^ik}7VurWulOt( zXJ~0@YfUF^HTp;)XIn-;6jC{Bss4Crs!vlw0IDw8+Pr*1P1lBiD!Sj<*sAc#=m4tt 
z@RHKAw%(Q&ss2u_w!fKWW!Viv?fQMdT`>TZ@)Pv5tf9KUU6+S?UUPvjkFd>5{~6h4 ze6Kn}tE0JKt$vulnC?lM`tv`Ni__}k_o4R#X48#D4xUWgvvm*5MB zSGXU%LVQhKUr2Z;85V*Dk$pxXBsvy8LH!9mUvziyHqeVt8f&1Z*1>QRQVkg)At7*a zab>|AgK?t-y)j6ROT_pwV=*IjD(23afyMmi>o%>zp8Y#;@X&tj+`bFFBKjjGb}H5_ z*o8^43os&TCWc0=MDKv*=pVWaBYG~!sF+2V(r+=A52qrGTaFzGi*aE5a%`Wp5bIJG zVs0uSv1kRREM83uyBtY#QjxMS74x>N#HN!65I2)HbP126%3Qj2H%jC%(m^^FL$ep&yXA>Qf{w`T}v2KE!|#uVC=VSCKIGUGy4q0nYyG(1X7x zI&M7{Za#-?ho8gY6VGG+{*&0bbsx5F+=i8_mt)?7S(rmOPZ~cCDPwsaQ&LPzZEWmF zBv9=s3-t)DE6saAP@u_&rw31PScpmc9zcoi<{OIc9zn2`+Uq?K9eVabtH@qx6Vo3Z zdMoeVNObKT3&(+@;T|^`Zb_-|m^26O(-**L(kysQnFrq~bI~JZ0$j$Af$ivlXcB3Q zNBeh%|FlS4;f<}XD(Qq@M&&gYLMlDptXYI!c2zcQTN=0X7)E9l6$8~?E|py#qk!;x zV08&ZrH8Mm%SO$uJXF?XqfoP?sdF=*_KIS`j<(qZm0n3nrWuKmn?ozcD9})}0@+tu zSyjO23V7+p-lnm@t244}ty*oR^d5$tb=^{*chx}e$~9U;13D=>LCz9VcM5G6^nUw; z%8yXX$}J|KtB^zMnMT`rGqZr7orh~wfxl7dT_yCgxWD8!6&ypEP?QS(kK8icD71jj z1VNZleH(eV|3WFDXTWz4$gQUutRy7Ms&J#U+H_UPskwtH+GMrx7Ss}&rIiMFHG1Ez zJIEm5Z)Rs0#iyCgt`Yv4g#8*VuNh;Te$6!Qs3WrK^Q83D5F|S*wO6QzNIoAT%2jBB zpt%pdB8`wKpvudqLLhrmeP9T>ml2%$n)il=HI!sM{_B6kSIxA~_6m?fp{p7E|5qkgD z>}o&bm!JQMf0=G8KVZ(X`RGO9#SnHqNAxw&3mXuPsKLDu6delh03U>e1tWqyYY^2p z(R57q4(vm8EON)n;1q+($^@4+g4fIAQjz;3hIHag$G%4 z#!bY`B`dM>@F`50vlIy_GqG;V5iD7;3#rpqValYXm^xuGCXAVlnG+Xc@r)IiJ#ikD z-6pJDvH|mEuEX5fo3U!`5p3N19M&Fw1y)UAZjGijr8e={R5$_Xw(&nP{H6>|R@g5j;&Ea0zVLM;ci zyf-T86a>Di@|#@Vd;q-~wcM9dF)FzP)&8xz0y92WjaX9nQhI61lAWPeWJ^Cy{T2cJ zni_93xAAYk;NQRfOegmv<`H^*hW9tniyqzwz2gR=*N6e=J7OTk$smOHv?NL~oqS-dK>uMIv2x=joVoNdPCfS)PMmuMhmT#vuAN7* zdBaYuS-BQV7cDXhZ`!me7&k7(Bn?QQ#U4RB8%Om#h5%REsOVn3;TIS{>mNdut;u`> z`F{w#$T0X4fDXQXu+f;pkY4B-)gSF5`{HTZ-F5_C=b@v}VNg6e4N8Q~@D$jM7!RjW zQxPze3UuXGM6BM5prz{(vS>Ym=dXs}oW%%VwhaCYrondNU^MF89f7k3Q0b{tGHqf7 zm0aa@0*{ap^s1@y2)(Ro!H+=UwhTfovxfT!zRVh_J}NN+uh4W`$zUO++T-z+v^3Rx z!dKPiQSDK=@yV_k)ii%@j?zcb(n{57K92l+buzwTqM(4Epb^pycaxC2e$9aI+6_$%l+Li!%l${?33wK(tFSaw zq(PaKa|%mQR8fmUsxVpIjPzXGxSomMsRT6|bnm>S3=90TYJ|;Mn#<{1|bGBM?v6#f=_;#FThUoIU|F=FY&vrHip) 
zqvpFkh*L+-AvI+d0-S>p?iz(?w-}5F9f|S%k}+fGXv`cu5~+O#W9Gn-SeZNntEVi( zyoswYZQ@!?o4y58XR3vBGZrjbM_{Zb2_*qCDI|nIovoJnk0g{K$!T8wum>9PR z%VuoD>iN5{YT-6o?&(;xU=`*sSc+vUs3=osVdBK;NJ^Z5S+uuvR&2wv?I*C|$cxx@ z@dF%u{Y&h6<{Rq7C>k+hg zJtCKHLCosy2w%P)q03exY~^C~-m(bp^M)Z}Q5>O{i>%T-WKr>DRk6TSQR(qU&Y(Tb zsL3(#%OuD$st7)9C-^c5zKr|e%OUg%>a%FgG6_8bkoKmO@2jd4^a@Z?o`d`%Sw~tI z*;s-ukD!yXQ-g&Roz>2ob%t6V6(vp~;; zk`54himMM4o?vH*KN!Eu{&KtKj!nzXqYba1MZZJ1+@eaYin)3R^m#YGA~zB9p6P6 zm0&qT!_U$PxHQ5xlfcU;ChQ2kwEPlW&B`PEO3Vn2tYUt4VIlI%N)7n(ON(%m4}k*0 zFRze_T)FdjzLYIl%bNs9ZUI#}&!_3WVw99dUZ`XD?_B?bHkRtz3_1JVwA`wjit6VC zDuGX}v(^R-0Z$4~A;3x9N$m-AR^Z9<3WC~a0imUSPoJko4Rv6ypw*Sd)u zsXR08I-LO2Xp)<<>3jLgJV) z=-H<){DVUf-McRmlgDGm?1fmmbRE`GO>En|7kjoJM8dF92=xg-l)pcQ#zZ1%a33TO z>5qvcMi5q`kTh~E#wO3g9+)T_N2or}UU_}2Rm_2C<_HW*d?W>k!_r}b z1bg=G!|olsuyy?gtX{GNi{{bR&YX_%Q^q49c@$xnfRS-=h*##6aP#uu%LZS$~f#}wI2s#sb9eNHzhu%ZcX~0Ny(_u2vNgdQ=S+8OSE^vNe3nK#!m^&|`s8-5%|3h7@0og8QlLG8tKn+`25}vjCV5 z%!Fexk1tc@b#)#}D{@gti>2($CQZ43N82hDXK9&LrewlRfMaCbPd1!vZ|npEEn!j8Zb*d7L+QG1L(c&z5lxW@P1 z%%Z(5E~Q$nML|V1GV_X!%DZwU9lxv7GOeu|BW_ZuWz^ln&C1)jR#;7iSBBrxi*O~A zcDS(0bVycRT{h!-aTPMkczkIst`d40gR3EF#r1!h+ks{|qGDv1mLh}DOV8)^;xaQo&&+;SR$Y$#;yj)&o;ywDnV&`AU8icjiUQ@) zqsmjmhFZcc)d+gH5Oh>pCBPM^CM=)$iWyGHy8uB>>oP_17e3@cw!vJO__w1Yu1?TP+?>mm0m2B-e5Snhoh5?7o0sq zjJ+M6FouOK(&T7UV}{k@3VIp~I%Mc@xVd|wW2dfgbaF*RL@%VIOu=$OZylkxd&fb{ znm!L9fmAAfzK9C*!>~b-NQjNb=u!PJE;*hyaT3N(oJ0A!3}Ys2;OlLeGj10x?p|yj zyA>PbS7O76Mc6oEA=Zyrf)xW7V&%YPSUGGtmXBG2*`uao+>k^}CNO4=PsX&dahNkP zfzX?ey&G0y-QtB?rmuVdli53us|M_6|JO>Dh%8D~BwB1W{3ks-;}j2^Pht&XsO~!Laf2hkZ~KT>A_~m!1RAF=h}|o*FR* zqf6gdbmJ127z6?Pt=OXs#D$F>y6;~^ZSfGkn@YE-dR#SC$M#*I%U22ceGvHJ3b-G|k z$Ys*b@S7%Su!MBpqF>QyNb0=Pp!8PLobbL?b>y!w(C-OH;mFVv!HRC04LB1 zhz5#ghBO{WFkVYbhvv%o{brh(ha#h(2x)l*xSEzl;Bnh^o{QXKV?(dgj^5B*7=&JC z?Vm_1y=BJP{(7ST*RlwvqH0=H<^8x1_QR*#-RV zJUv_I;h#2L}hln-9NiJ1)LH-Q_Pbs}L4qBa&uhDY;fxK*WPm!{- zbUl$;vo>fbtSV1{vnsoX)txm2M1fC=v8vo+a}5B?(hT&JT1?3etkBcn;42A@xvJ-=vd=1cEWgU*}elaX3jDFTpL979%-Ow 
z?-GIz-8|sr5p0y+uy`rG|3J^G^jutAOl~({PY*cQbc3_KEyD)(-MgT7&nPTexB%O? z5PCazW59qx@bL7eS_wvUOceSIh{3Rt1Cca-6vj@QfU#4jV*JFpm@;+=W+tw|@}%w9 zG-?mF58s15gZ5!huRYimu^R_^AH=@C2e5m<9&G8i85;+0!m@#DF>k;U%o#ijiwM6N zaig$o>O?H&=T1&e!n|1vXlu74b>23lE;)qMHD@tz&+C}H?FGy@@D`>Ycn52r`x-~y z`zN0J_&2=u$qiioBmVs@&0#Dsg z6uG&C9z*p~QYtF2$_!#cH~xL-WoHqXgr3#j3VJy?mij!az5NgLEa0x4yPmUrWP=z*nhp*yZKS)*eQmLet4Nn}%_TGcY}V2^Pm~po-g!P5pLZPhUbK z`T&lE9>S^6<2V&`3eUwnkJHiTa6IM&c17;Qw&<oI?4%m*3!#7|~@9Ee$X$2P2 zCNE~8o|U>3Q>LxQ*wk&9wfb4i+W0C`H@t$xW#=$y%gdO)=Pj&0^DXwg{BN9k>leIs z`6|BrIuBo7&cNl@|G?Mp-o#IzXW^|EzQ=2qe!$_q?_u`zgV?p>O?>`Y7CyQBD=xqP z89sRXeZ2P4n|R^E%Xs$mvp99|2o7x9ik%zRV&mGCShR#HbM9W~k+M8QciL9@7@W zb>?D3uG@r|ts4=tZZ0}Zh(Pr6QMg76np;+gJgTxh!Yh{$%O=1CJt@1a8Ulrpsi-CV z2)zvLBlt2EJYEeQ*?KRbmnZ1a-U@sLf*w^~excx}t`ix^$x}U(I?EDvgkLsaXE8EW z8BefDu_$h4C`}Y2P4@PNX~f85;TH6)Dla#e-zn2#aRt9LDmDYQ`=LpJH1(qi+T0sl z-cVL$f=+Rr>&5~z{vZ_p;C{_RaV1?-K~nh zc{N-wu0?KXJt}U&Kuq@4nswz`fdzWm+>yiM(~D(?HM5#PR*M{^mMSbW(7TyWi%g5H zxS3BYD}_g!omWM>P0O8Ip>erXezdrm1qB9r*E7SEI}uDa9`s_%;O9MdseN>7<@WH$vqSx{>yV=XP+P^9844HH_g=lxN~ z>n%km=+)P%fr4tUTCfxJZlJ1yDzDUHht-$QQnzQZwn{dtZYvs|-{;$eo z@PVg%f$E~8l=v&wEH@*p6hlXH89NL~$)gcVfi`aZ1Z>^51AF!!Ky=SOXb=`N82%xB z%t(wj9qeK2=!J!<(~C$Pw2<)^>n ztIvPKWfEJ7auQcJK6%k(PJ@SAkV*l00II65UIR{5n()MmX4IZJnt@nL9h!5MR%UZ?vVq~ zC3+}2MG<&>SUC?$LbpDIUcY$Q4jl`-5fk8+G8cg}S0Q-rdW0?{%vSG0ueEzHV9P;7 zuG$9I8H?dPe>Hp-u1CPq&4^gN4H0YCp~u|ucqSzjkB#nzs5O0Xt(H&Raw;(mIisq} zsijpU;Iaw4Y(h-S9D*+Ueq`~r8aA>7JKp>PqzS3LDobQn5@xC+ETfVuWSPMcrATcijYGcRvKf8w8x{^fVo$soSGkyvn$8Q`^=w7~VIK_i{8E3B+EExB0*Qh7WV1-ZDcEYh^O1$=-N zRTdGzl_;(%HWs(Ku8hI|q1N)E4BY**3U#-XaYfyP1fLsJewrVH7F2l@X>-+$MH!LH zilx-jaqo7aX_Nl@UNJ$JK?Qg7zrbfvcT{?nw8Z6Vuuxw=OG~egt>*nuU6J_!cuFE` z>6mV5#4`1|nKx)}dA(IX$p@b1Gqxm&<%3DtX0HB@H5=9-cGO4$Z#aQB9B~Pnu#Dn@ zkvw_~c5dH^GiT01iI5wFM)!wLP&7Jpal+GWy1~}LgV0-z!NW$H!MTE7i0l@j7Zel% zcQ-GLj2!`AA7^y#)Dm`foeL-**@W zdX2!Ska3t2HVgCpmtuwYTCDfphMm3#am4>5&IAx>e#h~g&q2K4yBDtn?8U3W`|xte 
ze!LiV1TRLO!lfueG5QqF_CA3_y^df{|6|xT{0x>2KZ(hM4r27MgGfv|iP4ju#iaRf zAbIx77>3OkDg1rmuPr^EZ8p`K#Z>`nB)ixzqo`hi_%z4(4L-8a9- zo3DL=mo9yPw_g7YAAj&2zWC$^eDmc$@$Fat#AhFWgZJL~2(P{R23~l9z&m%&^hZ2! z-~hI7-;Q%@go8!*BLl1k789h~?WbX!}tN+I0-!w6u{Mc45HY;~2Q-1cvN6iGJI5 z!DsnYv`qF!v&5&+duudq-2N5$e0t|sKfzn82c<$SLLVj2znDtTKKERbZz22eJ^R6$pH4J+(m3WK3aX z(&{Nso?2<8>hkjPOju#}{{=lO_$;vd@3=w0T^HmSR@mL3tyNs-|J7^7&CG0ECHSuA z6q$Ottm;3JUU?5`q+Axkr&d_aj8;;A2bK5!M%mrJky~>cH>vh+lxliM%ao3)+smuD zhvM7ngnSQK<#h&rSyX$b^|pv=k8vZv*tF!z)@BsbQq#^#-Q|{-Bb(66EiFZ9O*NrM zRjBDN^K$sWAe5Z&2hl5{}OE8;n?zf+FtTEbCtla!>P zqB0xhROThLoyvJA@G1SfmI75R_ur{2An;1@=ba)dy=+rYX9;UPo<-qV!Bhh>AHEWvc^9p(G)d^kwF-;w>enY+%fgqc}F;B6jpWhxPr>V)>Bsm^t(;#tb=u zg!nTUJ@Ey^j=P8<$!9Py=>*11e*rTVy^U$}UZ-_`2dme9ghPA2#&f5B!YA)!;j7Ep zc>j%G@ZK9g;QhD1$Ez=0#z!A~i*LUA1^@c#D*pN7ANcND0`JqW@&0?4@y6?KVz{YKjOrf~7|FdxTak0i}Ao`(bIl zF9W>mw6iw|GSf<1sXlfV8>)46J~l}O3M+AwYVQV@8B~6n zBSV>4it6s6fbXlg2a{kRud0r=mylDJ6-E|SUv6a$uSpGIS7BOiWoa`C3aON--YP3l zR#S!S!hEW{JOe#-TF$4+E2}LxDzBub)I@1jk(udEQ9zYlR+)!NKHQXPC9gm#>k29> zb5KjDRS|C0RXMb(`LwFJgjOc1_<*XZ$VMF>iY9oksKc?s0x~sP{0Ds1mDf$k-d2_= zZty)>xK*3W{h7u_8*5u=saGs5$tL&+x?DbZ^DaI@)w04E;= zJNUt;y*na1hoE1l7{qp=zwSa?+jRowI?lyn=fzm#z6NVOmSVlvbZqoW#x}2b?D2`m zLBAv%@=L)U?@2fqG#`h|SS)5`7G@ z(T6d(_Yn*t_y&zUjUfqVkvRSmQYODh_*}x23C|*R+y%_#`tsSY;NaFzaOTjLICty| zJb&g(+SwoR?i=6XgLl5gcVGX8U;cFiKmYS8e)z{P`0C5=@$tu>;r;hN#G7xvg_mD` z85b{J#Oc$gap=$??Ao;pn>TO9nl)>%NF9*p%)yk@sTeyh7p*q)upD)j~orfnJ1B2ld9t}G_T)Oh%(lurfZ2HH-c5nimMy9}R z%v3lfO@>|KB-kZSgX@$92w%Pxeb(yO~A>$any{`*D+?OOujR5O`U% zk_9DYw1XA2<0a;GMsB`o%~k!H0TDr^=^d|?)Z+JoN|P5)x%6&Op=D~8J%-lRfT0=F zlzbq)q#8F1rQ|Bih>Ox&eq)Mhm z&LiN8XlV-z3A+N+-o1_6fAh173#pJ*jll0;Rzww8W>j8waSl}qP?u@n_x}%4yHx@2lj-kUwATZdXJT%dU=E(>O3L^CUFnHiF_Zgxy7WNrPTm;UJ_t!&`(d)p2&8t2!vfo6EO8uzrEalU?lk~w z{CZ+TNI16i45M=EgY7*>VN1v)Y!6s~J)!HcCvr2kM{mc5n7vpXa{x;s_hEX_Zj22& zgoNN@7#>W}g|IM39>w5(r!jok1;oWYi;=XvaYK${eB2q#9s4{MCO?bWvB$AA=`8jx zeis`TUc#aohp}x!nJC9SRPU6UsBiOxrw*lVTwQI3-=~4r{Y15`*JWueLvB^l%JQq}W0-e@% 
zG5YrF$8#BI8Y?upk6;%NLg;Zlkm}FT+ZQ%oesBm1ha-WfbWvTS`k{O8!LS`bl{Y93 zc5!3TBV`&K#-_r4+;q66E7|y-fkrfH>1ynJ?Ot> zKS8$z{;OBQpOB8+xDUOzA42TuBXC}xgvTd4AY^?{T)%sRg)yJ1q7=DREG#uF@O;we z)~Vz14P@05Qn&KBv_xiIu8C}}D{^laAh(`&xQ^L*dc&43I&efl*(JyP%z}t|X)jP+UP}#|Kw=T?Hzs z^zzE`QCLyHeFREz7J*l6>h-E?i)m$f9`mX4a<3CO8K|i#q}r3Z(lE3P+`d(Udw0sH z_zG!fD{#B61Qmp1evZYKYRXPc-KnUnq57*K7^|~sZK?kFK&z^x?PXNb)=ClHyIq7o z@0Rj73rv-0qPi-d$M8E+fvRP>u#lf$kcC`=FPjiIz>~El@X|BX=yV-QxUh2L8Vnm1 zi=lDDF(QEigJmv(z!UhCt8&$4b0fp3^!(AIhYNhXe5vki;M~m)9`?5Ibm#(a$My(u?}9Lw&Ist*8j*xw&yKd} z-NpfNUHp*JJronVgkYv!6z25^#{w4Ybv=WzZ+LGUjO&YCBYI)mkb&6LKM9-qOvk3E zC0G}}2FrriV2=Mfr222cq=2265U?9b{(CTa;PV*O>pVt8pT>xoGZ@gcR%C%?|#BJUw?-$zW53sUA~NW-g^hHyhcm=+*zDDbDY3CjD7p|Ve8hd z26)zbyy?@YW6~sxt(B@1@Dx&fia|pLBOs8LmgiO}-!)0M>P?mMJ%GS-3kpSNH!nEy zx^xSQK$nnc*v1Ti7K#w~xZLyBUNYZ}_T0>jwip?OrMM>RoL@vWM;lur7?z@t zHo4-?J(LrEIs7hZxwP3ixhSC3&7;*-$K=BDQdHE{(7slnq@oPfw`y_g&)WpyJrfo6 zw78W;sJT^+JAc*?kKFfHEvk8aH6f5XifeeZ{IDpXa`;u1!+grJ~QM$m!# zZc+W05{^Yw;#K9jxO=-4x9bT}+G1mYE3)oGP)oh0?Jd{H3nd~@pT3+2z|)+#%AcsD zvC5RK@znxeVG*x$p;FoNTI6Jy`aG#U0Z*;?*VC0n`VTB$PpHKYr^_3P;e=kC>dsm2 zl17il?Afz1bLLFM^y+0?6`>ai2WJm->SB*)+H``eyAPHuU16Xng=ZcA8LnYazCN(E zwS`UBu5h;L1}~@X2={Y9OemFIKu3glw?eqrGl=qRiwLJSh_Y{ws4i{Mzk4Ud5{#o= zx?*~OBj$xWVo@(UY>o@SzHx)Fi(hg>p9HK9V?>Oiq8o=fktvuQLW}6P5XnAkG2VAO zCiow~#NbmHA8`(&B2Qpg_#q5u#77@Re8fS-h3vuD$OD)?@D%0^IEJ}>4`A`2gIE=N z1e?a5#?Gl1uxIA8IKJi$yngN*Tsm_Zm*4&o*M7~%%`3V1`KLedO5|7y4}^?0*pVe;h3NJ&XCz>}?&veQy3Z`g?8 zh>VVcUw|Kh7h*!THY6g#C_MMzFxdJ8qMKg`?81AYOK4AY3hRaLy#~Q!AFmwH*Nahb4M}g;&x1bV?VCnqb(w&3TRjJ2r7%c zEuqq*g{-4OqjeSZtbnsZEw^3@uK;;k)^VSN+TuUS)r$}T3u@+hU~4t_J4z&H3UYblDC9Zo<-?pWaT`l z+tYexcCJ|pdWE#QRfJFN-8)`6rJ7;dzJq zDoV3W-JFzP6|HeCAD*fU6aY2UO!aq~&|NLIme;h|rbDzEFt}bsg;$`+z7M?|V{dcw zc;Dq(Lcmk^C1scS?ZzKix^@Ky5_kh+hfwj2H2w1gKDn(0ESoBS;gJ!>RW`g`?o=#fX)~gV1vFM9nsgd4F-F(!sy_3m@~)@OJkj}G07jB62q}( zL~ktV(;rL1 zsIV}2g@(hyFNnYkhJ8eDbPA3}x99=1v_sK>%Qg{x(UJDnhDxvd&{41-H3{}(rlH5g zIdGe{2o96yp~v(k2&Rnj%;nZsnaVc$YB@JH}J{sX3)xJ 
z<A9+N%QX!m*#-O^gjg=Yr^)99!Ax$uA+<;F<ba@Oxub`re=cUNJ zued^)UkFHoKChUH+;mFLK@QK4lwLt;kr|R!d#4V!srddR{wiygDTZ((V2Y`#G#W-3 zSOh(F7%wX?ptUWf+N)$#pqAIYii*FYA`f-6v@+P$REvuG+A>;HD#kwvJRVz7#_OQE z!h&?82y3bf2+3k&jcaT2P2He|xe17Zoi&OGNsR_6D&+Rk926HbigF1(Dp>{yP@0S#E}%DFnS(EA;5N7B5|b!9#~2NTZy>Xm3@Y*Qu)` zo_eMeo@&(*UOqvD-bxG_Jj7Bs??W##Dhz)9-ssk?3!EJ7;AqnYZjN2x;@Acuf!)wM z+75kVYzV;i7|_!meL`%}Kg6D@uOoVScfiQV?wA?xiPcksu_VzCb9)70end3pg~nh} zRDa9}>xm@;;;^KD3g$;m!h)b#SQ)Vf3j)?+PQVu0+g(WZ*o+kKjTqy#3bT4|!<5K1 zm=?7T3kPh+%AvclZp2<}9!Vf2?#J4pn{jaJahzOy5qlRL$F@aBaAMEPc=z>x;JdG{ z4 zBAk+@!gc&ycuZLY_teGkoV5Zz^VT3>(FTOC+K!&<_9Ef%Ma1qugQ>5y|Mac^~?0 zKZwBxP9y&GtLVS~EPC%fh~a0?W8@2OqW}5l(d*0+to!~WT>q;Ag}3uibn7O;M-@<8 zNa&RzpK8wnG%7)1}z$(K~qbNpEC|iHZ4Xz zZ~lB*r7JntuwnNmj7*6~gXTP@NdxpA+M74~;G;Y)o zCwU#Lz*AV2-E+@9hXxHApi!enMsaIWZ>c<8mxc`+qG{8nX1_I>H*b#N!-tz{3PDkz zRHIbmCOWPm_U=1i)SaNG<=*`VxxEn@G;E9$=bpu(Q|I`-A47vi&A5%xsJSH?-j62S z*YFYSI&cElGpX9B@}7VBt^50QOk+g!>_w}2mw>9mho63qhRq(~XEZ^B#{3RiDw^oC zxomFPH)+N!1Kx>KQ~4Q<(5P8+eEj(r$mO{xE-%Biw42ztm7;+5u0dn|)*y_+c2N&dCX}X84(=u~4FcX)~sq@~4qy{y~9g;LyVt?K?RTdR@@6RVP9(m}Fdq zK|_ZbDX2Vd0$->KXrPpy9b8?U(X~?tbm`a{ojN=YC;O-2>qSc&(go3B-O)S39??{J zy#gIDD9jCs{Q{8G*Art0xMSK#FHGp`im6c{gqR=3diY~e??G5JU@+kohuKtlb3&(K zLEvI64P1+9E{ie2X)#h}YGb2`FVgJ>b-+vkQOgW4rvrb~gu+`WwY7;K4 zx`a2kzJW7KPT}I(3%IoNMZA3U4ZQQ>r}*+SWm-+cFTZEt-~YOTe+zd1_z6FJ_Z`0d z`U`yV`N#P9^85JU{de%zTW{i(S6;!h&t1alGv{#h*h%c;O}~A|PHfz?8LKsy#-c^0 zzCgemrw+(t#u(@+A*hs|)#COW(BE|MR>P3BrB=xV!lP(w`MQUHfJr1^uX?<&7_ZHBXl;)p{*||p z{NaZf_3ran`@_40UO5VBU5js}8R+E^a(Q*7gkC8vZ!xVe6&+z$bgK|Ws_U!FrJYmd z{7v3;LoJ1kH+jQHElVr-Cd?j(tgZkY0Z6Wj@{2Vwa6c z6DG1yH8jz;ci;a4UU_-BQFtRqjx;b6^rX18Pm!FQY@nxojT<*M>(-@p+qP}n4EPQn z6xjIx4fOUS(+WMRJY9b)^bQ<5jr~VYb6vm_pa`)Bf+d678XDlW;OiFHd+-!)WYC7@ zlwjtZ#rKbCYzab4EyB0|1U~%aGYbq25bp!889zhIrrdUaIc@Gd9{(4n&X~z<`kY4i z_J<#+;((uj`xQOh-O#u>zgJ^@SN;|Or-lBfW&QmR-$ivz87ixaFnNjvdWz;P8e_%E zrKqc~Mon!c&ucMe&X|fOx{loc)mI;x`n}4^0)i?RfBgP48uDBzn(&%wIe*>^0*scH 
z_sQM6)dr9=W=!BYwtU8S-+fB;XQ>;kEXy$=MQ8`ocO(4l9GBPV#)GlrW~pVij;TS4`;}f?56g zVoJ~eO!JS!OuzAr=@{)i8F3D&h_#)L@m@1i3G1e=#g;jn@yg+s@bcjo@#g8*@%ihY;g_#}!IfX={r`OpKmF@B{O}{; z_m3Y9?7sW@OMLz1r&MZ^t-UTF|ehDKFUqH;(1L#SWH-H6jz~NKyU%Lr`8@3{3(;kFu-;bWBP9gEb zmoVz`Q7ruO6t4YEuu;7g*JqnB_Oz~uz@zQ0q1vm?N9k=IL)BJXOPf_wgc5GQ_|gRn zw4~e^V<#tL>g>swFk>8?yzH_4&=zF#CSSF2^#izh`g!Bvu|qg?>>&CM>|-f}+}^8K z4AoZ_@^kYrbFlTK`$ed1x@vNgr1b21$s@e@5pI<^W$&Wf8->NpE{3& zhmKMq3Vcn_kO17Z_b|5ZJb-#ry( zE5)X5Tlt#jfalP~!yWq$9>Afahgsl*&9)}Z8l(S!KE&f){Q1`{D!uV&*8Dyk`I{ek zq&a^7{a4!BI;y%FOrODXr|ZQ1pMLr-!DN}MMB}5+pF3qhCk3Zv^JYAkyl$n0RTb6Z zpZDr$lS^1e#-arkzo7TaFW=!FL0GP?GG*%DSA-f`=Q6&os3@QXCX8vDWwFaEicnfs zXrN~vevE9w?|-59>y1CKaP?C39We;~M+`R5lf4~1HW{netwpasy-m3bU>OYLvVk(I zJk!A$&$M?!+m6og3yL8f*PGD->YS`PCpBgm=mq)_c(!QYz8z1_qj-wY>(sFgI(BFY zo32lxbGsIB?A8(Cw}$aOg5@m_TbeaT#_} z*=%371$)=+#&gG?!%OF0!o?F8@$!Y2@byPu;`>j(!N0!!5x@QT3x59LpZNZpZ}A=D z$AA1l*nN$!zxW(qe0G`Y?mc`+S^v)4ui~{=U%*Q*K1ZwcEY6-ihhs+xynTnUW9MEf zylq&yYAx-*T51=Xt|=;9)_OcMIX5FAA;I*~8y-8tC_ANmx7gYc_yzbAdf}$E)=jCR zyu4v6l@}TXTlK}Gt?fiB+p%vfIuA-j$04I(lQ0PmW2VD#>`Zv1E`rzerDn86=n}$i z^ZK< zLINxFN_m4%o1KbA%~|Mp^B*|69~HF3s+X&vEy~w8-imCL))m9e+uanThD{pdw<~|3 zu%yUTQ0EusAu>AB6hMLMyKlcmZgvjFC8rqJwP^APmyNM|*B)bWrSPQYq|~giQs}x^ z>-4m4-L86C9dE76i;Igh+nY3Lf)A;5)ycd?ixvikf}@}(YpXiGhoQIs=vidsSD?6} z4n-xEw1hR5ef(~MTA^BTGxAYXUT0KZdUg>N7L^?LHEP@pGiJ>*pVPScqb8qXDc_%4 zREqMNdcLO;=~S2Fr_jC%jEuM5|IoA=tKLube|bg4n4CJz0IvbR>o@=Sp7yo?&Tg(2 z=rMF&t7>bISCEg2stWWU*q{I3avpyF;}`t(_Z`fbIn{C<7>%V2xlcV4?%lgXF#bj8 zQRz{k3w&RE`JtJaPO1CL38Se~CK%8Ocx%@zH%rCkkKRRHtD?6YZeBB!BmHviH_TtL7=4EiG)m8EZ)eY&i@EdXner#mA?TTgjJ939@N|0@Mh^o& zZ~vZ{zj!SM436c~LC{ko%TR>Sjs*nx!Opfj+P7(J>=2EiM&{#xKLlnQO6W#U|{f@_O_2 zxA67X-{OU@W$&es0$Mesf$NBSTaQgHq96NR#`}Q8hj_td#Nwb(y;VsdmKJ(_A zCh!RpCK&4^TWc&W3y_vldulFM!-Y<`q_q`#M)ie;!Y3pQwtN_LcJ+WmP#9cd`oSS) zfa!qTrSAxI8=M5Y_=)JI7&R5P$+O@*X(2pku7oQst=r7i2wAxs{dSxp)SkuQ0~gV6 z&shvQZ~>!Ezk+G6evI+Yy^S%1-jD<5(0k`m#GZN{qb|Hc=p9Dzrgezfy9Lqvwxah* 
z#tW<9c_I;QH+iD}xp-VdCW;8EQYyQWnqsc!)6x=zgrI;|MBu6OuwIuCV%aPVmArY! zj2~^Ws)XKmKM`0Ze1g}Pp{Sa6*d!*+Aoy|#M%gk8^a_fL`1Gd5A^37~vsp-FsboJJ z;G>T|GGO!a@Iv#Zgb|^lh>D6f4H%`RrKUcwU%!4v$q8O6xK2(^50s#;hwAUN@BR1R zH^=AFGH0ZxnYuqgPVc#V`LY>ZqxJtU=}%?IseZiJf<;i z82`8W-xZW0PxEUOmf^sW6XyL5soq|E^$iRf9*2g_9_6`ehF|`mdMYk6x%;y63z1(^ zhDpkHQZQC~zsJC(FZY}N-dedi2vV14T4?li|GAg_(0#99D{*(p2 zQ)LvPEU_9;N=ozj`9%hJC1rfD713o7dbtd1eV&2d4NZu275~2a8x{z9!z|EKLq`0V zB&^@G5wRmj!q4CTfzngfq6Y13{P1)KcRbbJ6|FkB&}zhB-lDY_I5Yu4VV252lUtth zSgp6tw70G4Dk=cF#gY>z5~$Hz88Ag55h3_cns??1o7Sj3CR9b zgo7|8Aqlf5O~=9+i?L?e2An;44zIrY8s6j6`8^hlcX`p@z5F3Q{On_V`OQ~+qQ6hD zUB;K6eu6JP`IvDTpMCrhKKkHYy#3ajc<-I}@%EeV;*Hl{$E&Zrj2B;c9?xDpN7$Xf zi4(_g_^|rn?Z@_QJF#(tMq5}qAJ5~{W5x{S#+zb*XL@}zr0x_dV1i)Jn3xBaR=^Yd ztn0pkL2&S-rFHW{XE$$j_YZ+%R3A9?9fGdCXlbJcqC@{U+Sv){5;qB*2|b&XIfUH` z_$}NF4_aFH1)C7E{t)`?J&!?$pU05nFJkD4mk@XEb&PxQ10+BDHWJUgY5*8}?0F2N z+8cQAEQTDrh(SkBW8~SBNV;?!gU@V1USr>T*z2EAZuUJs+ANsF)U4R=1$40L9hCn4}w=z-!#1F)n{`+4TM|zg3RR;tUkm z6{7Ob3gnb!n`4?k(t>uYDQ{#8@Cu5kB#R4>!h)y*BIsS_byde=)v*Z}&8dtU5gLtX z>jW@?@6%5|eNbSnz*0!*33|F7)_w21^A1dM0zx+J<_(NX8EX`!_DShk>+~LmUKW++ zf7j_X#g=`?crN}%L3usZ*=-b;R&bxbqvd<1=kS~;{mO8;^xb*6qxREK~6&d(t<`rVxWT`MqL&dj0{D_L0TI3WI;$}uBatOY$DdP;} z^x(M6`}{^mI*)IFW{(oQT$WW;qO7ulkSaFkN%!o3)Y$v9ltq{^J=L6N{buzqwKn!# zym*0jvkVg_@qcJP;M2bz&Ysl*@ zND6vVbW(Qpgj^k?vQnL(3sF@ig(piZYg^1E9R$Cl?g@d<3O&nk#57!&(j)i;y+x~* z8sH69rWHbOV(JvE-LM{EbX6Ku5vWjkRO@YoY2qT6Foh9ppTC$qFo&@(AyV-J)_afCK`k6hhac>erdab80^>!BfWYe zF*F8Kh7QH_QE`~fr_^#PtesnS;Kk=&#`|v(TCcs0ci(!?0p*f((L`IiX0=S<>&lP6CYX|JdK zu3fvab;~wvSho?YR;z~fCaW%5Lf9h+P^rQcwIgG(9zu&?L7R^B`69GeTJZW-(d!N?KH8^$noehY6^ij8;(}~GS+y5I ztM?&b-9bd`JcC{bpT$7I?%8)R`lTnYPxt@;|MW>jK~xWsckR;Is&SmHkGQc=xzxLZWST7rT}GkXt^p1Fmc9YG-yUJ@FxHCo6k^v zw}!yuduV^v;E`6GhFmH=Q=fOgkO+FE<)!9no}HVC$x|nyQ4@<|`QW1uP{zV%6bykf zGA_YDMeCb3Z8BC>4N8wc{D=*x>R9y4hvh>edrnG)fD?qK8NheyQcqLUP%SvQuW~X2vX`tcQdcZq<2@B zmVaHnfrc&A`q}_dy?W8A7UJbM-{M){QK{}VQT6m1FObV@1K3~9`Xxsnqg`iHm<@W 
zj|$)o@Y2iA8MyU~2{%el+caHcU45mgmz%D8;{Nyg@~e+ceO*;m5o+lmN|f}Vh*phCnEx4KK560c9bMmrvD`{&-V0hw4j7&608%+(MIZ*Na{ zd3j>Q@F8&PVS`R>pF+oHTA_RUE@;`bCG6Wf!rwUvUbbFv>FkIuE!)7QOBeV$*wFHJ zf}dk&c-ysyzkO%)@o>iwuMqTijz&=HAjEX;kG|aoV2E=+40ek`Y)~W;d-uZFp#w2$ zaDU97J{89gAENc7{XBOOmo8qyOV7Q8S6+A(ufF&yUVHg9y!z5Bc>R@E@z(2a;LTTF zEUAf#v_IGi9S^p~@YeDSxm1CL zw7|*%$M?|==T+pH!r!EsrO=d=(H<)iC~xYloD58|S}KC?BdYUk7ETNFs75AF!K&43 zj8#=`$KQVY4Ss%pW}mtbNzG_oA$23QrUcG+3slWUA4=UIu1YzSAaB%gcRW2p;cz*c4lFDjJou0bS5PaYN^b?g^6*5#G z$?uc^>Od90mtTMh*1Afbs}HD%^9l=$t!>nl=auK{?q7crP-V2{8p&XRzxDTC{^&z0 zy<$vDwQOrl<^1WVpV8%<F9^YTDhQQ8&~uhFb2z4Z^wY4BjHanhDCgtzB2xceaC57khX)xS;DZ?a-Na*QRZ2ICN+Ym#*#M zN$`2wcS4A>BYHXcBecP&pwOipMA~}J;IXwI9 zMO=9H0?u7Lk2B}b;`F(*I7!f*IB|-~?kJVr!3VI@uretKbxC=6q=luGb{xSrhUzX6 zqedqpJ~09DqvA0#e&hp-8x}%k7epHy9$`+nfEN-Oj==Cxcn12x*~<;K9&T_BiG*En z6k5Ca;b}%UP4v@iFl_n^gVV6l=n+2wE~8W7Gi4#d=B-2YvTcZ7z5_j1?ndvm2QhT# zIm8`&8KaK9gM<_BVf6XSnDO>MkU(`e^!T$FdHQ*bx%euQFTRer<1d&ux zcMj1zr=d-%3)(F1hK}31qVtKa=yawt+8^$K!7t0+UNzlOO6zDh7=>IfyhRIJZwYl- zDZW#R(mT9SYpEnE^H5w{jHz=89ktvNd|&_c74piJw1CH|0fZJ<{rHM&2?!R9MgkcN zLSbP(pZMx`N6Va@Nw8UoK|ejK zZBtl3tC1iiP;q_Bmd!>1?%uW2^aT_M4(#8D0@}!Iwcs+29kbNQsm^ZSk&`H>s5KTg zr>M+;$}~*0c$CY=25O~MwJ0pBL~%tmrp=yT(Beb_8jLHj((xgCwp6ppL+&w)J5fOz^aicJ7;7~Z(I-)%R=g`d- zww=15OWStn)an^@XxS3&pL_z{+dKpN4s8j&j_|VYPQ}+10rpOCf7$_lZ9Eau)gK|Y z{_wZ)MwC|=5(bULvGvN#Y46@Dk#XoRVymo+d<()Vv~rO8FjUInjh(Frnug8UzOq7j$;AL3<}V zbfLBH9vFsBRDA6jUBfiwtS35z_kvBop>T~K3(w>!@S8Xbev{|IZ|Zym&R9YKu156Y zO-R^%7D@YGz{uS%V&wkUG3LyNnEA$c1l?O0e*7YmpL-RPUV0m;uY82bFTRh-FT9Hh z&%KUBD%*rJd*QyUFIr9Qh|X)fqU-*yXm_F`S{-YPHU~Rk*lQ!G^R61WmC}-y)E83G zF>b31i(b>JG71Dc({-g3<+saFb*~Z$^ve00TkvxUsV2=)fA=q{Kb{xDEj?Ryp3qhI5~(PyHy|LJ zs_xCQE>P~|{asjIem_lACC1B^%ic0Rp!F8iGO0#oZwq+LIN4u9u@Qm(;m03SA*o)` za^4iOv}y!WE4M=MR|8RFD(Fu?|HRnEDXA9g+2YaW`0CrQP*GKmCCe6@Z3+)hLgN1V z<&WQ|daGdhIf$leXyX5{#v?5(4I~PIPhD5aXisaZ%Q0(a>Vth>efbHkY$?^9GBg($ z*i}`QavRkxAy;2tjVdZWYo}!04@xIxnGPV!JQM}J3_@SYUZwOj4dn7wi;<8tg5@#} 
zbLY=Cvkgjg`1#A$s@t9F^!(uI(}VW5qYoZ=${CM6?ToH=ffzSwA@&|TjediMnU;=F zGjc${3o;`#ynTI*?;kp7Fj7X3fv=|*q1PJSJ9a|1_8ri*T|0E3!t26q3P*15(YXuU zx)FG`ws580b?amU*S1dZ>gWm2P9E^=>VYtiPz<5cNf;K7gpmoDJaIDS&Yepuvl<&V zZ^HVGo3L)fCT!Zg72CG&FymeYEA^n?vuBTimnxoRNd&Y14Qg^f~Dt3Terfdgs(sagp>O^O~GW|>7! z{Tu`Mu#m!Q)2#y@Yugf!v~PvRZJ)stb~b3`(gW>%ywN>83|(o7yF^j}_Ui}xL4)8! z$cFK09-KN2KD@#Gr%pr6!lf9zW-~@^J&2UU&mv*pbBNvZJcjMPgi$A6NAiWYFz&gx zG3BNAFzby^u<*TaG5_r^G2z))G4kkf3_7q2{SPdM^P)(!nb{fLH`}7y!S3jAvJ;*m z^jhz4hXF4P!gc-$a8h|C>Yqm&t0*SyEYPF%rOGRk5+w8rje;w}{FU?2;1L!Qs=Xs; z4)gsbD6KCu;LD}ztNpu<3aJ2|z8(+i?bHfaR#9r|Zd6G%0y|txQFw-?wYpt%AfG0Kg?|=WB1tsT!8dRuGPsim_!R2b~Dj_2Kr)`kKXEa6it$V1y2V+-X zed`@G7A%c5R_J>Rbe2X8D=0^dN;GMr^2^GRSx|^E6D?3vU%mG}BJ}t@H0nZ;Nf@W9 zbMk#Ce(}}U<~O-}dhj)ECso+5wEk(zKgfrH&Y2zzf}ZyO^sgUHwZgawDTKcTYF~cy zCBd3y(y%wugNxgw@T|*EsnRs?KPf5UA;9rr#C`vH{h#&EKf8?Dnu-TXPsjc6k8cd* zr0%TX)9bogDq@o_Pl-zjeL~NgKhH81fEuL;Jmt~TG@raKn!xGGbxq*@8JIoxS0pfN6(&=)BgSs>hlCV zP15b^?T+@{JED23$I$Re7L2EOqd(OgjoJ}_-P@y$ha+sm{bAoT1l^;9&^;y;w*8{u z8aohf2}96zKu&~x`RI4lfA$NAl0v&9CT_I5&>qwVm_(Kcwi zzdiatHvl*IS9+@$WrAD18Z2l}8O409E-EDiUD;iwR-*E|RbrH$p!e*n7mUg?^?HoS zvnL}ZH3d`WrNT4V3+s1oFiLOL`c-InA9_CiKG=6~9~Jx_-nfaDLdN$-@}VosX>FaA zN|Vx)ZB<8NEAX@w7-ivfy{z_CfV1|~({X~NKFie8$+qTbY-h2tfKrnJ-}m2tXWlDt zKJmm8rkZ2N&Rtl$ZUeXLe^~(a{l9)ie!0feRuWQG$SQAgkvSJoQn?cApZ z5J9d9rp=m*)90VXNhw-O17BdOe{V#chQi0;|TC_GceM1}?M-t!4?hEJd! z%U*joJpK%!_p}`g=s;S?Q)Z5gsGfaH%WXiAGW#mqilyZ-G%NyMv}U$8-4GfafTZ}5 zh#fi@Ap!nyw6R5dTH8*9UB{=}pwm-rVArk-?AvvN1H-Ys4QyL=hF#n4=-Q?`f_!NQ z!+ImYF9eYhJPGaiW zOGr6!5hWj~z*Y+iFSvVZ+C;OoFWE-@ZLuJ0X zE86dDi>LNIjVJd%jgAL9BlgvixGCrnaAj)2Wfb1FfUl&k)P$6tz$fTg?QQw3QryVB zhWM0`55U(@V{f&7e;C@h>Elh;mS3;`h90y|4b@*yYC^%|n>26oKzV%p>Bm%B*(Uje zY^)0HWZoZngq4)bk3as1&p-bhKmGI*ifQ+x%B)sa$A0q3CuX}WueK?)&2(s{QpzV- zq~KIvCp9M+3Vyn0tjDX)ta*z^X(3e~X93r$^_y`so7TVf4u8un6qMJRsUVGNlhSM4 z%ydza?UntF={Jb`EMWWg$DeWQPoT7_9yx{OD6PJYIg3`9_cnaw2^>3jk>{|$q?H;! 
zjY>}SeT3e-AAgGLnK^uqIwupfgkI{*Sv z#+a@tnx?bI%cNM#H{Iz znKKQjqzGk$1uhNz7sS-~pb!|Xq1W24<8-`2Fcs)jIWAlDaJ}6Fz>TsbB78WMoKOmH z)aWErXQ%!@0-o%wX}I9|3XTYYZ?GTiUG4E?`<7_jvI&9L1Wli5ibky(p=tZ3XxxsH zze5u=>C_C3I`ii4%tFsr~TGSa&=< z-Ud%i?to|J^5L|h1KRCqjkZU4%^2N|*dX@hSfmkq0^WZ^uY|xWGjl8yQSBA;wUnQr zS5TEtD_e}pJLP5s#<8#d(Xg3 z4=;62mf90QO-7cY5;Ly0fZ!`BDWxLR@G8q#UB#tKmsqGLvHNQCrrj`Z@-V6 zqH+{f)bV$gqnwhixat=2N~`((3C8;g2&4cDs09D{+cnE}Ld`cgf>!trD(QRkdAy>i zvL5GOc#Y>ros^qn_L3FIF0DXDQ7Ohxr;-yyn?8cq-@j}=v#5fA%_%@--7SoxBQo}v zp?-YnJRf;QMJD0E=Iz@|!U5BgOVC6_M`0cBkI%pQl1h%siC`(OETPh)_2>Sk%^UF{ z^PvG=dgcw<+)M&-+ylS~SRa4#K9((CXrQKe_PH}=jBLFo8{)FEN{L7d;p*o2;61~eMO1dy#*5>p* zbmNvyraaDDun0qjjv(v=Ju*vWGuRvcK~C@u>_X_ZM}tS7>Q0r{9geO+n3OsnYd7p5 zgod*qhnaNd)_!JMhHCf}ffp7YVL)i-U<(IF($n7;0|ySk@S(#HLLcqk!wsFn2d!eW1*yz(ALr07x=td)sUpQ_wfkJzuh)aq$Eu^D) z!|J9@9-WMF$>T6Fg%F%L**x*3$^=9~ON0IudeRGIvZxP-wrQV^)qC__*&Y=VsXD>V zT5qRTSSw%yN0cJ1GKh8!}h!pI`IbR#2cwAVQAM3&7E4Hh3jKz?A#p9Tpz{L0j<#~npU*G z1DuAqz$Mlf4nu|MbUV`xolkUu&0$*%eQp?di&43iJp?_1 zjY_WM{=_n++^70HHDm~UQhGWzzdR2$cdJl+r^dWO50>dm25y@Q!B0D zC-7M-@3j=*6goyr0j;pG$UrYGJqwkU)l?GdD^|> zcouSBh9&?nP~B&}nFJ`0)>vuc)z9xrCLzu3*QqYEizlh_v zJO_Ea{(3M`#VOl~mKtH9L73xG>fLdPDTek`M!Ym$}V_JBn`gE+ip4|KM z7XSM_9#_fpN~mUBH>yriOvqL8nk!#k1(#BIYRR`kPr$Rr!((o*Gyb5$`~CVK27W8o zuEC_KEdMj+(AxGTr=kcu@+w$^0=(cKXv^|RkrvfX(CdoFp6W~j#$W+$?TpzA3Ard! 
z2`IQlMnxNmsEYZBNNrcI<}gad06sk;;qB*x?zV)clM@2DPqU^)MMT5X-3w0kPH=JP z0XHW%xY)6RIO&0ax&^GPUasLZBYdoy*wEn-=_Tkq91`dL+pZfl& z{?0O7Phk=X5PELD?r7VsEfpRi*Rmm+JVQm-n$TB@NM%EziSmz&mP3sBfBts_e%ILjYpeg7qp*BxXtT=&P%$&b`w8uzb$Nz zbVavQw7G{nz~-PG;$Dm=@_5tN)3Ta|h!PYLdPTR&2`@pf9EH^uTWeHa4R3xzuZY`f z|EytQFEp^LRDQMkaw?QuGn}osMsTb|UU?w_S%u6ZS{T_W!Y@w)%L%?hMyAH#7Euir zD?wLm^lMt*k7FWxh;z|Q1y%qos1PQ-< zg0JfK-+UN==Tz^hH+l97DsR!w@*HzpzTQuDSbB>}P&xXzo?f6a%e91}&hIUh)ZFI# zDj22wKSflIvevm&fP)ZV>Ixbl0|)}Xq+4)u5M5I29LlrStH3~O%H6M`j#;9mq@ z9cqpQ5qR|-YoGiOq$7@}GtD5`tup!j=zZ6n-YP2xJ%?QBqay{Ur+BM-*j;VLF!p;&J zjgf9iG>~?UD(^Ql_BM_7cj?NNn7eQ>#!Z+?!KRrTBgratRq-bP-Kg}sBC=-}3>e&% zE}(fUw58n~G9n4nXDz_U_@oD|9-%DI>Tf2HX&K7GZ6>o(MLi=(_E)`}8JgA0)5iy% zUOw>lVS#iZ=oHQ_aCdcwAGbwCL}BRQp-4<1=!}{o%w%U72?CrViQA?2xEz(F{TBEn zQ4J;$cuAwUBmk`{%oXC(HMGt8KCAN6XK7u%)%^o3P2qx` z9s&k>RD1rk`yT#YXxp_N9&f99yvAtsbR#r+hW725`|7S8msEICca5p;nslcMRM--J zj!n_5hZJ5jJnr)t+J?18*Iu0oy>77W*A0$??BFuomEi-I5dr8iG62pA{%}h4Lyxfm zaGw+cuheLS%pZZsC1Vh>a5P+}4Mdl5{%AkJ37uyWaEm&_VS@u4b~wOpe|Oj(?TRi` zd7ZV~(+$Hf#)77T1)5K4jlaDQy<$SItggb$w@`k&9NFc{z(N&RQ)JRgmDJMmRu{2w z6HtVlfTy`u73K96W(q>7#9S(Y0xEza!Y+dU4x0~!LIaFEP zr$PSZw3N601}X?8!A&qLrEC-Iw4Pg1Wq>EZXUJ)VAheuUit$|>5eJyU?y@fyjzH6YTdOQKIlGj5~K~Ty1R@YV-(B)F~ zN$u70x@##LE7&RHvVd2?hn%3NZMAd)0;80kIwwC2H_MD>mZYH8ZR(yPg=Ynxx-Av+ zVP~?c+|Qq91)gcZNKeD{G^#u*ysK#zrFS*$IyP+Cg2l^MqEG)J2nf=!&j?dCgMxi1 zPVA`m1Ux%}?+G+${zNuwC8-PFtPexW@^0 z2fCsA;m+uEq?2jH=)AKVVlRvYpRu%17U-49)>6rp5OkVjp@Pbw{5BzQ*AkVqvu4x< zEo>ESttOt)NDhIooQ0r-c2-eH_!VoNwnvbYx+|v&(6Xee3^}UTsn9ShDzefdQ>Rx< zn^s-Tf=OjpTFio2S%n;9Lkmz}Pw23)3ux+-r)>%~MkuVxPO!5oLa9SpS=rm7Vtx;* zI6+R9SKBM7DD(6A8ML?pU!jgErkW-Asx*h~t-G}Ef73=%l@Vse0v~N{)h$|8!Y-2* z)e1h@T|rL(lp>^zCU$pZ6pGWk9MuY#XnQcuNpn~IRORn~a0I;tQlHG;2@4-X|675K`lt59A= z6<$RsOJNZ@s^hERH7F`6FiSyD(35Jrb-Vh(kTQ#96|9s2h3b%?tI>7ly0v~!$Lcsi zQmLT?x~%Lpv$W*L6a4bHPa`k*aI8>ug#~!(j4a@p(K%9j26~DtsyqwyRG)X{#uZ$r zO;&y0m7CYF|L|d?&YVs6*#|+iwwi-m4mB*ym#WVmF}>_CWSA|X_Xrxa>*9c3{Rtfc 
zuV|*9#MX>YQ@sjZ#Yf#gUxtPbR6e^_LI7x+uTmDTh$eg+Z|!Q#~$7HQQ__HfDQ*cpwoWt z-(ib{XA{BaEER!f(4xhajlFvxdIVmj><|H0LKteKg`g+cD9#oGf;Hhpem&&jKglmGG%A=*ixy%Ssj1jO=VRmr{BHpVi(9 zXo8;gwLm8|C)nxr!xqX(;A5UEi?%1Yf+)Id>KUPI8|rW(G(-&sS**BanS2?~HxgIZS8<_eS*#^%=X_g^L-C_71ex*Pz@GS-SV??XlXMGOYb+L&?*zYN;PlI$fBsIBOv+REBP>} zy?w_dC{T;8z^6Wb1^oT8x0)p_w;+#Dt1^uP8kwPi{pu^J^i2X^86Q4UeAaq6DKdo= z+O6A`WCF^da_`SNV|S~0y#+m~JHbwMc(?D=n)m4a0-pN#<>Y1%lu~p29|T=xWjP;e zmAuA;bvYj{{4TP!)}*0U;ALdV+WrT6%C2%fgElvv>r{GYE?ht?9i)Gd*c3&j$MP6R zmW6oHwc4QnKnKLe*-wS-WG349v2rcj4lO8RFwa*Gag`j2) z>-+TCR?z8v%E;TRS1)*Zd704f74(!fMbHc8IrHZ^bM|mS>&|V_=!vETUK0Y3pld~I z+lJAOfRoa@Z)*+oXlWbT5PEh^(Zs1an!7%NM?Igwf=`=Qh?A?&F=Q^c>+i)d}9yxqqq~oTqreezF_7PPRvqAC#ZWOFU+RD`!^0SUTdKIx_IidYbIldA#4 zI>0*8)td=DNS+VM}G;)CjS}M?LWgk{`+Gd5JRG(m`W3<1rvXa(T1(E8L z>P~fh0;7)AQlDe3CoG@>%#jk*I9fGkl+$Xe-mR(*HB@3+l6hubFel)ylBiY1jb zRGk$jAwfm$Ei*<}3N%xeoPaLoZ!47juc{{SWX}m3{syI_(%9V`Jpc;%8>y<4$f2f= zaOT5B>&kQ^7?xL-5r~z>{{F=P_&ce}iZz0Q*Gjfjb#>M{K7me+98!39@9}*6S&u*e zQfjF>6V{<+8qHxDV#d!Q;H2*|Ui0+aEpw>WFM{pZL!FQ?%8uo^9UAzn!Y@PzNwui^1pyN# z;Qa@BQhcE-;7TtQO0^ai7J$%TKZG$td3+e*W`clI7nTq~lk28av-mmxEZA<=y|3(!)1yS zocP*)Y7caqYKP9#3B3ihw`*u=x7wiF9)fRwXLLKnIBEmiBaU#|>w@GL#{D<+2)wd; zjJvdWx2ueTlbtOh>@>8jkn4h;U{`mq2H*elTP#|>5TQ{aXsmSbjR^b34Ujq~6SS%w+}!55Q~;Bf{VKgnms~w%;S{cLL&t}`|MM~s@$}`Tf3or@x>PgT55f_ zc1%{N?7tl^DA zhYm%v<}L8ZqmT3bjWK@WRIJ~))g=AgfB2Y50&4owDb$azvAU zl$unV*2@UMfBt$Eis?w*Wg29_I1NEW^)x&_7W98SgI@rW=i3 zH{nC03bl9sBmlC|uqn?WLw)y}w0HzR|Mojo9wA2%UZ--+C(tMG+*z};C^Q_;x~{oh z4j(;?`rCE5efJiBJO2mITQenxFz42CoT9P18Qou(6;^%VM<2hB=`*Lo-O~k4nl&`x z?dxH>zr6bTOH^^1qeEF#cwZ5;3c*jnliF0K9XKHk)iAgZf^E)bM6k$9yaiHbAYdhGXlI_5fb1{xM_3ehdvBrFx^v6Z9HA!(|&PIa=FB?HL_;6LjXyXzKC=y~b$f)RfR;I5)#19*?3` zaBFmkrWzaU2*+VP7|sTGT3Vs!64!$Y&xycugliHNU9uP4Q@r6Z!3&S|;iP4LY(-xw99E=X4BS%;%3tl$&eqy*2L zIb&X1->-+5vA@MsoP^%tqesmD)c!+<4)U|}af5|QjchtzBQvx#1N$5B8WL{fCr&{r zL385FIllKkgdV_2@Dc=@V;rF;<);>34Oh#fLes2fzgF&KdjAhQJ@)Pu=_N<{Hak93m-VpGtDo>Wy3O(fuG_bQmkINewX;jPwShQlf 
znN=__G#q|`;!H4kMQa=ANEg|eqNo!_4DW!%QG{MVa1_EKqv7MPUdz5_T0(!qD_n31 z3MN4O2sw8%FGi=%?eR?OC-GFvW_afDMri+3Q*?T!1>+HPYW*0xb$A-KUE0CEdnfpM zQ&#$U8{2Ayr&@o530PAX7*22zc7BARZ*U;(AyuGkuKq8fDA-x!e}QM}(u~^Tc4NB< zRH{COV5iW!fau}j0TA*IXiVTSx~hAM8ZUVB z*z=}wq{?`xdrFs>E^rv+0FQX8H>2uY;55P+&Qy0UBQ5sUHQt%PvxDnsg)3Z>y$L;k zc#QX_(({4WR4>|FPk7Cw?VaHUhiQ)JK7(MJZwseY_Hf!{j~)m4xyPL0c$_Ly&^yNE zAwqArE5<%Q4h4i>oq=8{p;toaF$lkEs=pdRkD!yi)wo)%SKn3ljdHyI@q6Y5S9tk* z;`o_kIC$&;b{*V>$ulQo&E_?tQdQ9A z9XdiK$nVg!X%hoIWed4@;hgC-+k^nna)ueHGX;|-P2oM{joo|pp`7;d#F?{vFCo#S z8OCwn;p3-p5JOEOuy*~L2M|n78O;aE0W&fFu07kS z+>>$U{3+xWWmDdVj6c0o1x@<1j(|AlaE%}UjbKp-# z_y^M+vb>shgkUe0XB!0gx1$Jq5`zb|Mp7c}t#42SeK3`aS|81fXFf)$g$4P;$K90> z>x8yXJ&sn7H$%Ipnxk9$C*atnCA{t1Ai%L5{2kiBv)fbX(eZIOb$A@kom-+u*H-A> z<}q|`^$0pVMIg3%0(PC+!QIgwfnJ^nrCn9Mp`~$ys*W~T_E(`VtNK7GLB)UIC)FqQ zCzYrSCOrOu)%-uQ0|! zZLtm_{59rSsA>2}^!YA@QQ~ap(d}wh!;XQ*2kl=Hi&fhu94vx#5;IYLGUVEt)54ysM z%Fh0<9qbOm;}#Ws&6;AC&Dic8RwQj^ltYlTKxSXyN*b#Ypn`agk#(9M`cR8+1*+=r(tfHQuF`}O6YN*oc-i3wO$d%N$jr_|MNJ)w%PUPM=mJ_%bw|-; z-mk8p%QE{qW;YsH(1^>g9f(J56XMXi4#X`q`(ZPF3eb9fpq|Kgx%L>bxKlN zT}740hhuFuZLMYGg%U}8_T{JC&(AF_q-D+FaphcQo9;2{{H(e`HDs75FL_XxCu^IX zr<{TRsmHSd?>bf8)wCP5wHAe^kr~P@sF5hs=gcy~$=g4`G^(2Vyikgq!0r@V9a)Ya z$AJE=5Es`P4gA8Q;7j!-Ldt@M5O%?SENr%&(e0VX(4}QlIJRy9xAraI)u{#ix;%oQ zZjZvhV>1MHYL1Yuk08SKaYWiZjxd`?5Xge<->ES|Y#v1*j|+Eb1^=#3!n4C;aD1u> zoLeYc7#4wmP^DW} z*BE|RLQ%uyOe+o5mtdq0(yE6Ps1*MJqn5JW_k$o62o+|3kR?KcLrqRREklC|STlZ) zDo?@f0VedFsDgQ7?R(gxRfkq+{!}wGd4fQDydj!V0k?Rj`G5N8wWgIN^cr^&@c7!6 zuL-?ow6x88G{<9}kD*g!C-fNX43Ah3c*fJFKBV+|jHJq=rFBc7s$;l~a)o;m!8fJ{ zJjS|F;d#Jgq9;|JH-qnGc%}Nle@+0rX8RcE+0G>-*Sf-c7x(Sw|2X7{9tUWj4>-b^ z3eRC5V>h9D#ZF4(<)nDRgTX-|LlPk71Rm}@=XCNr20~nloT7d)z#HKP;~;= zS6^w4g)%d;M%xsEp6dCm{qWR)QL6bQXz7$MQK{t{-!J412`R1XJys=XwZ2L{mYbJ{ zsj1VbK2^9I;+t>3Gsy{x2(b!+rj$S{E+LEww}Vt#jm!*ajj3u*(7M$TEQAF2X?C-s zvP#0KmI_9#qGbdWEpBl+Rp3d&f!~Mk-MM!^@~H-ksRGNYYPr9RU@Wrq!{h(VBG}YF z@DQPDv93*U{Nx#TY2(dycq8z;a?)&^6`W=n1W$SkA;pgi6YYyMH-+jlZCMnyT 
z{5(Q$vZ=?^Z&ni4ni|4M4MYqL$Fx2mlzc&5hE(S!I31&^GteX0RPU!o5RIDAvX<9Y z`S-N1annY){K-eAo=#CjpeZVO-PKh^-BgUyBly%Q*)p%KWzwJj)aTvrrjnI~o0*ve zUIwm7=?Q@9pqz2d(%6xmj~i*(m^O2!DU+rVgJm=*#2@Zn-4N5)5m7N6;O)~4A)(C) z^$uvD32OX9E$S(RAn~K+aJTOYr;bmSR#vQ02wM} zW>I<;g=PVtASl%qWa{B$Q!OC~s_xLAc2~!#u_in$lE9p_tFaQ<#GF|_VyBbdz|2~*9mqz z9pSix|989W?O5axds1RZ(hfLHTZ9jfke+n=?j@0~gy*ART~f6BtAf$D@_ z!)A@Je#-{DfBAjd74-unBpGTc{O-GNO~FzKYJ!ytos^x{1thi1YMWHuqmR)j#mqWk0hu6@I+1BxA&(fmub7XsE z<S4Fi~LVH_|biyxJ;Hh=iOBgamf|5L#4AMGd)l&X% zjW8)N2|cXm{+C~VG3VTRuB}xGO5pm{S86okHRJVCmy+Wrj+&|nf$jLoW9H$dsJ*43 zYGxOILUuALbC9xk%YU7ms~t+TsrJ9xR+!pq$e{=V)6l&@(; z4k!5ZJ3^`4EVh)&PvBEV^l7)cKYy2c_&l9{s)Hzv=8B%(*wiR%X>A`z zOW&u^Hnbf&M07x>=uW0@o-=K((=cZ^kC4hE@a`)-LC@u3=#6(b&~qJ6FixQQBlz42 zKA-7?-Yh?O%=U)!9D;MXE4;Q5di$v2_weVyq?<+nNEC@OKByo&};F?WBBZguZ)_K$`kAazN+e49#@HD$4^s9X(ly3 z@ETLuH+$5CQE{}d2aX&kYzShiyzD$GDXy1R)Z*&R4D|5uLZb&c_M}2Bb$qktF2WB# z|7Pm{vZNYo>#3mYvHQSbG}gr5f*w_63ID%FizxC6iV+>t+kEH71ml$}*Kqafb@N)# zi|*CSs65qwsiU%#UgP^Iqi|(!HFc@3mEs>iesB8n=|QDByJLi&b>Hz* z$B{)q)l$i;QA+Up^ovg)?6-y>=;qxl_BNx}NXJH!ep}^okQ-;-G8W`k-ut--#^y~^x-{!RZ z9S|Ay6#Djm3Jpw`5h|7-Z#VcjcZWMwmTLzBryZ?ox0VR$_BbKc1U}CYdaWA3kKhZW zg7bd50sN`)9V#(mi7J+#HD%L^u4^|jb!Z#34R4Dsy}H0bEw)1);WXUo zfweWzyAM7$L66qfYXYt6M0Wy@ARAAmHjzLi^t@;Iz;~`cychVwWq~UkmpY=y27b;i zzIUGsoC!I%{RHG;PZJ&oJ>h>e0D*@BG5*qci@m*9ZJ<|sKeTQ-Am4{x&7XBBsiEz= zRf*gDQ(wILzv@lesLeaJz|qwSO&%eL9%|9+-M80&DLsKqN>55dedz={fzR46Pbx&J zPD;!QJ#ABR3BBJ6y#H;<{rF?0l`5rur9!$7gTsf9n(0EVb3iBuqb#i&Nc39WTcq@+ zQV|*$@qJ(Nc&nwA$}6v^Vgc3I(@JOpc)`xJ*b-VM1hD&Y;>>xJRo2ncR~g`GDYd7* zen0t3mDk@!X-zFH@o`I?9lz7jV<#*&njlu4 z-Z%gF(Y(JIAsQYLMSE`wWW&f0*HDipiuT z{*y8Ju_qog7E?n31w6H`9yw-d^wC4?;L(G;A8Jj4i1L~W11r_begDIEn73f=g9eR< zW5b4ZsIMm^i`56R=z+EU-^si!N>9McG99axKk$E{m!6$wpqG}OgKO6_sPtx-vT49a z0pu6pN_*Q8L1FFTAN(YI{2pbweF6gpwMByns-$2q7c+N-YsZ$fvn}A(rWxD_wLn@@ zE1(3rFc#bpg2|%Z2r@=wx271-qb2$}K8A?SjS$|kA^K4Lg%NfEgjXOT=1W)#e2UO6 z%?Q9ow84!L(TT6A8pHTGS_V=@3WQRVeq4rhX@Ssggk)#c9nv~GwnVfOL2UOpyt_OC 
zcYZ$~o0bT4Yy-dlhqb?ex3XHh{_)-2-QC^Y9c;S0yVDlC3oIlR!9qo_JC3bKJ)W`= z#ZD9fYy8HTYfs$raL)6-@BhF2T;4VBSZlMsYm6D!ObbUzY6JlFKnlN=U?>v9d=MM# zjg;t6WT(cXfEKi*Aji}f)>S*Y)F{W6e1A39Yw8I$!mxsHlYK2`RM5f-gxV)%s1MXJ zcv(R>5P$`=+|h|KaPxCF(37PV@SOdra(wuQ7WAm}Tq9iJMzFgnu|S$TJhB`eku~zd zo6rj=4Mb>7xPe||LnIP96A8U^LXTE=IAJxCFr&TAS6v=~CzY2!As2;H@=>I^yqN{a zn?VIgrDvcwhZc815sH@3$}TTJ*2-+8ot%cub25-~8Ns)Xz+BHCx$BBizOkH9fr=X{ zQNN`D^&9Fi=g!&SKSc-hXju;(b%cT5&-@aYIEbHqKWu=f_>Q3=XHt7V{QM)n{qZ~e zc=QlnfAe)*bm>J@F;u(#NkjkfuYchA=bz&jm7fnoT~Y)(*;duZ34V&7fBxCPPk^&3 z?&X(XHv6plQz$>8rUOvFI%Ut+^6j_ZF<*Q^PxH|V7E*dre=RMo29|2sSAto49RW}k z{&VeovpnbVC!R9OP92#AKiS+L4;{vL-~ULvr|vNPax)e#T8eA0--Ii#z7Csj+-_9f zQ_sFYW&0UE{o-4r_GE28`BIG>UzpKmYbS-g)l>TzdIc$F3z{zVhZj4Ddcv zJtCpEahtlwI27x31fbNN6r&nHZrX7x6_rB;D%t7R+;@QBJN2|POnEm?FI-MV`yEyD zN1uLz=~Q|Gp{=*nx&@P_<OBa%PFb_x=3dpbEjgIa&%bZCf>Ix z$(z9BK7z6)!3AyE0jN*$LNzUR1yy1>pRZgEEVReP@%|``=Q24IMakjFjtfRQfu0^4 zjEuNYq{oG!ASW4x`ROPv%|%5;5h|+6_+eENarDwW<0Dm_9^wl;4j_Y-*ea|yl0 zMaWy6kL=|RYg=$0E$@}ZD7lt#eK9IGRiJ7!-_sVpx6Otp|S$scRy)F#hW9%wM{(sE%xrc_t?GFQg7Dh2C@+CXVH+|&r&SgBR9~l-Tr;;V73G(FZ?et;s_F=5&Zffb@8taWv(HgE zAEd>l4J7oo-n@hR9d`8^D%LOf9N+zT*kpm3IzwGh_+zWSetd2lo?bq*-@2ZQY0dpy z^``v6n4v}>DPvua;TIUlr2~Af9_~2fthM;zr$e-?{6RbTJrydSkMO&8!*x8*QEuIJ z>&-|S{kSPJj_UN38An411;DSr)A(J-urm91m-_ok$9XNOJqtalJPkb)@HF|60iJEK z73kh0@E8VoRDEVxUB4oHhv0wvo%gVC@d83kVx)vFEDyzH$tWm^q{<6NZDRyVD*RE~ z5Q^4zby-P|HNY#03qWbKH_9SuGb3G4LC`f4CN+eS6k9{A>gzlSE;j?c8Ywe@iuSZ6 z$qij;-snm9!Ju?M49N;aUxq)rXnT87z0ppq+mY;!Rw|v&bWgM;yK}>QyPro7qm0nhU4uT^?jM8)Sr+p>(4D6IGSSrt-b}Q5ko-wque1C2!jBz7kuwJ#OvL-}h(=;e%((#xXK%R%AzJQPmM zqqQZ-rg5FtwrCdB-fY5e4r5*c3Knp=v>vSl_0{TNO`D;ORVBNu!)-X$3zEKq`0zg6~Qe#SrjWYnV|Dz#`J%Im%I#+&$zWt2in9hEn1p{g`sa>2FN zU2j@%HS|n%dafRXx;jKt>ACf*TB}qQQhMsXqLCQ~j~vFlC5w-Z+Iado+IK3>wdbB| zy2L1Q^7GAbuP?v-hQ}N!sg$JPx*Pa@`vG?C#_RF@PX`V71U2R*3VPZnC8)ldrb99TrfELavaHFEjtx08;BzQFg>0_s_f*ekCrJ|U-g@U9ELtMy zN#zyt`saVB2V++XyoOu8~Twk=N_w z`F&}+pJTmK0F383JZ6AMz_jxIO*~hv%?(MOXeKzDQhiaK=uP+qp*kx96&bR`K}Zbu 
zK{DZ2n44jS)2ZH0U^C6&{o$z7bA-CgWaMTOdZBRlC(r_@@@R3*^oN2TRh}}exKlNI z#!}hw2S5ozHTuFM*AuQe?(iz|g-?l}fnG$T!`2FTNxex(?n^=1P+HOvw4_vdx$2vz zdOX?MNrYa1t8GC)^or*ep?DrGEp2VVq5@@#a4&XIzYc#;?y0H`8n>(=bz7?SPz>l=Ahkj%H`jhErcc>qF3RzmIyTdFa1*kE# z>eBMmb5A0#I1e-C&Ljvnnz-Vs%Z1^`|PoOwjo6Go~S-f-A@EAbx^i-f11zOe)+A^`;K4o-Me?2bJc)hms_@Mp_2Os zx81(Wly@d56{_R2kzY`R%dWVJU)+PHQ?aeTTej*XY`b|UZXo<@C*`~L+-LgcX?(7_ zsz~W+x%a_GkY8Lvn>ia-u3wK0e6EWwzKr+e{Y)L9W0ZxI-j{^%H-~=0mDi{~&Cvzr zy6ZO>MX56P@>=E@cy2>etLa9g`FR!B5!fwl9X#$(bh~!nW71E}Uf4gOkgFS>eeore zdj7)$pJ2#{k^TE!`F>Uz&?!5Ml8_33*IMYgIOF=wwAcjK*9X5f!@$1!{u^9;#igik zX~e2i))-ZI-3`}c$+9JUPmU2B<0gzVcKM61H2Om0nLndKR2ng*E_33zY4^Ok{Lb)5!SI!NpQ-e zr7ei`qOEmDX}GIVd4gMgvH4(H^}yHyZwyIuMjJudK}d~axk*K#=;AfB zt!N?i>ImN!f^Z1c;qWXk^bl+UXbbPt&GlhfZWx~Jj-F(KlKTY8COt2&)0r$_5`x@r zCJbAsDy0UcO4aaEM_a7^g)P}(Xf23EWqJg%Vgif>&dW$dy*fx&QPI)zIyy!7x9~Em zYU?PW3BBwrLNAO;kH4Be{PIZMN$F`eFxp&~U{|>FbM=U&(xa_+=MRJ%q36cWz>}YW zM}Zf-i+vGT6@%>d+Kqj7UfJXsW$2S;!lgZB$<26oD%icE*({CWAvnV*aF1sO)=RbIsBzk+c!mEOi`)KS^hY^^rHtKU+Ox=nRx+0u-T zt?d}Jts6tO^dcQ zRh!h_h7FF^TPZ!wTPx_Pqp^Uh5jVCWL{L?|;5SlNw45VHjWNJ8O3A7xhV0_>nX^sn z>GhkWz8ugqA$zH1zeSxje-V|>VFNxXJy}~VFTP@(0gh2_TI%`wJ1a{yu2^%5ft^xE zeahz%{I1_D!0H*Cal^*Vgw&@5%?~(j?b(idGS*#v9Tk&OJgGn2muAS>)|<9F&SRW^ z@g*h$%fjW$xvjsCE1r7xc>_?5&3*07zau-p;MlWmzvUK!?;tJoXC@)(l~=FlFN)xJGPf)-DsJ$zJ2G-{M^(LSqjeqJuR(jQwV+z zi|gp0XMrb+t92z%(2zRKhW0Mq!P`nc@bhv0_JKEct%~X7ws!+GN4ozE|(734)?OWT> zOW+N=X)s3KI25Pee-h2@SA^RUg6s&Pclb}xQ<%g91fO6h@M-(|!{6i8zrBL1uDuGg z=FT#9*8Ia~ELpagDr7y?*B4O7V<|n=;|Xl1o_eYoB52{Ip@LF!Qgwo(tc+SnF>*A{J^w<}M^1{)+n;vH0?+ila{x%-n?7egt?Z{}REFRuYpZD?-`jt{ z3?rL8UkxUDHob2*%wM?FG=wNE{bvM`IwpTkunB;dt-G4{a5b&5S8-jSQr%!nN4L4J z3oY);e|>`>QJQ<33-BugG7G(el2RP`{dY`KXKw+W JQHL84}RQ$BkckS8R4?P!x zq6ja)_PWW=BJge9ev<=gjH{{SWP7Co)%QpMdUyyut$0TD*Pd z4tztatWg}FeZikpsYk}=yoMB>g`OkSn4#8NrN!EBU}sgH6sfYtyuWY1f!;gsy-lS@ z=q+1h`sS5YWTLb@iNYiaxdlN;NO49QzjMVEfhegAGOf8z#ROhO94%j*4{B(`>SEk! 
zA>GidmRf?RiwdPTg$gg;1)T|0N(4~{Vb_;NTS^f15^$q=(MkDUm`3$7v&aWivRyDa z(+OjEY-+X>#?gk3X4%QPRCR>alma(QDe}X(TptW(nc=+F5URyNf+JPk*j#_K6HrZ5 zjXi|o2&%lnsjgItgdWx9uuLkyTwf|pFSJw9b@BXR`TiK3Lojo_FWbkV?s%>&ZxfZD zZ78YZbz9<{9cnO%R-F(u4J@gCXvz#hRa%H?3C}79hZ8ZsyTPQ&nNrl%BJ-2@7x##smuh8=t^uik>5Z4)p z#GV8P^oINaz05Hgju@MP?D1L1o|uKKshP-{P8iO}M$YscVt*#^ zjxvt$%i0e+j^^dI+f8FTw8CaGU~sl^Hfd%)6xz>ldaXzvG1vn)NMtn z;D7jmz&>=C$CS>9kmj{?O+AC|t?TI;vbUC9VdJ1(}IrP&Jep$cemz|37 z;34e$_!HBh@Z*tROg)}*?kPU!`d8ooXkaIWCyT52?$A+wR=+U#*%GjVmsI1YpMQ-{ zloFm6`0Il|nc37n=lP!!fCqT517Ce@8ZD&Wl;2M+z$&Mi(KK{Tb$lOxs^d!Qu8hau z^7B7pV5az-p*b{^W%)8m}j|j4T znw5_60m1hEJ|!EV+Utj&>hTm(d8(Hb^xl^8d+%L>?`^8Re_@$Xdc_1^CW=erDN);fEFTokZ z_@x~|yE=-{8An??gdpo81jl4JVPY1cM$j#(^uvm3Kb%}2gvBN9SW)GTQyPP?tkMfh zD!ee6WoHz)Vs1qUrc&MY5K#72%j(7D`3vjXv&cA*2RzIX2&qcGwMr2)N~)Ers7@j<^J7p>yIYo>fXbq5)K)3omb$Ri6L<~BpqH4E0{_q;xcc%} zUa(U}5iVV*@-+6=J&d+C$`c-Oo^a*os6KkG{m}Es7x)N0{xJe70}$2_j@XXae&|v0 z4NgJ|?QQCaRHThcL;4uTSORc-CNd^wAZ;@DPfbJSG%CP;=o!T~uK>A=sQ6aoBKOo> z6rIOkzspNev927I*HobHhFa4~+qj9Ac3USp1iTv^z#D(_XiVHb25av-17G8NQ~z_Q zAACO%e1{3W!?du1-av)-13@PhXu7og^c?~41HL`@t!cGYsGpZ$=TLd-d`yt=AHLuw zKnXY&f&!7?r%)$k?GvC>&O%QalWm%Kfl8rc+P3=yyzi-!1UZG)f8>{2OV#PwYwA80 ztOlMurV$wGrt$;J3w%e8{&H-ez^4%qwy{FcI{4F3{QNuc)6c&V_&*zX$a)HTvbL(r zldV-K@qm($9z05*Q*jDB2R{D_-+uoSt?B24Bti9^GAjRIWH{dnC(sFa+GnAt5g}4~ zQg?!%=IH&3_xqIZNl6eilujW9_nsgpWyj^fbwN)`Z$IO$ciwYA?}N7py?C1jy)gH%!j+#;g*5T2@z#lGT*uBs8St##6yfBlJ#f2*BBGAvm`^1ZOt*;j~&W zoX7Q3>wIu>tq+!#xL|Gpqcjlnt0FOqws;W1HmV}YRMkx zO!q~5syE-C8@g$m+j*>>&{p@EHiEr1-3JY6KB&(OLQ_Eu>hcp%TbzM<{({uksY7!u zfmes}ng*0sR2ik`7ZL~;AKC#yPfCyWR+9m_hY)Px1YWc!+y%WPH)!~oEA75puA2d# z1A1NvsPIQ8?hxp!jB=!+}Lz9p^JQ*n?laZ_#O$d%nMe2l9Bu`F3%9IqO zPt7pU%bTgz+aeS#EJnf7LIb`0(;Z5$dL0!W?QQLKHE7z@gpQj!(S7qE483(2hTk#_ z<8B*+skcwW+&gDr-maOrxV^y@D;LilN3g~NoKz-JYnT6L|$QwVxe zdMc~FU24HQN?@vQmO7>?luSU-6J!*Ep1>pU2}ar$5C^J0?GvB`MLXooju{oaRMsj! 
zT}$0nRDUPPeIqy#L|O`bKm72cDX(kk8d};`-@`|a8nCIAR!c!o>dMh#%lAhhd`|$X zW3bfTw?F)3p!Wk|rzcC6ADfQKKbW-k8vgbL%X~yAe$QtY zl+BVbS5{>UJ|!dFci?lQLO&-|wJlH!dP+v>P^wClpt*VZIrE(TRMO^LLQT;7OxaPs z`^KogeN=E3da}G$^?gj0r}{wKNbxDls0+;J{F}*6OSSF$_yFFQ!s`z~?w$9QXh2<8 z93dFmOfiDvd+&4q`|q;cKdJiufu&2Apr|Mx**S4YNee`IRXigWl{NehHzuN~Jr2!X z@u+W2fRo0@sxz=U4p*tJF3yd>@HX8|#ubK8b@^a2<>AaCU(BW|Sy2~?CA4Fs2)yaE zvvW(hpLTU-juRFYIpMT=Z=6x@fir8}aCUD8`SSxg{Sx#O(H0IVqU#JoZ; z%qjGxI&{Xwbcg*NLvT)|1)fEPIJ4Lt6M4-^xy~3zbvT&N8$}pSFABhn;s8^3IDzn( zRUSaa=#4S7)MIj7F}cJK(+JLS#r_z{=kB3Glhol6 z2|cUy+y%Y(0np3lv3}@z@t4P&zd(MaJ_xB%yZ)F}z&MraSRkdi@*ktKPkO$`?DE4vZGM3(#{HcN-w4Ok|6w;@KRl! z>3B@2z5C$-!bNIOkdYcB_%xp`Ro}-1;zv@8gpwu?-%n6|Mc92zJ1kf#r1&(-L(lLf zA*%XD!A^EqYE*#!`13FDE#LFLPiV(~I&7X(_SjnP{j|mcup=BT!+IYoClDUsXDeVT zG{vN#C)g>zR0rmRf2cbtzfZpS+=O5#wI`(~g(vXI?y8PZeHAVAtc6urb*Cvf74Ltb zS<%#i`BR4~H2Vm~_ugT-e=`1Mpr`33GP7clo*9BT3Y(NPFJ$HfBDXLQRgK{oG$IpS zgR|kJQ4nRJPACrIGQ^>LI%)N~X$dD#RZT7oz~o#{M?D?kG=nyFNtqAE5>hkry|An* z1j{S^v6A+6O_du~l{(?1GAEqd;EMAaU2#=U5H4!*!D$svSX<+POFP4HX;%~h=!f}K zhO-E|m8EVtz0McQdG4w8o;bfd6l-eTF~5N7j}Tl~=!ALs&X|?$gxNVRn3dy<3B3O_ zKFiWdZ!E3$#q>M^GvA5o(GLr2gE6}*5YsCHF|NP|eN?4gRDInkZd7*!9p7Ue)nxR49C{n_&{kJ~x~f9dsGpwcu9@8)w6TnV&~xMGX`$yzI16|V=y|}m#2Z0XvbbT0YKuZlR}5l%V-Y(j7BPJ> zh#3-#Xh!TX!f-U9H=fX&O6W~bMaGOwhhclcL_>Qk=0gNGCcS^lW0&(olFr3JUW@qheL>Ps1O zWplqh1}Z^Iee(n#Z3~RQ^1f1k*1lSGCkP6f*3t@kTB@ARRST`PwHAI-eyZ1#(o=a| z-`?BWXW3gRJi*O&MpkHB^?b(WG6Xu=N!90RU8+d1aKP?cV;!ZSq{hBhy&l2zHOqd% za}E%O`>4*O-jrhg+aIYe3B1nieAgHUI5L1e>Ov6d={v`5Rhcflk+! 
zk~Hw*_NVF=L!g?8j0iqGoB9q4dPc?ZGZDPL;Cqx3)iq3V0|HdaQC3%~Q3_S>L(mia z^nT1bt+K4H=GOg$KVV<+Z>puWxl(@L{2MI$#i3T5SS zNT$e%Pjp9Co=uKtrJczal&;&LvU4hAkL_8 z#R`JyG%l}T*)>JZSWTFnRP2nC3A?p5?l`&J3FkC=Pz}1{^jc?})8dT_JA!atYarIr zDxXqC$QIHX6P9xbz?odGuJXj0Eg@J|Mo0>dMXs1?6e1x=>o}Pza3t-xT6~B1H-hvM zbgJi514c`#2ih~dP*0`Tp6P+X#X;!E@kJd!uqLX#uF7=uHkP2hp~56JP!|@Hk*2l| z#U&+(Pf3Jlptl2hzJwlCo=dPBoK1EWw|;v|_)%e+I=xKV+Z;D|3wQu3o-Cu(p1@?_MjI@pr{&i?Pl`|Ak=pu_cJ{!Rd|!uuGuBp9KYpTp zyvC<2qw+ug@;lY^Z^zW*CscrfsVuJQ4ppxwn=9L`=QA+obLsg6E_G&>&DAsN*#t{# zzXd<7>v>gH;M3C4(4lTO->)@c}AmDZc%* zv<7xu7x=Uk^gbhamDlex0#DhTKmPQyKS9qnUJSInAJUqCOi+IK(Z^JH?-6?HuJUgK zy!WZ}=FFan*cdvcm;hvCgdrm{2(fXlh)?i9dX^t@=q$^sgV5MUdpjiCA3X%XAgUns zk5dck&=ePp&-FIIn?a>CGuIsp_~l$)>WRfvQ!7h6v8>b|6Ipgj5)nvm3pP znme<~1y}b4QssH$x?ushsy7H1G=*Yio-0<<0-r?4tt6lp^So0l+;M($AWkWF#@bpB zoI*u-S{0A8oFI65wToF^*cM2I7-q^Yr2<@4>5e6(&REFjSxCSwAy60RIb#mh-&BHm zMy@LsR0bORJ29V1mG*o>o)<X{ma zZ4y$(Wgu;0CekO-)=tYp&WsG?&(1{g!h8a+5P1uWkh7#1m1j4i`KnHIZR|qNmQM7k z58l>34Bs{iqqmR8q&p^K{sZ%I@{_0Fl9w;TwtwG*+upq$ckbJTCqH=tKM{Il9e#%9 zQvDf6erNo~pv^V)dxW1_bPxUf6TgTA;1Swef^N$;^(S*Ool!N2{l}jU;c&mAQhqku z4^Q1tv`=BXnJ5PKxl>tfD@D8jr)(^??7(vBt2cc3aMMji*Ao;4Q4LSC$L)RX`7S!2 z&-oo;_a(2brJ$#lMOC(|(p~E|v~Bl&CAbnC1H%sf7jSHtNrVIu!Hu?2>hC9l>!_xI zlmh$SVM}#PkolG{J9zY0{78kZbW*aknkz%Cw;G3g1XSIcH+2O=jHd_!aMK&1N%RE8#883B=CX|746ILLUM8d5|eysb%T+e6N2nqe|~qP z(9xX%r@k~QAKJX$cqjBy<*AR|aH_b;c|Mq38i)nu!B|AV&ChqoLPBMAsRvH04#1KE zsy-^c`Gm|V6&|>#Jpfm91>rOTZ8ZUO{g6-ty$hNMtY$A<+8Tok>Z5R4nVV67D+nva z1%%i+^*&fz?S@mS63?u1rb?tWZt%eQjkLOqvua#$Sw{dt8o^}{&TjC+x$Qwzc>c7- zKIZg2GdiONwUwI9tp=rK#R%67Xn`IC zo*SGrH%6p4++%#;7Uc<-2&%hicep2b!#ROAI7w4I`ofzamdbO_bBAZKFZ`-Q;aeRB zpPDd)w#Fi;F$}@{p$KaWLv&9xVh79K#?#)?){aj{D&dzkB@5Zp(~vzg1^M#{KaIjz zN{hRu0+r`BplMwPIyZKpiz=^Yb36Jrbz{WVQJAn}3TECp3u_)(O*?xLHomz9cfPwD z_q=y6_U^kE&mMS|p!)^j7wi`T?PhD*+^xoP35#E)e~ITQT2ivxnZIA z9@U=aGI{6Se-VIhW9sy=h=~hATznuB5`zf5AS5L#0+E{+iK^;klvl)}qAD6rBeQ&| ze7rE2s%;dN&iHIEjLA?Vf)}P2_|YnQVs4=i=JCt9fJ*JOnn0Xd6@Z1@KRcD+DsaX5 
z4gR>2FxoUE8fTR`<20(cOPf7$S(_(qEkRb{g0rb+E~p8|>9n(JE1Yp^u@laqy1TSB z0O!{caCPoDx7rz(&>FAn^vC+H09@JbYqrm=p=GXjWm#{WPB5NP>4b|q{ctX~&uQ|; zS@oV&jILNkb-1#GcDdRca|z!0++ITayR_CDi)*~GpxO(w%3Lvla2(C|sUdReWZh24 zw-VrjUR$aQnv-2o7we3MByTjN2BEVw6+>F9&{XTNx5}?q$Ec}PUcNfSrX?dJITpT& z5d>TS!AAgw(9TBt!zIcW&e1;b%Zfroc`AZR6X1~(#Gvh^+H=jN^3xdHs!;e;g~6*T z1irPQRC>{f=!ir_XE-fxIHC#sm|;@u`Ge0V5Atw79u*(@?T32PLcWQLwTA zd20$$ab7hVu4+W<4UOp7(ny=zOz^c~*tVgVv~w~R-n9VdK6wsqczq-8c;`+$xbI;+ z_#Wef2l4#J&l7h1_EB;C&i_%{8T1{c+B^E|FD3-N!^Y+ggx()YPqFo;8*%vP5d%Ge zNQ%uuYT?3#228S*17Wv)`*x$)HgDc+5&&pun8xL*+(77QUW}V=y2+H$y4|mkZPkbi zb#B(R6`B)8lLYB{y1rEC`fIMnPe%?L$XU<{dRh(ypWU{h?iPZ`@z5I>f{QIP5H?mB z>X>Z^5%i?sGzFxfckt(5@da(TfF|I5t}Z8p-gk$O8sI5R%P-65m;k6z8y0pJcm`sE zRX=ofU%^(-VWDSXW+7*xHxP1m-EIqdpYpl%YzBM;m>{TQ1EFVQAoR49wf%}f{F2*W ze6BS00^uLfdt3IF&{MpB-~g@nr=}4^^Jcui?|ox!KTw??L;p73d;e{WpD+p$k%5F> z7*$>*;^KmlloUn)1|vBo1o`WIwl{jIY{sO!ViIlb1lq5$1mAcC z?cnqr56sGT!+e5oHPzCpQXeWRFU(J;6(vM2qAFY8=7X($p}3gfx|D!AhhRFdR$!x@ zr5#;LK%PR_oK2N?0oB~4RBM;f5?@5vT`E;4kP&VdRy*P94lisO8iGxOgK&M1AFgb5 zXIWQVNr;|CkeyBYdu~0i&85PqLc;Hi8mdSF^<>)X)9Ssjy4n>d6NslY`(h>Ux3bY2 z%eb6T;(`eUUKpD0hE4)+NS+6V5$v5Au4t#qYoXn5PxC=rh96q80?=9*h2iaGXsOfO z8OqeGbnu?I zM8uCtLBhCfq)#g(@Cs0{C?CZuXm1I}6z3<+ON8W!754}a`z55_u`0xe%_B&;VCUpJ%SK?2) zPkqq{KH6H;D8| zb)%5N`+_!1Fw?q}orT+fgWgvJk_DU%3p5M0f$J8010f{4Y3I3+-4)EfRsKA+3=?F6 zik8||%6NfJ2}K1zZQFesv7u1ogzYLL_*m#!OKZE*d+wmMTFlqEl#*)R48s89flnvg0XE#05;MpZyOSbE1R5g zb(=dbB2dp|+4H%)qQjTgIuPg6MxRdTo~r$=emIjbJg?0U=XC~Pb)74g*ScUe;kc-l zDz?N2V+p&Ve6F#j-WXRAh(Wnt=%C$KN9C?ee{^K|p`BoADGVa?^3c^(g4(J|R9Drb zs3J#yMwklxgQl!jVlwznX!uLqgEok;6xNA}PT zq;^%nw>TC~St0N!i9eV6ca(aM zYVY?W1fGIcSI|?3<)07Zr=wJR$3aiAaq}hv7j;EZ_YkFp60`)S#fujkdn>3}Fey}4 zP!iB|Tn!EuZUg7acT z&{$jnPs5_L&%(~y+do53`&Ir#A#ge%C{^{Jpr@svr15>*Y%9*(>m{^XE}8ASJSOVjV~5@7I+qP7J37LcOvNNo+pBy>h#p$ zA<(H`pTa^5)iGq4iDlMP^PIoMdUoWx8Q_ zyrZw&RDSVi7YATYNf2fi1!8__2xjK_U~aw-78UtoIu+Z39A_*fEKZ_Dy>5^@ZW``^ zJ4gFtdyg}=brJ?NN~c&dKzG^)%CTD%EBUo0hX=Muh)xNmW_C#DkG<8ob0rj`-hKdOXk 
zH(PUkP~oNeqAAe>EmXnn`GjDR4~8_;;&zmwxlXpWfzYc%b#0^Rs9aLlfTY4=WYg+q zH#8!tss?eizKON3xGxGIkh}x*FhDnUC1+MzpM$hvDa* zjA7@l!I+CrLBsM%NEuU)ptekeb!8!ZP!1vn=OANh4eC!GjgjlupzFegs62TD3YN8@ zXk{}>PHI8f=^dy!e-PTP9*xe8W6`x`6gsyK!;o7>V$7Z6G4;L~Sp48pT=>L=*#6g> zaL>Q*!ToPPK$Z6h9{9%tc<`J05zKeo-Xx~J};PRa`Ha|}J}ulHkE&{^nNz?}$svbQ!F!twU@M9?$!dxV~Vr|v0= zeYCFXd#A9#GsDif?EoL`tDq?0$>OSRk0AU|Do@aR{{yqFc{kqw;4O?9JCu-fXSgz4 z2tH3lM*ATmifS*xA1Ub}h)eQ8RGb@}CT4jMbe@===ZQ%fgdT0|%zSUmEh6{|{V|bH zoS5p2#ic=5Ru+iaRC;Q>SeEC6i(6c=Ww<-;80U$5#`xea!f)?5AKX6L1NTgi!1ghr zxU$2KYRnZ^w76i~V3rvbg1bkC;2J8s4J~fiPT1W%$Okv~dSY{%6Ydz{i=BhraoaF2 z+&v}$w+y9%>>ym53Da&ub8G~zZ*gW>S99*p;bDYs1g`E1zRW!69n~+N_uS7c4UvPFlg7bL2_jJ{(UrrbLlOCMc{i~n*7ZhGS;-1YWdxaaMAaPM39 zVeh~8V$Z*Kfv+bMMbyqOh4vY+80HFV)Wp zY66a2ly|=<1*F9vvm110t z7e{_RdaNNskW-7Zg_{j+OARW_F@E6(0?#HFHS7JM`DQKjEO-V&&w|jXK^{BSAffuX zeoHI6YgApoinHob(E3qbeE9b{NRX@Ju`;N9e~7C12STkszL5%)V&ru-9L;W9=vj~s zRCfbapWSD}*jED1wAym{QGX+bwsqV>PoO&u*N1_7sGWeSJFBlP(9RA}8X z&h`^yg&ivIL+mFA9YOegNcesDzG=kxVBfzmc6=WKf}9Z+=|M#ofbb|!BqRqSI^G9i z(QZgg^GAAaFw$}Y2)ztv%%II%T1jP>=Z@)wnqpce;a2RAF$qo>pGbAY^#xQt^K)Ej zZC!9yttYmPj>OHQys>Mn7j7TyhI>c*;jtOvxOZ|8c2AAM&T-K=r_vRt6F^tB5^%!; zu&vV_w+`~7D)b@Pg0QoXAf%ex-tB_BM)+Xw*Z}P4al-Yq$v06oZfPg*8W~O89^{3` z=Vsu>Zi0{yyt&K6T;t|pA-JB3?~*2WoKLl=hL3ZpEYGfx{dJ)Q_raMpURYi3fmId0 zSWy*#rHn;2{+L|kgfRt-Vk*5%FSI21p)t-M^|794PNnk8cSQ@|Z&Q&UMt7B>x3LK= zH7%&AszYsEBPwefP}$Io+=^<%<`kf?xg7~*)d(*vM|f!s;_KUyGiWr5M^8iUut~JG z6Hzp7Cd#JHN8W^4NF6x^DWj(&b27JQFGt0))6jAD#Tb3nwHR{oWf*+%MHs#AN=)6f z0kdx0f|=X5Vmf2;wk?=_>kfkNX3V*D6DDoB0^>JcjQP86z}jbaW6k5Y;=I@H!`ZLj zg+-5DiSfHu5Poy1^yXpd<129fa~ELK-#20ByE`3SRo=N1cf7R=xBvS#JVfQE1fdps zzy0)<@ zLk$xvR;>6_d5zkT(zD7^0JZnEb%J`vnX_jZiz|>x-3ffMwgc<+Y*^sgP+vZU`t8|p z>~p|S5VX(}qy#(*Jps?c&xQevfG0o^OoE-QVB z>2#(gcBAEuqQdjSZM4<9sS0Q1wJ&pifmQg~8#vby?`X`jIMpMa-*rX%xlVL@mWpE@xsPo0$2 zK=|3ERe##H&{4yLx}_*&YmWhsDo=HBN+j^jcRv`QX{5}rgr3@)4*&A20p1U^yNW|c z)iuVkZQ*BN*AKw&q#_42W+(<)T?;(hIB`7m1Uf-ap>3o72t3n>L8YhRZnhCajS&Le 
zF>7nU_X&abXZTt5=jfB?7^^Gb$=ZHI*a>(xRKI7`-u`z@KfMv7x{Z}h&x%BITM34Z zY({lmHj+|WtjqT`l#ht^wa3i57^|pJ23!a-Bi5KR_;Muv+cxZAk z?i}fjTZVYy657?PsQ_;v+-~W0!o6d>4HR$Z@f(M@V>8v}?wL_|U~v*Qjq<>C!)U+9 z1>pJ-fw;Kc$I;=LN^dbiwnlvoE4(nL)D`3Sp2rsYpgYwQ_0gKt$R9O?pC&+R;txP` zHkDqU51I<1F`}grg9x?y%1Sgg)H@~$@1KM`ySNM)02D#%zGaojZD>VWeJf(Cnh;UR zsBS|-+fd{VpNjH{3s5s_C2|P7vX)2~3?+V!YDa~-O=Y`$Ov>dsz| zrt_~w<9XMh?cy8IeBlNR-*5+}-~JFL+_VeRcHV_4H{Fg=*KWqJE3d`mO*df{x2JC2 ziPiT#hKpW)4Hv%p0@lC#0@ggc2@~&JhWU>zCitdd;p6kM^r@v-^2Aase_{oeJ-i%C z?pum^ch19wk6eiF@dH8kD}Mg%7Y217m!{QL-Bkp_-!&{vV{wldAZlGBF*a{jreX(} z1Q(665VWM$WIL_WTC`}9fs*=J_1VL+AZR@VqWmnZ$VXv*FRfK|?Lh8?!6;Pje z;)!GSR@PY8(sSsZ->{5@*w1`FwgE#{*MjapLXXdPJn(D`1fNn+eaUq}&qB_E&e~Lk z)-7xVA|)-5f>Yl;wdx9dDkI=2zWtu>gG%l2&-yz6M}Pa>K<_7R3w8>-emwXbA>cXe zvaW4I&tPF^mjc;|pl6kywYL^}8n!0r33SIp&xQkZe}4L~$8}lgd1?EZ+&IS5i6g*g3VnIIqSVn})>TwsG;evCkiy zI@}!4o1B142o=HX@=7OM$~Y~{31{Rw;lgrPT-)k}ox>xrV{jxkwR+?7awl3`C)_t7 z052>`z-ud#asPM^Dn=*VJ=_UTP7lQ6Qv<_>35w|6ZS;-0adyr(zz zjPt-Yr8i6;(8FwxY$5vkFl3opM$YEXh?QOeX0{`Q{7RK?1!%M0t|0!MN@4RYN{(x zTT^YoS4qW}Us_JJSB$Kh24pq1Bd)p$q2&#TsBK4L`!Hk;nS|o8b7*ZBA$RILWKN%l zf<>!QbNU6SJL?J*pK=ijPPrImXI+J+%QmBdpli5z1Dd(sb@dLkTyY}?Z@ddLc0Yxw zx8I8?gx+MrZ~Uem7<1iLOxtn`7TmfAi*COUi+4SUbN})x*3#l#_4Z3R=Z)Ji;^sM+ zbk8JA*fSE7ADW2C4^P3kz2h-<_c)BceJn=p7>$uPj=-9G)__RBQ7Sq0&2xZH!_a;u z@EBbG>F8l&XQl9LQUOisymRNx1~Lk%Dgj1aPkyHTdiCX3;NnC;@tEwb0QEa9nk}zT zcNK+>30yi~&=d>>N?T3<)v*gNywF%zK`%c)-!xL_T*1vMJ%!Fyi>|$&mCnVwNMOnCaQaB;zO(2Ei z=0V=LnpX09LP@Q#Tf2O58NqiUA$Sh2c{M?`ZIB<<*Sio}E>wI@cw$xro?8%ur)GuV zxp`rDWD)^3)CrGFbiet_W@8)TYg-Ur-Hf=FK}hc#i|pZ(Q9OPQiYL!U@!Z8IT(le|D^Eq~ z$!8;H#hFN6bSg4epNDcPyXJK_q3s%iZvA$&U%L}Sw(UXhrn}L%buT90{UjDY`aBjr z_%s&meH^n1xhdOj!}J?(!@}G4VCkI?V9A{iVc9(oW6dLvVeN~L;>=fX!LaSKFv@zM(~9)sSJB})vTlu$rW61c27v^G>W zS9VpE*n)}dt;$Ims*%Fl*byT}m}_YN^5x4-Ssk;jw~9f72ATT5E3drLSm*0+*nrJj zw-~@#*h%TxzIw-lZeV=-9e)xG^u9g@J{!m2N6^{UU4id-m8U*)f|=H>y%lI=H>K`O zzde>wIRWzf9|&?OJwi|M!;iGEKmCluNBG$v75IMV--rs4p%hmdGi_3XafiC9*zPL! 
z*pJ+{_EsV2THpzU_IU8V1|W*ii;3|@LV_Rn z(ca3=t}1cI*#z8`gxlImSDaVtfvda1OkCFFYe08xX8r2D&^87%&!gz9uJD!;0g%{>U;4gEcanERfJUTs|R+kpJo9c3e zJN8WUp;Gjt#r476qh0XG9F|$;kH=37#Iu)0qqXn1*;?w59~1HPs(&83Aa`3PMw6Fq(6s z&{toG-u4DmSCykqlcm?v;%eTEnmVKvmLR#b5;?7%NN(;xcug~6T6&PwGYV-#C!mOy zws`6SN5zvbC9#@EM%-a8`)HP#pkX^-4$EWzTsB1T)P9U*WH4_H|@oU z+aAV{oeyHO}Apst-G-3&bu*d$E{enYY!Ihz7MM& zc@#?@x)Tc@xekN2&BV}KN1*rC4s_ktfzCTR(Xz7zEjwD!xTOgV*EgW?>ITfcZ4SO7 z_73-t!uUh!{X*!`-ZFk5;FKHhH`VJAc#7X?c?CTS8rA3N_0y01qEh|5{@2%zK~G&s z^pYYwzLCK;NN8J)8@7AOK&AKBzy8(0MV9rgx85@8=Cxnfy5o*J%>IERKoqjN0;5re z{gYkG&I)9LpF#~AwsGS>g`oGn0iU4vKf!0OA-F>)^*Zn}y`@#%K zn>2AO!o&RuIDf>(_#-)qz>A^E3wJ|wq&wl~fusa4q$T^nX%Q9A$|4t>TIzyxYCLd$ ztvl9t1meaKakybn7_MmY#FdTixUS6$>j*5>sogrn1JBG3z-!B*aW7%Csl^GKXiK-! zV(uE_i>K$O;E5T@xV_H@4^0Th1GKQW40gdS!@aS&#{<{5InkO1;Qn#0xQB}F{xME? zc9s`jUmT3r7X{$e`M!8!k`o>p>x8GL`{S`Gfw*h9C!U&}ggb@?;Cv~^CW3RY8=u7; zH>jSEO7elJUiiyNp?L1R2)uYrEDk(Whj;c?a&H4+nxyfC&j2%YKP1f3u1V!aIXI?_YXnH7fi>`=7jgrY8;5G+kXZ)ZJf>#C^o zXmhFbs%tb!cpb9KDiKfcWfOWCor4hF)Q;GWKE!tqM?&vtD5W~ZQO;CyB^1|+aJS_TOLN=cG}xL zPZNBvW7(rGVBX#LVg6lr6LfcB;azuN*}eA}mAB&IM|k`m%zJPH8rDxh=Z!o8#fRt&IVReR^Rgr^5elDG@cF5KJyoNx_cPtsh^%~udJ-B=|IqFY3uoPOk;0l zSv877<#bFy(>_~Y_}F8Q9V;&YzWL^xri{Jbz^*GM)WXc?wD#2wN0Z9a2o2j|*~Y+a z3%&1^Mdn!iXXvRF_uC&18O3Imn}wYXfljbdNP!)TelW_`3V1*Ma0EXcascm$KqtWU zFMlTR6oMW(_dk@Lg`YwyQqUU+I14sG@OT*7V+MrBLGPH|?T4PRxz^SScnsTsA>cWd z2M9Z*igJv`5cC|Kl^u0?>W(Dn*)Y&kPCb>|uOKL8b9wB858lDdnG=^2W`=&?Y!5Oi*XKWZAo)CvS zy19?Ax@V}fX{5NGpjywRMqE6Bpl^P+`y#Y^-3@V8~bczK>Do}cNAzs~o@)03U> z;%r~MI5!y2Ob^DLgI#gkAP?;9@xmpgPPm|q%Cg5B4^1Tq2m4_&p?4FN=KXWM@X|%G z`1?(n`1rAUyz^iUUcI#(cb%DtYsUNIEUHX(p*dByyUEWaK3!fPh`Dtk7+)TQLAkzY zOV{M$zGz7fL}x}gy0fFuEUQb!SEW>3#W5JvRgZ?IIx{S-y1ELLRW*cOJqj7I`9(;o zYeYsjVb{@v_@1GN=^lpYu3^X+ITgvnryz0k48)C{gOuqjP`KheRIXi*%5yiM^qdV; zdN&Yyo6vmi?F8SgXt?GUG+loOM(lc$@f7-YK8oSDJ%urMJcEUgzK+vh{5MuU_6%m- zaXaSh+KH79-h&j7bRS60&DMabT#k9(k4D^nep=W}g1AJ<|J^Y&z4;(etR(4k46Y#XIIC}IjVfM2D znxOP6zlFd3LMz2D$^#O1a+>q_0lK`flW%!*6m5*XXZ_JrH2E`?DC@PNP 
zk6svZa{Q5&>W+jsCnPd5Qk_v;;DhG+C^(&7?TfRjy>L}~Fs|(i#?7PRv2jo+u4?wg z4~^&R1oeQ?o0LNhZpDg;rUs9*gM=0J3770u_xvx;o+$<*xBWR zom6vs396?i`r)}5VR&X{tm%(;U8@VW_PXPxWeIqgz}q|82`>}U@#U4s=;%glODCe+dJ#s88{XQ7%n_53GIWyZsuDkXHr3url&`)B)oZUt=^5)$ zbmlcEJAV@`?Jf+t`5_FwS)G$GF|6Bj@}Y^YU;M4Fs-$}5ljrzVoO*kjt~?& z{+lfDQ63}oEXYViqx3BB4*l?>spAv8)QDko=m|!Glh*CFV5X&Qe6YY0)U;1P6Eqb% zZp&EPtFkJmdOur6KoTIo=P{*|`ccPKo=e47EUTrC+cFBfZW=KR*ev+|90P$j5PE7c zweXSZQ)7g>o_x(?vc4K+p%&Hee>!4DT4*$d8F``oQi+7$4+PW?S~3n1U`Kepp9wX^ zQK~m3ef@>@RWYzpLyZ?|$dFBTz>N14^!i77eD=-v_~a{ETRx9TcV*H`sVmF(1fHNL zi!0zs4RRmX4-ka=HOhj}Gowftw!4aBIRJVVbp6oN92WxJ#{}NT2dMN2JcXdAK9&Qa z=U5l;wA3+&8r--41AP4HK3?k`UgO`GF=H4?3B9U{cr?~0qPij+_0~Xmtk%;h`CccyLMt zK^216mM7xPHPLu+aR}}k>x+jcMdIn133z2;27wigd-@2jDNcB2br@cp;m>udxP@uB zcYHY2q7UvGAB;z5hv4t0CF8jz{&;S&H(p;8j8~TX;GZW4;-hoo@ZlLL_~+7C>?OqR z9qx=fdYrM9D)9~~%I6nl;-+>_(>G9ES9Xu0DqI|f$7lIq&je@eo#KX9&q%~8XQkkg z6_L1mUKn;vjlkty9t>ZcS?7*bRbH4^tbBpWDC2=~d77Zy-;pXmJ%}nV0L`?wT{%(c z%#A`zRtOrY^y(>(oAN@@Tc3-uLp%84mYFfQwGHhkt7&F5qKH;EHa7=(ZS9DwZz1@4 z5Y^I$xb|U4?i`8Ko-s%pHH{W`4svI$K*`cGQLyYRWG_7%g{NMInu|B1VciZ?U$Gs% z+aEOQu7}IsosVGT?q@ND;G4GhMa;hMC9Hhg_Z{J(_?Gbaj(|Bxm|3+Y*g49m z0Y%``s0?mX0e{c;t`QcRyyqMKK-iHL%GRu`%}RT(_=!IvKe3$Rus%C}{wBLM|7PmH z$3Gci_cQOu>#3tNLuyj$O@LDtXT?|A$LIW<&!nZ2l4{-zB{wyRNxx7-hJbg_ES0cS z8J0i%lq#i!fKZVqK2<;8*^rkM}B4$DhguIT9bs@k`UCD1fZin61}bQ=x&ZfcS|C=TT|e4X^S7$wfSIO zi@UM0w~vj(w!vY9T`+Fw3B@gAXlsXuq?{d1!5;zBn(JHrbv&I-Z{jK9#{K0GcMx6|I<+UHL2d1F_fFZNJn-a{q$ zz(nW%=Ly2obA#~Y94f>qezqZHgZ%;@zqAJW-qOkDjVDj2+a7milU1TulJp zf{L0JGeV=B&`Zk6Ms{5-(wjSw(A)!cS&3~PjKpSIUPj80@kk##0|hgeqIkhc$eVu> zvKE|*(o-)%)1_Nbb>T+jp1BTnSM5a4O%I@L(>-XU`fKHK$juL8>RnG`-rnahd(YEY z_`qK<)0`<{C-Z_gggrQ(}@>n#{`)n({ecLBPuJslJ7Itx|nha&g#GGt$oi>ynt zk#=D^63s5Vs@}gD>s{ztT#u;^_I>AHNHFzdA-|{NF%tAlz*H{{p@L zlNboU<3k`DxPGD-2tB*~KcT1n>ib7K%+UNB$6zMd^|!PdYpNO0WG@LCK~DfZ8!vA735-iJkV`CZD4M@d!`>WX4eTNH}gLVq-r2B4uV5Ot-&s4ovkV`VIy zE~RW(htx(z1DXl79XQcFva)!c@prcR`G3`1OFFXHRF5L@4g 
zl-^NDA2}I06R7s4EJW7SCCHt75~@zQ7_ArGfSPl!L(%D1qVAHdXjk3dRXb33`Aul# zvitg77`bUT!S*nw-Sz+$-uomb?x3=}b1%kk+kv@v-Hi!1ZXxi_$KZ`;W9UsMqv84q z$iA2mzAPWf7pEclLaMy;k`cQ$7IA09BKG7M#4L|Q%z|+AUDyqN<>gErO3%WN3PN>w zztZ+PEbp)XSKu2M|LK^8;6Uh|2!6+J|8Fr6dMAp3z_XA$e%k`iv954H&w|fPJftPV zLQjnkf}Ux8{lQU3XOa;Rbl=MQ3V2#-l!Tzi7&v6?c<5>VUZuX51sC)LI|Dp^_J{eI zngp->J6V-yfJaa|>~epDi2%q@jy{d?wUZPHhO)S_x3aUcwE~>YuB>fMIjMOxP6Ry% z@IF75Sm1cr*%0(pcW2>tqA+mlS9Sx#0X@R+6IomhMcao@xxQ@Cd=%%VAulZoMHyiz z&kaFUK`^Qc15uIh$7K*|O2P@h7&u+h;EY?wM&N}tMR@J>QbI2dS2Q{q@ZCEz4R=gP zz~-I+T-WT59lZgxlwP>K*M-W>1#h1gg||+R#H-5#aNh(%hQPaXm=m6x7mRD{|IM1H!T3KE(pbgW4Jxq8IMiz!h>qSSQ3N>CU{`) zXjeQvD-gSe5|$lK*xu%X>ua5GYmXaVUX*}m=Y-=9TIn5KPS`}a?II+fqpEy)c^dW* ziuVwd+XlN}J@0un?eTdHnxn%9i*mg%zd*CQ`D0#LD5h&#Nm|=s>AvVr_QuehFmz`G zV^nz(h8D+RXlVl4v%^pw?}euHKr=UPWeU|^O%BEl=|n?yCCW=H9hq0E)ls<$MU~}9 zD#%A#Z9Nj}T9Dk*i=>u8NFeYM8oQ9zJp!qNMkAGqFKxsWWYX#u&0T}qH5Z_1?Nz8h zYdz}Dxfbo0Z9^07t{~WS$riLJp4`sxBm@#wyy4Y)%Ty`M4&r9P6WH- zV<6}hcAr3J!Dpdof%hl1*B^r3_p-HwoR-G!_5;w`TMIqY&>^rr7OyM|z@t+LIfChdF>ZK*mh;60w7bhf@y`|Ec>knW z{Ch#1Zzs=!!&)sz{>Zi$Z;pFWRy~jp{3jbw^uC z5+)DpL}w%IZFxB=%av=7q1N2GYNQtCA+D$xiB_v$3oi{fNIz{aDV_+U%wAO#_}c0P*+ujvZ6fXq{kyGH3H>@38QXSiItdeM(=eQBuQe?Q z&8bv+RD8uzE~v{2$Jm|*4C`t}V+~6~tr&6jjTm-Zq@SLGkaXE-vn~t!x z(Fi#^0>Nt;r-UP9bvS~Thah-KFoNa+F8$N$45y{cfPA<*!_EwCMf; zy#EG1#X!g%zu$t+9=FgN7$*YY!1zCb-tpi&(Y_OfEpt5dG^^VGfZl*Gs*EtvWZ_!3 zs?P$@g3-d!l;cv6JErm!0@QKWR+ww*zC7nA!b=So3UyJj*^zB`7_|;Np!Y2_QITd$ zJIHhXl-PjRv7MqV^aMK7MWr8j7J7o4wYA6Zw~(_-Ga-cKV@OtX!x9?*tUAzqSbX8US4i**UATuov`PuO(A^0jw5>Ztgk1`6E zvOGd>$FM-$JR}eg&Pc=V330e-a3F3Q6^UC%Mqz7j5Y{!iap{lyro`ZZX<>MNSsebk zHVw}%^20k+d2e6niMK9t!#~dR!#~am!;7nf@c3LG0xt;9E(pc}DvJY`C1T%233&Is z7`%O6INmtT5C1$P7|+airW*9eU*`JZj$T5q&jnA-2*E#2&cvS4-h`MFo|+$K8YOP- zamDs7Dz-}X6Z9}0mY<&KNAMD^GySn^I6+A@x24q`Tiboikhp7Td2gQ(iLE2UaAC6# z&Zzar8TC<^Lv<(U&0)+f4#tcEe@rd#!{nj>OfC(^g4PtwYfZz9rW8zVU^J#BSR`r6RiP=lIE2lT3{t5HVFTVO11AyO);klECRr35L4BHqz2kts=bU}LT}I*qz;{k+zE40I%_4W7M+g#S*uX8;8e7ndI^SId;^AF 
zv;m!GUXJdwu0YS)OEKhp!tkQ2F!IXlsratPi1k}A?uMNhyXi($op~~9&tHtRHC>2W zo{#X85)rvJ7CvkI;CFHW{FVnGU`Y`C76ib59%H&c{3rP$WHO<*R?z#=0lj|kseaEu zkJ9hg-&MEgi2qaYohU5m2F5_O_x}(0DJ337I6!@|*?XM#|Zl9QrSmlSnRaVWjt4D>AQY-k9afcN=VN@exE>7ys` z{h)EU>bk<~33#@jo>ZQej`RM2o=tHt$Z1_!WIp>!O3z_!ZCGWe_~dg(Xc8vHKfXUZZe@qOE^9nyT(Kidj7a`Tomq^l!y&&-nh2e3%3l5#O`r1xOJE>o?e)M zzn+qcmrshsXB)Ec+2%-mz9k%=Z%V{_m&M|lrCxY=hC3~$6W&-IkFT%E#^+Zj;iD@P z@Q?F@@XQ(~ymFQco;}40FQ4Is*VhK&=|$dzoEvrzbH~%uL-F1j1$bk5CN?!W;bvOa zZM43(5BI^NGt+SQ=vZuOcE%PSyL%Y#u__Xe&+syqc-L@WY;N(O((}Nkb}wvb_r#6E zLUC2QFV3j+#EK#hT3bKN%k#qG(hw{xVU&epURf~aRnqd-MPgBN98MmZi}R<}Vdc;Q z%l zwxk3pWfjP8?>0$Am0vH0DlewG74dbQrk*cd_IB`iqz{{9)L!ny1t=!?3TCcE$-I+L zvFJ23pL78_&$tvFr(T5KwU=S&c~@ZA1y>S=>oAxx^s?*Fcj>j5xM?SbU%dhKXP=Ii zOV*(3f~g3dTR=6J1ivLA2s$YU-pjn;%O5TO1%dFL?GHaHJf8_Z@EPNcfYG$@C$~`N z9dZDVLEbqO-?8}p{~3A%%l$X#{l6$a8z%za@i7o+ivI>Yd;Cvzd;QQG2(^JwIv#Se zqCfnoecJv5U^*_9XP`F#Zg%~I!Q<+`r~83KY^aMzxYP#QA+O*=o#Qq)qTSFl+Zh23;>?BwAQ*>wKq`R*=-Fu zQwVwr+1yV){Txe|twd>g6-p~8I%;cCQB{u2tW@OYrl6&%41;@XF}SM&olW_0dT3@e zo>-8I7giPEftgf#L)~%T4DF-U9pr}{eG%9-A{n<0ji=fQ$Bsc%atjji*K-na^xisr zb9)-T*q(r|wq@hpOX6vDqGJU$-cO2j1mom;NIchczi-QUR{ul-9!Cw>tI@6Lhr%pk+^?a4E9c; z-5p8gN4tA7t?X;dL-FV|XWTzgt+%nbwU2hZ)dx5A1Y=8g5U%eG!s$gy9q)x@1#VbT z?uSJszL=Bejm71mSX>c?1r>x}br|MU24PlJFc!AN;G*d@IB#k#Ru3syfC z(uMd&DzDlO#ME^ly0#s$O+B=?qme_kS--sz&Zbr*F=b`q@Q_yqe*=WCF z6*5=#B5+m)0_H^{a0vl7-yPnwyx=p#2Y%E1;5*S5{$mNh5q|LQ^F-U4R_eT;jzQ1F z0O(2esbjIgrKKPyU}@cM>m^tSj5;L-z^C%6)3bn7qlT91mZJJT?N^E_Ep;7TPuo8VYHAriq_k0L zrR`5jz_7N~!p^QAQ+oYjIw!dh5GRXAmR1B~Syea|Q{l}n_QRAscT6et z#Ny^CoHeEZr;p0RNyBrouqzFdYhy61Fc@vhw-@J%`UFqZBzmJPj^IlQL_=W;Mh$95 zQ(d)zUTKw}*MOp`I^eZ|+2L+h8QN4>hohB={oh2|{is^^Qf#(8k)UP}pt=w)u>tYPL_)3hp z>S{EvJrnh3oQ&#Im!a&WImll%8u1IN;m02gk0}J+3>SD!@qpI^FL;mhhWAKs`0)qE zug@1g-Ck&0*-D*v*rD+H!`AOPfX5AO_bWXu^|C+``0TcQsf+-p;}(Q^X{r78T7rup zXZH&@+Sbb+(})WTBi&17?KuLIJx>4=;Iv=mZRibhE&V*7$1I6{jkzzJA_lg8X0{aO16B$g5Ilxj*7iYq^%MssKch=ry=W|f|nUlSnT 
z5Nv{;8ao6#3&o#R|Hgd+%XdFWo5QKKSdBaQuA*p*KNeY+dlrbNunnRpI#Jwp9H3Oc9PgSBT$VuLJ(tjE`N!uBL1~H z5w9%`Hw`2&)4Dx5I{^2L^2Tkn=$l*oaZ`6Ru5Sy&)qMW5YiMr^ov^gX9rN=&v5c^r zU!uv(gD|f=fY5Wp)FMx;?1;yCw<*J3Md1Q*`-~#DoroJM&}7Qg3>^k2{?M$b9G!`v)dL-x}Kg*>lSu;4FsA!SJ&5h z7Lqzgp6&jxTdMU@@qrfcXNJ4W~r*T1An`sLT(xh^OY za=eZjLj=7;grC%&>C(buDkG>0fKrA(@m@btCCbh!?0DYeq4%fYxeBGZ(g=_*6?~S@ z`AnY>$coPhtEp*1jt3yS6{&w%{vuXMy(_k0}H>jdiwRp=T}aK!tZa z^z6QY&^rIziU5`2}FD5s^f>Wyev9#D9E2|?glNNUl*O%5tU_Pzwf|?Ml>P*Dyo>Z(Jl#WI1Ntjw2g^5*> zm{1*q(Pfe7%?U(%x*uv2JW&?yhN37plq3eAtG*b+`#Q}y+)}}=rjg5fRMjb?aw!rE zijmpWj=1I?#8KHLwhuuJ)m~I%FJjw0v9S{&qmtaMjv?9d!S`rlSww9mxZ2L4uGCsf{*~B zmw+WW=w+7{VuFue+AnbECFtm-^HomQ5;O%OZEL>;tAL^Mx{d{vg`}>h^XxHuExiO8 z?bCf#UZL{ZC-~^PdTHGPQ}?k7P=M6+bRC_meOl_c6di?^X+0H`q{NJ}J3`e)=$Rx1 zyq>P9^K=b8x9+KHNCC{9b+i2t%npV{7{rq12TUg4&EJ z7JQ}?vaWd?^uAK?d`IS&@9{04>1*2S=_r1$EK&?FLN_YpSiy* zjKuRR0`Su5KKT1pq1bn00=~N|ANw{W;FAs6cynzu_7ZeYPIJc_%Y*Rv6em1P2tF~# z1<$PT#DUG}IQ&2%KHM3D{kNv!z^%DBcvmG3-cpNK)+Auh7%$vE&JV9H%_i95u(8!0 zJBRtnoga zO^qwIbOoBU_Sf|W;j9`rT-+Xuvs)5yMr$%=7x-aLQ2?PAg#}e%SXdK*<*kWW)D(v$ zZ7GCaDrVHjV_bOz#*{~4R9Q6o@NGGa2PgLc`Vpwkr>iO$ZQB#jH zg0Gw*yQ^QFq}+VuG_@hVxtq`%g5-{&NNDXN_z1n$!HDY^j>KLnvJq2F3vLRR$-^fj zb;J~8jhT*;Df7{>{4`WfUx4C?Gf**oCMsskLh-a&$elEUc6bizR;)qMym=^HG#9xu zCm?(J2qaHvL*(!h1P@I`;IJt8^7HiQqr&TPgG-wW+?uKM>fGT~O`BWk1&<0hG)%7t zjUMUtXXqK$>3@u)R5kWx?W|s!3&ZBevrDCU7i_F5v{19#dI?n87NE4xt_v7?X>kC*aXF+54*|L3oeI|*hy@&SMX(0tTEd?YkbxhaMTpsp)=$OuvMLo#t zYm|gqSC9Vut5G|Lexeosh4=rF>hafK4D@u4uB`+GhX})i%GAR3-S_O_HC>L?)fr{? 
z&G-0)0Q;W6JE(3Yj30l{bNpx&o~`pcWQLc~-V$_52CA^ovooLBFq4uKcwZ52N}Xl0 z$MBiHWqifw`-0CVg=c|hR2)O=g5ELsX`k$?>}|io6YTW9tQP!&j`H#f z)Jpx8Y(_f}xvwaGYeZ8qLGJsOYC^2Xm*Md62Q3-Qhwk$7gd z7ti&>(~EuZj|-v*$Y}gyT_B!3%^mMvlY#x$<>38GvvKbPFWfrF8TU^JHn82;?u)CN zJk6xvo4Vbwsm&R?hWg^(kzROZeiYtV?C9Y9&y|sQY?3pTpF19&=s_TQP(m*TvrB?8lQF+02D2(7F}o%X z3tQ4Kt1$(mi^DNEFBpSzgQ@g_(UqkXRH5j|2tZ|wGpbX3P?jEw!JTzzX=&oGO)bhQ zYEW5Ki;B`xRPmQ5)l3H7h_nvb+u=wd_!66Y5z|O}+epPn_(iu4L4x}14V{R@zHvwy zHi7Ge*QhBdn>-hdbC#lf;!G5eos5$46Hqc?A_~S&MDF-W$e%Kcs&653rcOuJlnF=~ zKLlwLJCQi93gN>t;6Er5K0N{O+~Lk270*UbcvgGDtIP|Y#h!32bVluj zY9fzdA?buOLGKvW`k|+n0APWqb;Zt|I}IQ#tn`mr+sdLW2&S;+^}(z zfu3L{XvoGMA-KNziWZpA`#~L;xs>9Q(v!+NW^ZMq2|^7m)0oOthCK}%>=qU!OI)%0kG%ZABjTyH7&W7FolIJ+$OVik66`!fg zQ$5}Rsyu#<2M9bt@6TzY{-@Qo+XMI6_+X!6Ki++x%I`zP`{)_eg@~vyWMrkAE-O`4 z)u^msR92(9TA9tuP*RqItlR`R-8(A>4=t?1woxhAL?v+ZxL`cIC?5Yjw-|p}6i2%n zg2$&t;K^Cxcy4hREo>}4z9t@@ZjQt!o5S$^&P@F4qA>jR%wVd(Ks+@s5U(tY#9L=X z;LSCGc$yG=ePs;ZJ1-5NUY&yvu8hX(XZz#Pg&ufubrhan7=YbloUnx|PLl)OGcM4y z&~EAp#HIDFxV*^;>lin*I1^@Zczt0So|)u_$Ehg)N}K)8DFooMU_3qD2lotf#_l1W zcxqal>AR_xc4Z@PLP|Pk2!orG3EUSyhtm05iFAT=4 z@<>cCjl|TlD9ox!!sN<$45Pa1&hSSkqboDe0IxeU04>R$s84V~U5Yo#QUcLQi#wPf zzUuYL`GHqg)}W@m6g5?4$S%xBLQyfYx@k8%X>VzB6B;`a*U(Aub(#9TNCGgbc@SdS zh9ItsU>rOase{KLYuI=LyOJ?eP&{fPibjk@!SLb8A3hAZBSs)^>^Kxno`IYR(~vn~ z3X(>RK>WxK#E-5)^r&ov42>f2sPH;H2s`fx7}typ~`=~wBOg)7uQ{Po!Pcv6NI$XHMCDJ?bA#91R1>q zBMVUNQ&_m^rLq<*7Kl2grLL!AHuPNAUw^&1hE;O5ypHR9-A~&pqnX&0gj9A+_f#39 z1bOTS-GlJaSXtGP34Cgt&}Stzr6FQ!xX`se`|MM=xw+zsD=s6z)CltqZKOUsT)i5<$+@F3V*t!1MXJ43>Ua36OTCBzq*;qpE8>vkN?=3hvpeJAn zdR%_H?)02|f$FuLwLxSm^zs@T{dh zrttcKci>~ipVLMuj)&jzF%Wuct$lyr$NjM%`)GCBJKNyv=Y@!fFy!UtptQJ*-@jT^ z(`^ZSRT?}{SB`?xEI2)~s2oqPYQcuSXlxxBja}0s@#3jjc=wW0{QK-&yme*)-Z`@n zFE5G1<1>Bm`Y9p!^qM3bxGn3Lj_2ox;HlZZcz#I$UR>demsZh|&hW?1J}2Bc!o$p&`{RSmaOW1sdort&PK+(s0ZwjiAzt$Ly+j zOeu}V#FA(X&n56^XS>q<(ZOg>4?uTz2!<9$qAk@2O^I%(h;~9@G;LvFGR6!aWTu9! 
zsHozHUX4ac?dozAQg(->r6Q-j7m0+OMr0(^wIh+R69D6y2)_CbgmZsrZL3K!pVBiN z$z4N`F=#kSMo&U!_b?O;9>o}e+`hrc9l{tk3`Jwdqjd62TH|rZ9ybXILkA$10kivVIlXknw{@4WL4rc9Y)_9-+6hF*e=EwA}8CQqJhGB2yF z&Rf5JJxWSS41_v6JMr$j?;2o9xs{ifn``OVMHgLUfGZekz73tDV`XJ!=D5l$YqRFt zuwc{|NoDT8|9%50J-gD`%huY=%j3t7H{~w4-~tpD6~T$eX3_e-rRQMzt=n$Icq;Gf zH*CQ8i2(3G55E%#D7DTWGTEE$e7>%(F1)5Q_UyR_8*b37an3mTlvB~s*=5#8jvR#} zs^25b)?K*{#f8OYe;@zWufOp(Q)j68bybgOhN~SU^tk+v_xgqaJErca+$`|E=ud;M zklH&2J*qr4W_-r$32+vA18waw==CeTfzT7=1UnlRcD6q6SmT9Jcpn?k+4$tZAF%s> z0iU4v!G{M-BGM0(LZ0Web+p0F-38v>UPw+#GSI7%+hUZJRiM1G9M$z|+2v>Y$f9c8 zIi(P{Pt3ut$!S!5>4aVm{zl;a>zo{Xd|3rPIIocEEeg-f@y1_Q`r@NYV{zc>IDD}w z5npYI$C2A|uAw*o9F4*1~fIXvPaBGhjo|+Mg=jR0D zfzi%*m}>7;g7mLT!?9<$FYfB|$0LN^gX2T6XQV&w80=}1o@(Nv4IO@js2?t<@xZzD z0cP&oxrKh1Q_S_6C@igu!;F$pOe&zwt%}2Fs=LtzVN`jc=t@&!QD0M!H?$}QJ-HER z&k99bW)SM*Tu?*kRYbd>GAR(Fds@)j*^ZhTTHJCaAy9{9%I(@}q!koU?Uf^0!_X>P zkW|x!+G2o*KCF2Am$v#6t6TkHXO0R+7+p%MZfra29_4W6^|J{7y zH65c~f{7*t(icK=U#RSZ4?g%Obti=-D=N@vav!COSGh+XdBoVug$oy=ySv+fQkCPf zwo+klzx}p3Z`ZC}rovqJz2%l$j5<@xp|-EN<{EQc>w=q}^X|LvHc(Xko7dJF?M$=kE!gWNt4Wb)%kiRJ)7>M=eq2&OU?CV zoi#a;%Im%U?QefG=V<%MC!ahvN%=z$J!I;yG;>@}PmeiQ&+#2Wq{fKvcx~0seMgXe zO%?YQp=LwC8wfqC^c0S=3c^gvP|zc|R8ME)SW9nzBZz7JrK(ey$<0|_pVz>!&=c@% zORa6ZaD>LoDjbPGk0lNGFJUqy4+wQnQ5g8CVM9Ts*V)qzCw|869-asY2t;ynDhdmV zO#+aL%1V^+J6KUm_)+aSJ-DC>w@t{!j?t-jctJ6qJGqqhvjlIRlaJ?DX~t+y(6P=|JV@h8tj2> zw9EI64a5C};9aA`v2$1?whxcR70uqbsELZNE*NX-BCxzV3@hp)v9d89i)v$NZNo9C zI2_}PBQS#C8WCf!yFWf|XMld?_qR^(qq6uzjN^m9koKYF)jlPCb45!6) zjLcBI9>J#;r0OcspVyme5>N^R(UXy|)kj9fV^GA=yk=5{OBRl=dEE5QrI6g^6uck%VNHlwNxc!rKZF*pv*f zx-hs_d%?Nd70xwMcn;V(l@oa71Rj;1SFtyI^8FByO`ATUi_klapZVXfMBlMyi~-O) z(Mzxp81#iz$ku8(oK{@F-~yYJlfa|0f~AfNE;^quqle%deCY2IEb7`=}--gx8K zGwJ;a@`8iPTKEbqy0+@fOg3k!w*J)i24;kv!me9et2q9c>iMMFPE=24tSk3R*{Siw zDm(}9bPmt|FTgVj?}X6%BTQO&#);r}{PywSQ_Jl~{b45k=Dx1pUP8|W&Tj7T^zuPu zWHd4}vrI!qrD@3}@M;9T3OGHyxElA&EW*~|ad=>U0bX5OjStqf(!RFfnH6z(jaK(x zYYXtoiX^JM1pMQaBz$;bF5W&b3x7W=39p?V!DSS;6Y6Fe*7`{PRqIymwwO 
zo?qyN$EJGW`FT-zd`c+pALEbP`do4E#1Pz0HFi_4FSd30U~8u*?wb^emsX_WiFsjo ze0CUaAL@p!tu6+9w@~Hn9_dXb8ID(%M-X=Y1fet4qbL5lA_h-RPa^PQ@gO00@0dXB z>~_W7BLcB|bTIZ#h{5g&DcCtG0oQf~;i}FMoLcFFwd#o67KhVYYvpGh^v3JuI1ivt@MCvl`C8;2)Z&?xEH$-cAoGp@@Pod`$@NMCBJu=KKxA^I}uCG{w{ z`1tsk{dPt(DKIG`&38L+;vZRIs}40ku5HNBy=Ain4%z25Yu1>C3!U@80}q%oI;Q6k z*aS5lv-OiRXU;U|>6pq1@Omym$SOB|*>y}WohMl8v(m9EuDHV7Psas*y>|;qDOzm{ zda~tO*EtG3x894CjZ~s(4ep1YLfvDIhu-nvv&U>0wKf2H$A^WU4O`X`ya(T#;&|Zc z{SOQazW)@*AV|=i$m*Wh@_vW|TzJ+bBSV z@?G+KD77c>IX$te5sxja$M&&F*fA~+Ppr(t+m|U?; z%)pe|RE#W&!r+_`^kn#>J;fKT89``B_AyGYJ=Kd#57Z?3p(-;Lo7hBFWm; zQ0bLcl%cY|9;t;Th$$?imF++jm0dU?7ggUzFndO z2g=JARauL$vMPkqI)_))81N+$b}1dbgdd^T+>E3SLbIt1K{XlhEQui80^w2W2hTDe zc$9g{K zQCUWAmaP+9z1(oM)bS#(-PORAAOIK|^KjIl8ZY zOte&Y7z;eL%38>2IdI#C1)<$?BaGTTp0fTfk;eDMnOR#$_c*miV6chr-zo*;?d;| zxO-+kZkd#br%x)vn-^8%(~X_@+qs2!c4Z3gpAm*f=fvWfMaj5tN-!Rr9)Ud*BXRe* zNbDXTVN%KO9!)hhDuUsQmzGB2z-7s_!AbbwoOHZ*W;#ARF9okIsZD(!0Jl9M|?l z;D%v|xMpY)E*q4H3;WV=R&NGYwx(cVV+!UpreRJ?7G}5PU~)|gMioa9d=aK4wVsSVL> z8jV5tHMS$3mQ~3`6Y4rlJzr97C(@e;y!OG!?HG*Irgp>#c%>BxEiFeF|A>*a&oOm` zUUQqN<4dB-i*Kwa1Z#QyQiN2d!KW|+o_T>(b3yPcRsuSnzF?Tonwy)1{O>Ll?`3z^wUo_0MxVD@_H7P zef{;<&986G6|y?nGh1_puU(e)tg%>?8a>AoK{nUMf9TDm;aNr|v** zZXO5<3PDOr8cK>vP_1q(m6dRMa8W%TTvUrabBb`^{5%4$8UMJX7H?iyhBwbI!qY1f zaSs9YYC>{Si zJqmwYm55iCM46m=FD-S$M;9d#Qh|74W*GL24Z=-b&e+oKgsmOURD15Yr7r-t4i3T2 zK_S@KOk3L*j7O%W;^}$3&!`~WF~XM?IvIaolSc)ajF*>1;+gqj#>VdKRe}N^+%|;j zQwow0-PYlW9eupsuqd2cs?_oBxNSlz_RP$~j;`jS1+p5}*^j1ZK@2cn@U z12ZR#GU=w&a$8CGRnXoRQ&E)E)FCP-kM^`4v2C4*Z|g#Aa|fcS_M&KcqpMnxP{aMz z9Z0I{LRw=t(wYdgdLFB+CiKc2!D~fURUwKHjpMbFn_3u6NN%h{dTT9`8cPsSo`!(D zC`|V}g+2Be+Su2~gT! 
z!C&l4F1f_)SBtCNFPO?os{^vml}*%sskKv1IR$NPZ3Z@a>AK^_jWfsh?Ac@PYgL{A zuJdjEq*R;Ur{H}4`RAj!Sgoo~SiO3+sf*KbYj0)stQytz1Y@fT^||O=!C9Y^Y`VV8 zf|&p(C<}T5r`|(RQIS!91qB5-@4WNO{dF&MO|DzuohVKOyn*E%(EB4yUp^_gZw0+S z;`seqS9!h9w@Sg7H5$my}At$1{46ZXt1!V@bh@bAmo@#tG7=ePjuof2WPvplgNg{m(GcMOZiwszXrF~Rr?t?%DYO~P{v zLh;Dt5Ii*_91l-%!Yj-DuN7C8E*&!(GcmC;0e#s)=*|vC8{yZQ;)Nz!T=m~;%?L(SaxiML z;xMYa1tWSp&{$J}igNzSRLkO45HK}VRwalmDWfWDp+f6Ma!VKD>e~siW`e4j7Pgh} zYeQs3GeOs6>h==Uu)%X9sBFSZiV;>)f@oeZrlJaQl~qWrsYP-vuU%V-jD|`iRpueC zB#l-&5%I-INGeN3N?97xE3=VXL(tWfqOw|`t1{14Lls)9d}QTSm^FR&AEfJF91lIE zm$IPJOG;_rwp|~%U+V)E-SID#QyKdbIPA+_M@vCPpwP<#N-rradoMvs$8-ky!Pq10I#JKm|i+hp|bWI**@J>YCc8rJ!i0qwELR@nIF;@xU9X3I{^bhCT22>kJIl z4=QH_l#b;AR*V)2({X%6_!DWok;#A8%}k4z7wg$*F+{7g&g^Yat% z?=zC90Hg8jyhuDbJ(~N{@bv6>ytXO>Z=RBc`zH9|_F*2hw^6ulSTHs;yW;M#(RgBB zI(8CZw+{C=>ECZ?cE@8gGO%l?AMPCHhR0_I;LX!A@YB3fD`nZX>%JA+)d_{PVQ?+_C;-S5Gqr`(N>;|DWe7( z=v5MWwFHvMvqxK+TV8=ETHI8EEQc01tE~qqRCCI)7hc|k@X8iMRJ9^g$0N%DMb?HdI~|9MChe3lFCYvP+Wkdk~}07rX!&s1!=_@$giettu01T zO$nnEM`vY0LSJJ08n~`6seM^k z9sd%129~qy$HU7m_0qNUQdxUH3qhSHkl3YODyMUGp33PQ?UVXbpFNec&!^*Bx94ly zUR!V)2qo7w`#sAAh(XpMHD* zpD_f!!M%NOC-B^i+9UMb2t8K;(8a*d*DnAG2}vl*FM`wU6SDEpf>J!QvJ(Hguno_w z%E6P%lkvvc`387zUtEee&dkMg%hPCQlW_mkcWC+t0xukzO=TB6Y(tJx`Vd$re3wk#!^v+5`GbQV5&F%vN#IQEzQ8a zlVh;8%Z=e`CIxzAW+EP%nt{C&(s0L^1Uxu97uR?DV?$2>?w*v4O|;PG*0|xCVM(}U zVKX+)ZN+6X+Hk@2E}S`~8_P#FV`gVLCbtw~VnYtbR;OV^Nu24nGPEQb-MJwcR1j{K zol<_8RC@6~s7wk%b!IFk4eLg4dlP|DX6h2k7)o$hLWv!ho{gBye594tpoq}R>*zy# zZ7V{{8xT>`g0Si)gjI39s*W(MLy{~k0hd@-j@ZINB+}BQ5}xUVXkt+@Vsdg2o1KM> zl0p>Ll%s%_wWy{TCDlbJBkU@;PlK^5Rj)xyTvu0b`Y>s@j#{@gWT%QzQC5k0v*-N@ zc#fB2hnhD2Xx{iUJI9Vhfg3?2$s zRb>!xu7aJ53!L3O;NszF;OFe-0snwtq$H)m>ApGnxNlAtA=iM{&Z@%GgxJfc=Hk`U zvkAK#{Pnaf0w)l^?3w)aJ2V}~E^nG%MF=fqM~CE|f8NqBZqvay@b zFOH} z6T8OwQH}ZGz8SQ?vqE@}Xl(0orvmiEog;a_2~mV=HeOksgntpxkIyHJCsWyt3&-y9 z;do?r9QIC%G|5D7>J8`h60mzpGS;_wV8@tf+%qi=JE=_9b%$Zy&_rB4p$Jz_uEAw9 zJ8<5#Zk#%?3rk0|U}kp}CbyPgN^21&HRWM+Whw?2Mx!@B%)qWEKO7y|A*Q2pOL`z` 
z61-6!>w~J)2#g=pis8MTCXKvcQ${tVZY*j^DlD!*N;)lQS`IQwt5HCUn?v{|H+Leg zu??|Jw7B(6h^=ct0s)mu=%o`UOv@hIf{AhQbuv58coVj zPE}V?P2g2BDhNF)y2^4wkD(DGdJa98k|HS0c4=uDa&z-Ae(ZP$(~faUD*}gr^#5`9 zAAnIDS-Uqb69oe0oO2GNnbAly$~mJPKmvqB5F+QCa|VNR!~q*@48{gyf=LcwY~!5Q zUVGO$2XNed{^w~?-nL%s-TU7E_kH)iXZN_ftE;MehN+*P(-m5I_j4-+A2xg%(`f3L z&|%}JDR{9AU75r?QfsruFRd0#3o;i2*pFidCNyg%mE-!mIM z>rx;hG?<@xSSOFUWm%>(pQ?x7i-I5XGq1|8&e8RtQ-P_#k5JP) z)t#pFMDhLYZ&Z7-|IWLpDXqZ3UVULtwI}M1YLDR4oDh_7w{QP}@bvJ3-T5UpoSqer zm)F?v{4x`sUz&_J*IDuU>J+@TQjQnKPmHa!9_Er`Jr6Fu?VY=6AAGz1ssdPoTBi!1URM{ZxYjMHdwSBR#)d3HVaKzDZ0XR4^Ktf+%JUmQ7>ah`c zer^KJ&IrWI%i?f&j2Dh~h2r#-Se%*^je8pg;6Vw)cU5`d#{9ujPcR;A3&u6Y{cu~2 zTw_EC9_Wb0z3qv(bwm=b9UduF2_!UV#~Hg=UN@G+}6Z462d? zMB#a2NHSjr?xBs@%Zrq7H{1!ip^ixNAAqzFSJW5SFluN6vNJQa_4~~N7IqY*iQ6pc zNQyThJTe|(aRx+~(-2|HM5ur+C@TlSxp@f8&KC8Tjz9@PgGJ$m2FtTa!UAdTih5T99z(%@skxq};KD|DB~yS<;89A9#)jI{n4hNdE8x(04T3_| z!L0%Sjn`2CBkWW@mRDdRNVsKQZkbL%uwGu9b?NKjIaQup1yO>5TXjrm@H{^gufdKC zc2F>l=a|O4JSLR5ui(V%DzK^YJWp3?1P||v`I*i-2^N;)mMg@tbA=rwJV$sDVycdR z=L&jPQg}R8u1{Cz^q}jh_&yc?Ow=7+0gnFVXP@J%&%e<8TXvktu>#*GABs8@=RQ^5 zM;~#i`M=>as>b*JhWe6nIQQ;{0rGhr1bqGZY)|&>t`cXbnilHv`Q9Mdy|ytQudL6+ zGfNG4b5kZhx~>#|zq$|~UsH^ab`;{B_33zSUZRAMu{b^^PLx|Db{6|%XQ@B-)CXz6 zdunC#vb5N+(fx&H^Q0obK|LaMce_OmW52GrjQ4EPqjp&Z7AI@XXY3oRskN zQ32Yqb{E`V-&?LBAZm8So(5O!ZT1kw=Y+jOop5jcKs+!^6lF(*gsbs5JR%0?rY7OV z`AK+wK_p&U8iOO_y>YB72&bpU;l(9ZE$n`Jb^;#lh`}vIF4#LPK%f_d+o~OLZRr5) zmhkqTkx{r?pm#${Jhn6^;g(6I*g0ny){UveyrBh{-jI*Z$_$JuvWTaky~vD~tOV4h zgop>AR=hxfmmlqp+z1!#6_JGz&d3fK3|oi`YI7}^Hhv6Bi%XCyA*xxR!?8kE0Z_Vy zixCK@OlgQql2A1v34sE%0HX;3qT2Y}&P#ygD;{V}Mwp~WixNu^rDqU@W)cvZ`8-Y_ zM^6>-as4neJ4?(GPN#9Di4<6sOL~gY3}d1RNeRh_h>S!~XfS;Iec|Tm4mS@s&D~|c zq^ww)HsL^5(4#BpaWHbXI-;@RDKM!tngR)p=M^aQ#&o7BWu{C)i*t23f0l!c^?CIB ztXIKCZ@fP9@f?l!q5z}Pcx?qV1zhzRn2+a}m;1aY*2#PdhAhW@RX3gG*-1jk5gMvI zr_bGc@4XsKSO@d64nm8N;r;8u$@9#|bA$%BtcT@Uj&&0<+%k>(yf*Wxv5o94S3qQ0 z-YfGl&!4#h-XEzvrm6HRxdL5JSI{HCKKt-veE!iV`0^9^44;Yu{ajS)mtWyK0p9mt 
z%V%Y;v7BSRPe16+&n?y6C!c(bzyIw$d?j^#C*T}jSq+a~62i)7cCzcOfv>N?p1{AV zkElNRx6o#62pH|oE=j{H8*}jD8Y|w}mX8mvEyrKC6ydL%^YOQB1^DZxEWEhbfYXyA zad=c19%u{|Rp*EM8~wD^P#&KijOXUZ;YfQF4z@&LPqo0R+6nj9Ibv_aKpY?Ifn#Ib zak4W&fai_-MS-0X=)E#eRN90f9BcEy-r9lKRbemrJ#alc8}bL?&LN(7fFWs%Kkjex z!mespJUA>EXS!1HcvlKuns3E(b48(w%6n9 zbD=ZtYxcoCO>Velhy%74^~0_8-nhQT54R1E#jT@ExMoBOZk<$y+h#OieMcFVjV#5& z;iVe*##g3ebg=~;0=`lC$rzp$izaIbs*JwK=4&CNJdnee3;Xp$Zpc7UdxMb`?uLf^ zR4kuA8;y-kNK4OxNubB!7KEOry3+XeCL5AdB-|B{B}jU7idg`cj39wtXmS#w%qGMN z1d|wkN_d-^o&lREM=Lu<(z*d>6d0Pgc9?{$3{%-b!Pig{dS-Jnl9J*P8xxJNkT3-L z2f@e37w+zEaCLVU(7DLg1s@gGRW}X#|e`Iy5$V)}emyib3YMy@x?cq?LV?D5cuZiZG#)Ew zd2U%B&odv>6s)*d0-?u;@+ytjVt#JbF#*H-U>z(^*zg=-!*aZDmB#!Ef&>`1ET@3X zEy2Qa9OA>CX->K0XCa7GIo8Ros^d!D6L`uM?DT%6^XgcE@;`yz*Pnf%floQ%rw5;c z-p3z&Am6LFgty;HT|a*ODJC`!LzJT%e6Q*UR{~VU6Kc1ZxhZAF>1%%!>(BgroMTH$3 z<$}{=JaKxgANGr)+t=WQyE$7TeSm<^1veLY;EoD!Z4uDjQtsYbPwZMABdX@L@hRY;_e1#?B+0* zVZPW=?u-q^E&{zc+&4J~cTFn5UDGRZ>$DnN-BpE+oz+;;A&PHU5vJ5+VO(h{MvKzx zC@_i}B+Ly%d7_uV%?0@}-pGsg)Sl;M1`vD>0zo$nEl$VQwJRme7>~4!Y=NGDiqK0- z)jAKfmk0@Bgpf9|9yhO!$z}3wiF0QWdm17@ocg;OLT;U<$b9WN}x;jgE zn=eiL_i*cJY^ZEZdSk=m09?*mSh8fvZ{yk602N@Eu7w@qoTE=&lxTyR2?^%qAZhmA zGC$An*s((^!v;TV)+{ZJ_rO8bZ2T<8`(@o6BS>fcRH5qLHf`FZl~?ydC;0e%unr!t zU%y`a-Z;K*!GZ<9mF4(90)gP;F~3J%kNX^}r^fFQC<<1uzy7*5XP9~DYuB#TAj?Ht zRT`bwS1=?rmFoK=@bNsarLNc0uaw>sdipfF0-u5&!&*J;=-g-87XmSEmEz+$!cqCB z64HMB(FgeQ(~ogU>iJ1PxMB)&W(G`m` z67rs%5rwnUqXl~DI59dA4>o(@Xonx3ni_&f+nw>mWPdy*A@IRrF}SV36T55OB;@qO zGt>RFS45th5{N^s?gC3LALWBv3q5gDz9;Uj3B%5EQDb=nG~hkj9wwkm#^V#?aC~&A zs6G$5ekfjC8jJU~Tk+TJ83Mf+oRDjsm>h{0mnY+-oZnsNi(8A`aCe=A%){m0CPv}p zOe1b@4#c`bCv2~gkhk50TgRs3wuyzfeR?(SoZpJ;XEbBu_*yI;QG%HbxtLI4#ppsK z+Vc&vO-4&*B+3%IVV5h=D~R)!5Z4(wLH63*+-!m2$ck*-ux%6O%$WykR-X3m3~eC- zOA3+&Sf+FVRJs7oD%(^ENiAjxLsO8RYC)PPJi1j>p7s)P0Un2yP|2|lTRNxJ7-2H8 zlS1-~3N)E5h>VJXpML;6M6tOG&|ExS;N; zhL@ZxFDsSi-wiy4zMif(rKkujhHz>CE`R78AWV2rIdHHq@0D+}VhZ@9#=gK2W0kZMZ)HpjrlS+tS;I#-h4u`T~W#Mp**jt79?Of4&de=0p3$hO^vpC3xP#vd0vB`gM+)f=cB%_)xl|Y 
zJkRS8s00p`9k0)HLWf(XDZdi@6!i422b$jXaQmZkzf*j24}6BFlpWO^oyUZoQhzGc zRq2Ex{R^%9C;0x0&+(H$?;n5v7;8Hxp)fcKLt+d_a&(86K+eUk7aRn9eF;4IF4(!D zjpI8!&4!b6GjPxFKpYqsg@;9v9h;blS6AlZ?e*FC>sFhDiUv`BL3m`WD-L#eYn>Ml zGzkDZLUDAopM<4@@#shaQKLWZD!0eI)qQb(W++}*5Q(QG{5>?%S6~>8n+qh=EFOTJ z6<0}l30Wq3;KJ+(oEjgD`|CY$OQ9p~EceAd)gidMItaIvy5rh>d)W@eff4?AUKHtL zogtzY-Ed!n1I|nd!Ko>}cxk1C%JU;}YDO5ImvH^e)EKR|_s}RW++I3Z!rQ^vJIovV z+eB543cy36ME8%2$JR1;Y^(Cat!)PE9G{7uQ;J0CRpO5M!?1I4JFcGAjOCizE5y{A zOpGZqVpLut+C}NLWXGW-(GR(?-pCMDmlN%V!Z>&2gbqYOgcCACT`+1$9&X*S4a=7< zM}|O;N{=f(7)7NKc1Bw&jGUr&dHE@;>=Sqy;+X=33;{&Cs5YCZJBGttpRZfZQSlkY zlVLCKK_1i^>EkRL&8*7cLAKZtANf`LRVK0H-VlT93{=!O_ZIeJuep*cnS18 z1&W$WsOu>(EF<(T(~TQ_pv!%AtinvaO`0@GTLVv-nq$hw#fHu~xGKct6fQLNZ~czWeU`t!^3t!sD}N&;B;6gN>P{rna#GPMI6!gCPQq z{MVlg6s4|9AIb0J;lBOz{Y330o}?`dj}s_)n=6V5Ho z5S5pRlhciOQo__zlTCPaX+GZHSd14ICE&RQ5qM*@0Z+~d#^X~XwN8z*lO??D2kNx2wtxdz%8WuO$qRj!VR@CO@sy;l|1V*xlleM>+#>WPA`F9_20yQvkOh z2B)V=$UVjlds+tJF;R8|-l0+cI5@%!d*z z-5P>xYyGghD;0ZZ7vsJKwW9JGaH~M?#swp=W#%xfnoy6$9YZjug<)I6STnRvSVTAig8bp>?Ex1L zQE^@_aPe`4n{4@<&RxLf>FyyeiqF+URGgV1_2OoS;D@)^?RF)^m^u{t&e^hhod7oOjgesEPrLxjri{}UkLWI{PaA*q5Z0x*$ z?yG5Zte@c_!9lgiscXDfmgUMUoGQk8>C8`sN+>GzNB}Uu3R^i1j$ol8rOG9Q_}vh; z1RIZ87lFuh!j|DO?}6obT?H-H$9zm@{e)h9eZ97?U-dYVZ`K~L{j zf}j5QcW~?B3VvURfBnVfXLnS4;@=DWE`9wiE)ss1)0svnGB2U`#by7Tz~pr$4F2)U zi}+5!H*NHA1o!WQnot8O0um86zy*#H%5s)Q?>_zI-@l&*zW)4odTmQC-rAOj=a#49 z!qRj+GcR4jMw5Weh;uU}oSYJYC#Hqs*?A0Qqj6z=Do%C9;pEsTJUulY$3*!(F)>)e z)M&gg$Al**#)wDYrT$jrYN+N#S^KcmQrLb-*3fjyO6#0>?YUaeQnT4h-`V zW#}wQkI(Elwk{02BqTjP(TtsSKDefI5N@q=6HxhL?+9N3ToBIAj>litS@Fgy6VA_% z#rXvZI5RU^dk*=;^mrU@=d-_1JTlw|N7@2Jk%}Um6py{F{QELR7nM>Tz&770V z&Ia3W&*{|QWot%vX9b_-8BAuxCL|&vItIa^;S#zA!pqwio?hMpF%P&?%`tTK>ULL8 zXSfUWJlq6qZUQ}5fg9aL;O8QMb90w6p73+`gO`)!@eqaQIS9VqgT59tlJOhUZv@(4pev zeG)1RCwa`ec%OX6r_S?w1R3|K9tk$8GQxy_AxNpxGvkoq!#eITm=FFKII9X1CoFL+7U>>IHf%o5mo^k~{y({R^^^oi73VKv= z1RI^O>j}N@{p-%Q_2s5ERI0cb| zJmB1apa#5NeFvg%Z+rCX)ffGH3-sQ*Ap?KAAro(HNyA?@XXB-188|;jz&0rfCnm(< 
z@o5R-@pyD}sHmk-fuT)QR1BV&Y7jMO#fh#UJT)~6&&)JR226fH_D z7|%*r{LCDIUaK2!Dei*@hI-<;d1-iTT$}(e1iLETaZ7;%?ksb`&Qe$GsSCvI6+XDL zCIlNZ2V-5j1GX2q;U>PgyUb0}y>WVqL7Ve?Vqy@U5D>q*Is?xyOVgf1o|qhsg95!{ zV?%JdD+=cY+~;SSMZG5Bt_E-1Bx>^7N>}Zf-aQ?0xVw5|h(wob*q ztut`<`f0dn#UyN-KMosawqxa_7AzQDi>X74(OH{^5f#~}Oix4=$L-m|QEcF|Jx63l zI3P3175T|wnA*{TgZJ&mn2~J)I02h95))OPK#$Mh*xSnIaO|Mqu`MHAQ+mlpqt+1- z5fy>Jpg{Qe`ohCgpe2xV6PUS5XzJ?hra{b0Am%A)gr6HbBV_9-A+9^&rYSrDp1Ze% zv4kH%=ng+odqJZ5f~1}>PbWl((u?(Rf+5fw#^4~dRn-e!gcOu@B%DGW0d;xzAAJAe z6^vjY*a!u>T3?U*?9hmdi_;nijSu40J{8cAAw#sq2e=$O8$FHB^jMzHxA^RiMraT; zZ2VMWTpgLlMoc4g)at`rSC0T=n8#;lv>7vI{1)EQlqzC*hMzW@O?#flYw(&Z$7`}2 zLr|s@B!nJ24`^zL3ZG#TFzT5epS7_r!j<<*D6%X=d#X(WhIv>QJ4aYI%TY1%^ASV@ zJ0Z>Qkoj2`^AXUkt*shhxO{$ebhK8EdH7wk9zsR|j^%ic_oveIu7@6Bt^7*RQ=sd4 ztXF&*;4UBk&M!lcKvSSoPS6o}1fRAqTQzp?D=I$1Pm~_d{rJVV_~D~(@XP1lpuMFE ziGFTqiA}@M2pggVdam{k=+oC8z55P8f4lyo@+8dd*9&&2e(eg-pw;{*SZayd}kQIdrqYsMW-H;pYBH^wF^39Q0G@}c5+;Rh|r7>$^tSGnc&{jfMtw*-oQ>Bq{ zrUt`W{%`{WBy82dC84UTv$&JMP86I5HBoWyZt!yT6!(I+K#kyY7ofShN`3*Lhd|9$ zR35`wjzIVF@P?11`3pD$y*v>tbtlR-%z=JL3-CaSD7|D?dsw|)ksBC-&axV4uf3&! 
z`v;X60jC}OYxlC$W8+h5jg5*ckLZCyrALLs#>7U!#Y9yo$Q4nz!pR$NyrCU)EjiXj z7_tnH2^%UHp69bZ!b>SjZrM{wMaA>1mmp!|<$dwFT6}!Gmd=g~_MEbQ0;i*+Lu>pz zR?h}`k4)p@qAI+ts;c@;J#rCI0!XPpcGjr(b5M;j@5YTAwfkbn2I0+fRB^1A_dzJK zJUdF*A;RjK~yL7=n8y#==F3x{PeECM_0h({(m_y>4YAi<0J8#k0W;f{JyX05IQ9DlsHJOwW=x8RM{Hc^Vvq5wVc{zeO4UKlNk z%Ln@!2H??=uDCEeTw5gcfuUa5S?PqktDUg7Nub#1C=40uS`N+!2(jdpsl|7amplpyIX^N=3w2Vwm0Y=BfARr(>03{)*7MhA; z)1J#YOPY(;$>8DcB+zpdz&XK3VCU`Pt%a^sbZ!DZhOVwsp3m*vy##jNe(?5`t(>R2 ziwN*Tbbv3Sz1@)P?GBrl3rgj$V^x4V3OyW<<2(oz{vnuCGqhXjX)NPR+h4W!;PQVO z^(6gQX+(M>oM;3ejSZg-Q<(x4P35KO)73r|jXs?XR+Uj(1zMU)W1}Si7_KVgIhOBf zDvj4tb*VI7Tit_7Q}ysOF~8cX&&z$DSLIY4%*S-z8@GfF(+L)pmk{DU>!K5MG2-iIbd1*tFHvAUww|R*mL{d7r?up;o31{&>9tuVUb2yode+| zVJ&-Z9qjtTUR0jlRsCScMF{NW59FI`QbpZ~(p#T~zwgM$vx}2)ZoY)Bt26P&x*Q1` zEqG+CzbK_(JUKH-+=Tmv`Qgq+Z|ol(hQpJ>@j$x=_O-d;>G|<^afuNpC;H>*S)q7u zLn_`}Zomt3V{me;kGMBpnjeK1=EsR@3&g>WFg(IV0mk^tu^Wze`Qh|bsc*O^?r-tO zo}t0Gvo1iwS`XY@>V^j-96mZWO2X_w+}G@j1Ea#Ud%0Ia>JwAq@zflnfLfq9Nq{J@ zzJDm+ncGjwJK!D(XK$(T!}V4E*i_<)>xPL+otuwcQ!}xmIuvW04Y+Yi1$M6(i#rIt zr4w=8qKVizrxPotbztGd5tuc$852iTV@z`iM$}}Zsl<%>>{wK#1a;5HjrT%Mq#Ls1 z{LoUJj}=Sjpk?Gx7}HWz zq9gDGeC}@Dz;kmStnGWT6T;P7%6P-wQ{2l79^T$?^P&5|&C?4$UcLf5Zv=?y3-k6u zLP!8iVF5@D_Cca!f22FxV|b(=7G#^zl^lr*FK5&Q1Y>dYNG-G_3W%Pc1$5GMwO0jc zFA2H~2^t$98zZ-L8XF>w4NRrc)V_ig8>BMT(Dg>D;Px-iJz8U}GHw5AQ`?SAkFO%**q7=>3uZ8T6E^JbM2|{QGZ2 zRevk@Ax`j7@%`|P9A8#`1fB*$?hELYGsOM$vrqBaw*tK{KL?(AZ!+Ha!7Hr$pny(E+%SUY zNqD%!2M=*^gArl4r!f$B)cE3tqQMegyW+9VC>(4H66pDhnhV37)$X{h!ckLwoPze4 zgxtr+2aDQF6lLm#+lLIm^~L>hd#x9)6?ksUcg5yXZ>+EM!mVSHaBW*WmX!NrZBqia zwrAninT@!8!6@7~uLC>gj=_c*?N~ao6?4Wkips0jR9;(M9)?z?p}sH)Rn{<+BzOz( zyipkCj*Kuj6q+KmXAB)3!;o2!38T#-;7gUTmdiKUkeWgOS`m{N2mc^;OnAdfpye%~ z@^TlLiDL6`an%6G@YYq7ovVP39Tx7Q&OE&&gmrZn7<$1?%DH%ZYUA%*z1-pED|Jce z>g6S{6SWuZ697X%5DbC-Nb>c7(Z>~5Ul&+h>`~}77?Vtq*i@Z|nd$K;l`ywR%1^GT z7cvoc-BB)^{0{=VpTyazAt?ASji1H_MPnmoBhwojp(?}k`to|1sdVO7rqb9bX*|b< 
z&DjPF{iyC#Pc6eeHb@$;)AL#iNHqO5SRYN@3(H*TJdMEM=Ti`(se7XFocjE10{_0r&UvS!GM`5qjElJ}N;fJdW2B;8FE`^64k|@XL?! z+1HZx)d2Lmtn~W!73e*;AWJ}# zrS;stx;96E7mueGCgbhRd3a@w4S(BFh_^SSOX!-6*H#$unkcv9UA_WEzB$b(zzf8& zNwL^1VdO&sszdDoI5Q z>x`v&195e8Ft#>_VRcmywhT+c>iR@%7@mQxV@t7ld^I+7)njF60~U1DVEXWKOdM8% z(e?Qf+Ge7@$bhQsXp|WJQ4r&SvUp$QhD*p8=8np&WVE%`Ah#d`@#aLBGP_qlF^R%U zw`L&KY(ZQ?EP}&=wF&;55=NEBP6iLx%Rpn#t2>98cy!OC(8l1o3EaBJ+%cV?bo2Ir zlK|0a@Ibf>c942p;4k0`mryx2AW-fn0!1-#$O;WclG8xM%U?fJKRYD%wL`vZf3!#Y zU`~20Mnt$H*SR+eL;TRy*g&rR20Xgx4-I)54Rw%WqyHm-{gDwwDjyp&k7;b|YM;h> z^~W^*wPny6ex|8w=&#Q_3S2Z*mjW1%>CDf(EK9(sG?pb$)HzkR zf)|guWg6?GvkdE2FryPl%u8q~AgX)dIp$S$>LI6&>8k9Nz}FLc%6o!~{%6p;Qhw#% zNqtJWT_pSjdOv-45x-oNdl%@bXMEaLTnlkITjR6t1qr_V@(XA|cCW53#`#4yJkk}8H#X+u z#EfX1Ut+==8*=d4+AO@cwFqym%f@>fbMfTVFq{I@vT2ryCCK(HBQZQ?X1+(fhF{3dTQ=9V8RiA?qWvOT_OF?x`3<`{X$cgnp zL97>YBR!B6?g3kb4~A9dBPY|0)Ql7uY-S`{Q(&}maR8gPUY#+?fPesB`1*OmQy}Ky zE~?Iz-VHoHqoezHOUNsstFyq+O_UqMRwqwSxJsz%?!_r(PH=V^1pBM(v}rdk;{Nvi z5$@)Kw5SM}0|QYQ5ryK=Fj$-&k?A-H?GfRqmi_F3eULfO4z)h^5*GNNGbsS|k)Fs7 z_r;iD0zEM{ae_~4>TKie)c94L9N^|M5LBa9))Ovlth9fZu8!HL^;>1?dOg!wUX|4w z8?`c??^!SFQ`aUq{>WHXU7HZ0sd5AbjmHEFx2#{~SNFg)l~)0ct_K?Tna1-fT>(v9 zSHbCr9|*QT_Dh#IOyysG@gf2DFDKkoT|EKFbIKI}uLLi>n(GNam3F24S{^>5<8wWM z-VXx09|d}v%Ra+g73vaxe8$HP4K+yl%P&61rvkmtzx+hbe+7K}86Fzng;nv%=nRNK zp_@PaoCaX9OMeWs?~6hG>@l!cfAkaZ_3PUgc26%($MI?r93a(Jm?15h5UT!NGP1=ncf4hCw(tBNB(&JVfESYv9{c=Z1Zat~e}^Jl^hwC;2>a zdZ6|i$#ZkU@Wd<$ohJxnhkD`uwg^#s;kdTa8+#`t;l7E9xUn@9*SADqMX`jwb&)EHvkz9PB0H}KwSU+ zFi4#?e_s@ZgdkIZY8lWUH4;X5c=)1Lq<*clGlmEIqDItRM~E|~CI+A})D5Lcp=fWe zkyhNzHlj>xzT^ZCxRr!|{L&2$HcXlx8f^Hg;acS@7~Q11>Bw3lI}=s6(JOE&>-8qzDX?aABbtFG;9;cyy#dN>p7-h^V)(`i!gyWGhu>!YPJUS*sLgFyl`r&~VQHY~`@#wffJS>Xw zK2eMhkMhLvPJan)1Mv7{35#33a8Io>_7C+C&_&`&E-NKU@r7BTczHntPEYg@=!M|X zE-xIO5Qy7FCGHv)fO`dS*Hua=J~C_Q&?-XbE={v8pB(^U9(y ztuP7`3S!Wf8IG3pa170eK&8na#fjd?jdeqMga9wp8JVJxl6(gvD?SvpMOnzO7+^_B zhAGvgE$C^q3iKqrHKwE>Iyy!hlH%jVP73GCofO^Bb7x0}yNAF|!dg!s4SJqle(?6; 
zGdcmC0MFOgPuvIIp04nef7GCX{ZQoRk7g%tH1%~tv0XouI=R9qf8p{2{m_sQhwAV! zi<;sViNjKZjzav?%D^)556dq*8~ zGeT=9G(8yDNUmgTxXh=A4vp!`c%H`!LNw-QBUh&1r>SfAEJss-psB68hJu#5e$TSX zn6KwOD^p-p#%rj0xaGCfmQ(Qbvp{IfPp~kLx*zs>@|foc7v|@8%46;mNZkK-&{KY8 z`27oT3V8o1_>}8m_FsUe^FPX`dSWl~ncg2jPwi{R;#7s2G86#*EXwa^33)Gm^0_Fz zPjO+@D%_P z98OFM!})n8oSc-Xbu{d5@|U^4)8ot6CsdaihWP6D2h(EI4f z!FXzdCtg{=rRD>1dYUM@HU}Jdg7*GA=p+Y zVQ-@-zVaZ<%Jo54x)&zpgxoN-?rN{Ag z0yq~Jg3%iR-hSGUlK=^ALj`ytQr_Ri5uwh5kTlovGT0N${A{EIk$UefKZm*oU)RWMa6T^w3KJImPWbsa){o-~ z)f6~2_K%;9z~nq#HO{ZJv{airOk?K^KNp>OIrffka#Ni(%a$#Z@?74EJ-b)-KZjp; zPjOG+DOb>=>%sTmfnU$_$_c#h1YqAQ=zT9CJHy<|;OiOYav(EZo5dla@lO)+{)1qY zH2IS8(5$K0l#z~^A#qrekd9@>OmsyjqcS8KF>YL8$Q=&SW;_QCg58<@AArW z=!){3R1{`gVKF8lQ@(r-7Ul31zK4QKJ;f&^A}l;yTNuDi8+a_rPPr(&-+(7zTq;i z3>xDiP!}78n)q1ch6W))l)6fVD&T}O|I8d2Tr4ckN$3edA$>qRb1w|TXQ!v!x>LZ+?z(&kvt5}9J zHu%n6f{o=bT)3dYna5OQJkNV!UQQQNYK`yJc;=aBw9m%@#4#~38pJtUgV3WX=-J!b zYu6{>ICxo|r*WnT>*gvjybkN;f(yJy4xZ+@g$oyIpJnml#oBkr0l%v3zk{CM|0@8y zr=|Uw>%m9YgYQ3q-=Dcw<_gf`Gd*@_(DgmJ>bV{VEo;yAj%#t)bImd!i zq6#0I9IdIumsi>F_QnFdx+)WIZ^*@KYiv@F0Z&Ny``Bb7b~OpqMukWiEon1SaiWV; z*dnmEE*Sfpg1Z+a9T|WZ1;mHP`s3b42b`Q3foJBJaG=FkTUq(QC~usY5{gGhd*RRs zC!Fc@#?zApb~F6(yr{f0(>(ClR1X}KFn8xLd)zhB9b3!#VMnDi?iwD3JBEki)|Oyg z-xP>VRo+-r;e$2R!B|-8kJ&|m7;bdJ$TV*Zw*{ag&J}rKgHgm4lEYn*6)Z~6#~!J^ z17QhrL2+6<$_p}JOEYWh+nEJ&CR2*G$Uu6U05I8v=*TE-Id)f14>)NvCoVs~y9_&q zv>aH>o?1_FA8$_t`gqDek{2Q*3=SLIAF<8@km}=#>_8tR4<3y8UVV@=*b(hM0oW3r zge`tiSQQ+FxslPxA0RLc4?DVLeL|6Ssn12=5a!IwedgsX7Y=HtiY7Fe|EHhW$-(D_SN1=LA4B6m2H%y=De(R8 z!tb}dvfmSWKMCjvJLQC*f*yykXakqUzu_N}pMS&`Uw1FG`pc&uWB=0GSX`6=vqL`^ z`uD|TQP6YER&)g?qS`qKdA?zq;*0k4h22y0<8Yd5;?0TImXhK)yjNGI;^h@KQGHo> zX>}n^%rWAnt8?(yZQ1zdXgR)lv=o26H5+ek%Mk@=#G$TmoL!hK05ss0H7R&k(m%Rx z2)?+vL4)2qn{)BjhIIU8Q#Q^_kH-jdpmr!$lkT%omOCX>`QNX>!f^9G2pVJ@tKXwA~pm%n8M7i=#vl z2H=zc@AMphJUZPE_ej~Bng(H0Ngv!kOv2?6;kc&88`m}XNXfrE(4I_?S$MQ59IsHH9ehC6B>?|=mfMzM5Em=2#>9>rL zF!aaog=T^`!bY2l_IqbzWCK^mhR8Hdui_LhDkds2E)>9LQB*=yP&77tPLrY%Vguk3 
z>TK|AP&Bn#GQ(2BiEHNZJe`o@mOpk*L8B|saJm<-t?r9kuHDD{Pd)XNmd1PJF+)o= z{Y@=f#m249F(2=f*XA+d#SmSu*3|M*Jg135HcGS-XT#cX3)UuDF+C~~b^f6!4~ar$OfpiWorJi# z!0z3v^YE%DvA5OyIt&Cpg?UY>?$x7hI7t?9V1!-Vr| zEqG#)8K)!!etf-Hx~#nbMfTtR8e?@XC__}_4mwTGoD+Lj;H1t z@ya?YUR^JWQm*~@v{-F8%ZbitT$q=NmlmbrnYoELF)-EQ6?GHaEZUR2`)_RG;;~-jeZAr1=Y95dRJ z!%-{nYd0idXm|vwr2M#mP^^tI;6_sh7KKD$n4=@od)Xo1(+!mpQWr!;!4w+-qbU*L zavUEQ2SZXEQlf%j8FUp!hPY#?IRdl7y)ZLM)bDUS8Mcosh z+Yu}TD8Wc$eVmxj>kuRic?lO9Luo=zf6Ql!tWQ1DQ^qy>ST`Fq!)^jm-5-@4%ksWi z7lF)Yf$F?^=Evt=yf4dz#J$r$9|NklU=sm&5(D*-rp4wN(f28)5 z^Rp`N2|PN}_4yU_wC8yYZ!d?qf8<(62lGq0OW%Gi@cRn?ko&^B?@Ls<0edn_@lbXh z?n$k{>WCB!b@oMk-vL@@M#LZ|c*`Grk3oZB_sZIAytpzQ&n?OV*U~%E8IOabBLqxl zoS9|9$?0i0FeVXCEzHF;D+_Q~l-j&Y%88$TZ{{9^6~P9Vmz}vABQKI z@#rKg9+{Aaqf>2oc10PUSy3#ZrU4I(j+b-;o|@04tZX>HAVphh>eS>|33Frc)HD-b zUYv^uTLZDH)(sDi^uw`9p*S-;TvTN+UfZ02zu%aK*ES~Mwe=?KlhF3=y1qEjHc-@R z0G^r^i2H{1!@+UxI5|r`hp5Bj0%oqLvNUTTHV^T{@`8a_l5dYKqV(1d@yDzTXH3uV z#I)=Hj7jxEtI-X0i2}R00Vs%;EX+>E9Ub*pTb7RYSU)t!`=iKf5bE9C1y=D`92ARH zp$S+iim%G9H?sTmM}eCsnj&M-YA~ZJF$w9hF-SI=5MwYRE+HA|T%0sE5{1#BXiqhu zGbtX;Qs$P9aRR*`glnRhF0+jrE-%F)%5)yFAt+aKX2o`ow@W;-OjmnRW zn#ziRV54VVwTLHI8(|#}J@k-v%y5n!1>Y`_E+7f_%v zT;_eTtO7aJ8=p54Y`j0JHJ<0^<9EX6b!ti-|3DDl{9OFZe6B|Y$g%`7A0K>2g>>mh zInU3|UUBwF{|j_fbW*Lh&vV}iG?=d^^c2vPt2ADVex*EwpB`R4T@OD6It5FXrC+)z z_pe;if4G8Ql5!WNJe~XM_?vG;;r;lfgts5!;ujwSpL~w9vsdC8a{<<-WMOk&ITo98 z(Hszq7-wm74i0b`)E{onP8jIm4!a|hBJuFJVB9AP=>C=n>>nA8107L#e70G_M=Kti znug<3M2*eK#gWMt9PCQQ!(B!k7;nO{nVC2>FB?xUFUIj%Hk_WHgR@J^aCCY$_K&mR z*o<79Sx|y!SJ&X|vI;yjF%|c=CE%XnkvPWXp(KobY;ug&*>QGuGEPp4!Q)es@XC@r zoSK-7T{WKC9NU9qLh$IsUp+YVyQTvl|-CE+~%bi^2%HJxZgUk>l@x3}0syC&Z!DYC}$@4JHmi7NwVz*$ur^ ztAw`ZR2X6tM6m^FGb^|l0HLSNu5fpTr#BZ47zlrNSA+-@{k{COxw3rj7cSs4hX*1* zA^;^mPN){R%}r0jO~b1&BP|A10fUh1KM-YMo@k4Vz_h3YERIRUask|2scW?SWy=>( z)C2{hSwJ^NAUq{28x6@Oxu;kpbG)E24GHndNHz!zO-aa4j760wz#-m&SkovWoHUMZ zMc4f-j~auu-d3 zj@g6B^8^MJt`h0Bk{ESqZ{QLws zLwlwX(yWj9xMDPWx(OP>oQr|dcptnzKR3S{uE?@~|9$}xXJT+%9|wjr6#p0S2)SQ> 
z<$&#O5bBQ!%I~SHF2k%R(Ddm&U7t@uOz%D6_h;S>JZ}HkzyFT2HzbS|=f3i9zx@iI zef=rE`sP#o^P+^e^6!3O(K>8Ow4pOF2=h}cxV5Gg9T7%E2=v%dIM`(n92_0sz*j&X zn;MCulOwUeJsc0V$KdezL_9n;8c)s@B_^uv$V5?mv$JtVRNWy_YR5$l9-WbjqchDo zy)Y9`E-4b&72*ETCOkMk4JYOn;E2HRVKQD=nu?c~iE3P!f>)MG2rWwR;gJE_(pE=0BeApA89VD-@W}XR z>>2KjdzuH~#FSu>wW)Y|UX+BsA)*Lf1>6p}r_m8NRrbT(!(8QhzPNX^53Xwvz@_)W ziXvw$%5%V?dGpbLLd2YC&ecqA~+OY9s)ehwvaELQ^(w# zo#E&>2=>kn=<759_VN$m>gs|(4==lamj^grlOaPS3R9P-l#Q&VsWGvhcvD7ztI)I5Ep2aI@m^McH_Cx(TNj zS@G2J9GqI1k4IuLX8XQt;wZ3m%^ug_9G*v_(P>wfkxB`fl;wI zJ~0hD1X%Ypx#E$remFiM7)QqY;M_b>g42CPsk-8h+CjLXqCd8m_QEw)SK-Dc2i(~n zg6oF{VQZZ~mJ~W-e$D{Q%N>Mi=?)m5;(*a+M~pByqB+qKLyfMeOBjT*$iB!6vO__5 zKcolr(T1vUs#&=?4mCMh$P)Fz@pvYyMVooSg#}X5x|Kk9NT`IhK9|SexoJJJ1fG+< zJqB`#8@`LPpKSZ~!odFh5a!_yn?SNCI1(lP;i#67b*TKK4Kt^q)n-Gr{A1>a`XODE zUrB5T3goyz{_58Ybh9K>E_SrT^;M}jJEaka+e)!8B~)PUk6Ke8+OiTbGQ_0)~HA<8b3i=oHQ_j9-BQ8!lt4%X(q7kalLr4 zp1|NS8?OR@+A2V(W9Cz3sr0B~R9LJxef@0o1cAODnjRzycs+qayAq68mwrp*zAC5E zc|CSUsFjmx`ugj}IoK~KSs|HgC$y*~n5c>pM8jvvuEHyV;%0PtibN-8r)r0g+?b2L^(JjGB_Nrj$R1&^+9`iGVK1ctq{+N z!aCF$jl*3DIJY1}1Kv~fGjVKc3ih_e;>Z+>gs|Co{pv~yRf}*&!1vNnh|(%#SokprT6rz3cR|t7O!os#2Z^n z@zUCC?b`2bEy256itvUUKReHW3k#C*`bq(DXAq8$55pM=c^?~Zz@ElX?5qsL)!74a zPlJmnKnZ0>ddW5rPtFR*xw(-NDu>~Lk)F7#$r;yG_Qlq+tFWo853X(W!8J{R*j(q0 z_0^tOS>}!<#U7ZG?~Z9X9vGkDff1ti8WSDRnB<7+7<&{3+o3SD56WVkVDaq(Yp4r~ zOp&O}wjf`WlQqSRlvDwqs5`#8QQJyr784x}e_ua%NjR%s&coGGoCny$(eA45cXD2S zAu(T5=VRX=30?xdKtJS%1fw!CO5mG}ahbVjvu2=G6k2&i6l!8(&}KBEHX#9NkpikX z19D^IknQJ=LMJveuH4|~Bs}>vc3}_WqTI@dvbuqrk8DNKT@e$|{HCOE70<&Eo zM9O`{36w2dox+?f;4>pz)N5DUC}~a7XhhL5r2SExO$f~lQ`aK|^w#tHQ2AMw*V5mwy0*G!I_+2a_aVfT6LQLVO#AlR?|-*XrAO=T z1o_wdd)4RpGgtZa`L6^$I@9zDuP6BQO7D-rr_zCi@Lk?v*h`a)6pcn{qjrP zH+wD?rDb5LEeCVN$EIf?)jJRogZ&WV6a+(v0mG{*v2u1j>|S1*fm1URwAm7;XQhei z%g3`zb8&u20S=5w!b9Uscydub-nym=AKx(yuWzr#n*y}AuOA_bt{%tc=i#C05{_;e zikGh)iKo`r;^|EdII^f1PpxaflWXenOHWo)CJ2524iu)JvP<*hzDayi8p5C zx?!?FuQSUN!&975pEwZJvHdY5svk;1d!tCg+Jf*wNC_B-0%I7;b5fAc;VC9jMyVF0 
z@ul0Mps4T|)+S0=8yXZ0A8$?}pPP_L`B7ZH04`24Qwg8%_vWUqcO z4R(gr(;dl94oLHGLS0-0hR4NVlGHgyLeX}!NkZIM3BAptO0B4kjF(U}87c7w49U(x zZgeP8o$W9x&J8QGB5`$T3RV|cG1T83#r^xB(cKw^a@}UPL1=Jy!sy^240rNGu3cY5 z_3DiTe?R2O=g7&-LWVUR9U}-mX)FT0pG4Y;uF#(3X`s_VkBwIk3DsDYsdH=~G&Wd$ zgV#6Wo?uYvH2pQ$;FR%tsy=S@WoQZ_SE@^&roV5MuD_1n^!KUC>94J}Ow)f>?(3ne zuV0nnz3J~oU!Hk*%yQijq^jdJ2)Y02KmXia51;++zQC_Upjm^LHy7enj^EoZesw0^-fG3`8%%gktoYsPM*G@!45@1a?kXUha#T*{&FG9gHzH4n%fBlW`Dg;_Xox z-4|tHy-^u85c$FO$POKhtXLmZhNUz_7${xz7}gh>AyZRFXhA1WA#RnnKIACL=Y*8%bWhk>=hDx!w}8 z=VoDecr2!RL}Ej*5i?;lBVxQ~L@sdT=o>pW(50m!>~QFt9%Mgt8Y^ufFs^ zq3HpkjEzy{U zt9(CQx`-dX{t6dGy?!fS?RQ`N1J}%4iyY@*l!(}9Hz%P@+GJUHG%CUqM93JhWpWGF zPOZS21!b^1yTGh<4xE~iCgH6Ck4{X$;{v~DmsJYzB)sh~IJqzd=T>Lpsg;Fz zd|5f3THS=B3nbj#){5t^Yr_jSjm9(AiC;enFWfR4CpV160UCtl-M)~0OdPnRi?~hGI{jjydRaB2N)|3nM zhWKMfdVee`cE|J#C+%5Yhxn*eM>HD;q9(CFs^UfE#oCL)>y4^7XXJ)BAUDbr<>?8i zEKEmQS~4u@7HuH`?cKF$-Alh4MTNyg#lYXoM_UAxS$cUq1|WbVj?tsLjy}ZNZ^vjSNGx0IoAG7}w;c;+DKLEH{Ot+Qkl4;qK_nPC<>A zA8Ogz;qHNee!UUu>xqQ8aF`9@NH<0xEjkccG2s&SCZIaZfYqiF?9Oe%+Q3+}+YdyB z+{4H)Kdg;U!f^Y+h?3)QFIQw+BwRP9BGZzI@uNDWD554L0&{Q3&PPOe>p2zw; z`fDp=!>93_0++trl}ufSuu#Udp4X?TI@Nc`dYQ&8kD0Dq)uGb4rE#nOJ{9m3^!_{G z>sjtf;JXt1eg{2TO*!Ub(RgFWuCIbK6Jb*&C+f^)ts6#4&h!TQiD?{(;I?YKcT*AG z-ky%vHk$Cx)^xnRF;iQ`_0q0Qug zkpjI&f!>h#ekhIVgWO;{`2y`w6zPcE2p8nV2VjUms34oOEs~IAF>8y08YH~slJChW zDTs?pK%id$eB3-VC-j`$9O3Nkf?oaX(El>@LPh1}3~)rdizik_#9?iO0ZSv2FeN%( zU>Ae-Fn=sbiO1f`BK&2+G+bAjifYfRknJa-Xm2|d*bhLRqZ^t$+>z%v5dK$*dK^3e zmV_9jOUP|aPC|ZKIC>-d`nvVk>sgLl zrm17@E3na+u0KcP=iz?OyaW*U71U_ls_(O>(o?P=))Qv`sp}QrmEcF$!;gcKd-|pC z1#Xx92MNQK>(jM--(AADpL5vEulVrNclhw)x0pU<4q^kN5$@}U)Zh?|O0-~1q7`HF z$}ys>1RIu5M14g(#t$>Y?xC>}I61?j4NEyR-h{oQBJogX5>Cv_$K&%Wae7`Qo>)?W zcWxevkM3>9OWPaq_^N80A@pt)(A_l;&xx|TaNAP6wQmRBd*n8}`^YW$@c6Cx;OI5@ z+rf2sZP!dZcf(jbwW$Rsme-0(tjF^kYw_}yN<6lSc`!yK+GA*ZUo?xlYcdRwu(m%c z1U4xz%$K!dpz|GeUeFQ9nB)s+QD=Kf`U{pAJVeDW}EcK1RQvYbo@Cn7Z;BYki z`Jma?9TOq~u(P}X2U_c}I@gF{;jSnS9E4og{umt?j0F<5&Jv}!A}J9=qaqPz-ybo) 
z-pDW;CDaud$HpVeoB=~v9HQi&%$}YC`A|#~Roa>qkGjx63>OtTJ2e>%)T%Aif+=4ksm9pRId+hM;9z3Tv~X~6(B4MJ*%Sl;-^#{&J9FlYR-Sn|&W@cK1P$+z zd3lcq4<7t&3ZAN;tGDnTInIvb@R-K2ed<+~+^RRGsr)nrXqs9MpUy$aydELLfy}%< z-@D6UG-?c=wvw=v=hQwPt8(0O;RG7X_kPo@Ekm(-&Uma_)rn;b8nI(>6YS2+ zPQ~fj9DJOI)3b8%@c3jrI@yBzI}F%2E)|ESRIv@ZQn8@zLXZ@xhth`0U*M_~^`?`1{ct@YddSc=gT&cwR!^ zvzvzF+089Dx2hH==NI7E>~uW2BnNM7Eyp|CCH!4s!MO!-czKnCw-U;-v*N_WNbMy; zdxr(#{^n5JH#7iuH4MQ1(T;NOff9~~;r=#vY|6L8>MZtRI$>jp3)U35W3Fuw=I6L! zW~L)1raE9uvbfn5?G{(G7zU#$aUkmA>`@Wb7scUywBadP!F`bzEebEe7Zn+a7*b$E zj)b%(6W7Q~gV~l2QyO3FlZtrxL4`#{!Pn1UTkhS<#Z7~r2j5N0_gYAJ+gFs{AW><) zeR{#*;D{1O4~%yY!Q#Mp%ngdg#J~uQiHt8%)?xkc-(xxo8mhm8F`Hmzsif@zEWlG~khP1fB*x{WCr8 z{-YavO0CiOVje<)Q@;oZHZ&?Y4#K5z+87~3<2m(iTQ&EV^Hgap%W@o0%-2BDIE76= z*`LOSM&R(81RZB|aM~Jw_yiDpgE^>q%a$!#egaSJv*9rhA<03*901KUE_T3cbFqUL zUU)&fFQyUh{H%m2#}(3ej!u|!Mg?b?aKQ!6xM2Q|KKe-eE(j;iyx<}X{QMk*&CkG9 zRJce2?}5ks*3q*!O8pZ;gb*I zlV=X%<7Xbm=NAs)?~mVyzaG64Z$Ee)-g#gP-nwV077o9D(>T1ceH6|tD#K%Qb45w! z)Us~oYp#2zbidt+;r7dBS~U~Yyh#*300ozw?YGhHzy(+iWb{Lo@_ zLPO#pG$c5nHjc9!`k*klHwwZANJ#4lYq&G=OyMZcwMbZ-21|+&DH6(B1bE332J$68 zTq7@18klcjpf=OOO+crVo{xus&)o$+ex7i3ae(`PzVPXF72@q3kUhW|LkD|fk|@1S zzc6%0L}PS994ZBDm2qL(OpH`fh$WGssELlk^sF4rGFvgy$qS?VyWsk06CO!S#qGgS znBwh+T2XSfp}|P#*9%d7?GV)OD(%f@xmJOgF&<@^DJV>iMs;!mMyF(AetHpBrDb7r zZXOn7WMI4~z-IY;g-LP9G$&!&lrCu+zY5<7yno8xWnIU%M-f7oBsKzNY(Q+VY;1g1 zM^JEP1w+@oygV&WZEdagFrAHyiw&q}ObU8zII4m3nDbL*Bj(2?Rn(8Lv%s#<{sXrxq)q@f^XzYw;Xc zh~a0~pHs%qLuJNvf`IqH7n8Fb)i9MX6(Av{K9~MGr76|NEoXfYY*eOdezH=(teeN2 zO`_HbCE)uAKVJF-|NKG1 z^{>9c<~egwVKKoLABD!!Tuf*v#p&JKF|Ean8&)-9&-LSQ|BlhPX;vZZ4t2$2_wZoc z-5icXovB&}#`$Gscx6*Po>^Umho@xXvvAaUw2Kx zdpo;ueoYn5EXl+9)dhIx`fB|BmLYg`lLfDDNyguAOvU@xoABDkC_KL^45#P#;rKK^ z>}_+F`yGtq(*tm7PB4y57xmZSg*%7(;_l(0xUta_%W~|nreqLSm3m^9%^jTvM~pQN z#Izhwj7=SkVM+Z_pX7jw=>8~==!?oo32TE1y?!W)bV5O#7qSupP?(W~+^iI&SPU?w zn6%~IQ!L^sX-F}pXmdJ)LPOx@d7#SIXrl@d~MFykT5QEIbXrw2_A~zu(Y3|-=jWS?Hq7_|! 
zkr?IdgU-S3SnD5*m41O3@8ydzUVc~|ZNNOQAWU%eMz-Bm2puR2(3FTGb23I&mZG{S z6M5ETq5QIT=ZD~*kk3XlzhVC3p4YP~oGPyNJy{d#pwQ_$n0 z2F=aQ+CGg?Q{gh5ja^MgcfQbe@812p`%%toQE5`a@;t{Wswr(eM-LteE%Kz5qj(d`BBu~k3W2auP=R$ zpDulcYZffRa9a+>6jfklX9pIJD#rfnr(@aFLfpM$DmF}S$E{1IW6PvE*gY~a9t7XR zST)N}O9?jVISu;Qd`)_*kI#;i1iV|Ir)p;i;__*ntog-7w;vq$ikrys@J=Z@grvq$jZxuf{t)IPj__*T3n(0k>MnfR+H#rN)* zfS0y4;v8R0v?d=fZO+H@>uq>;RT5s`Y{vUHq~p!42E4K%9xtwm#j`77#N%*kZYUm` z6^wI>V{vkJ7>=?Rw=)2{1l)JDa7>;jmgZcA6-9$EJADA!V|t?{q8COc^v1+YXN*l7 zgwg4)XqND{B&t910_;#2a21ODd!sP0KMEs6;l=u(GRKI5oD^6j#5ANN!NkL6otok)>7$txw#-vpcfk8jbL|Y1lae3#nT%F z-d<=855P2297cxuqdw3Vg+89h2?>NPE)rIOmDOxOMp7co0$>w#OJT#I7&GdiOz=$%q_-5&4!Bq*;<+u^M2t zSx_k8o0(UD?L%s?)S7{f#U)r>R)VFKWf+&8fd!*SN;8qhA(ZVb8p#Dl(BJhkpLR&Ak0{=YJ|KN zol1|B`&B+JHN|`imNb@8kYvN*JrorcX-h?^MF$8y8b1$@xyS&a$9;y9>fU&)!gcPe z&p;z+SdKv9J<{}3?0T9?Q>u^9(wn+BK8vKOaGT1L3YGinzWI51%trHWV0Kkd1msJow}{KsE^f{Qn2N&aSk8H%pXKu&m=l9_A=MLc07mnh?7f<1@ z&z-^B=TGCUr%&LWr;p-e0paJ*?#E|O-ig0HycsXtHV?1eHWP11i2KHMt$1;ZgtXf# z@WRGCoLy$Y^J{E)cSk8+-;l4pV)BW(Cb_pb0dtsyv7w^u!X>1Q#bYy~aIh-?4|n-s z-#A~~*6fWPRi4;T>V$F*dO;MjQHLoV6dO7rA1zKyQ@Q4UHBz6h{w0 zK~OIg1&Ye^?S%rr{wRuY7qu0FA^B-YPd6f2pl3`M*jcTj_NeeuVKJm2E-VTmegWEi zS{H$y_TE{69$)9^EolKh-U#ybMx=l&HY^BnL4HW~_eFU^BI+VSFebzeOU+@J78i_q zQG$j3KFEoRfHg55*|t>V37qmorR79N!X!YA6=0?~JEJNlPE=tm#)U_sBOnl!a=i{w zit8+?SRv;YM#rMj!w0eb9g$O5W~JM3)AXsL>bi#{Yp|oT)0jd1*FA)VIAnvN@yD+0@ZrN+1EX?dM*xlL zgK6xTU|u!=9xGsQMg|)^pLu=o!3SCz&l4cZI2%I02&povNwu65?}rVW5AGFQsixSG z!E;nVG~NS2Nsv-GFvKJ*2m`7*UYjb3YKrRg`0?Y~Gc&ceAJv>5B!mRP#rvbuWkaX& z*_&D+S*_&4XLNk_$j`xjHh9*>KNfuEsE&D_rmG$%cQgso;yf>;omZ7QVr~K>$ zF#j$1opON%meUVeQNUB)6MD-39r*t28~_|ax!^Jb+J5-hk(B zUx>3iCgSXd7F^ieh-cPU;mvEC@Xiel+5qU+w-)0s+skoyVl*Ba8-=5jjku%17ds?e zy}8y^LfHW9A1C2>t2b_IbirM%F4#HD6tjrUIm)aXk3f!=;zzZ|8Jut>B ziqFyulM7tXmNF1SlLw(9&R$eqKNN@eM_Gge@&o!KJHP=Yi9x8&HY3L-3eOndJ)br! z14$_2;b;Hj?+{e>H^B^wi&WEewCV^j0co>F? 
z${HFPh~>sG++LD`8_R5%6d#OMV;stj3CI?eN8sgJOvn|bR%WxILO?XEqyVKBQJlen z$PNv~UghsO2k*r~ztbS~$_<>+agJIge}fnhG6k8c3MBo&KQiP zWdWE|9B5M}YM$P9BtK8L4dbJ&LgsR9^_ z_7*b&g{sLeIHLR`1|-EFwh@CLB5EM z2tq=j4+R0&U@24@3*I432^!}y{q%e z{|J76Z3eFUB?2VaBKmY+5iHD`!u@c!@0P za!OF2mxGz(E3j^1D<(Eoz;0is0r!rI#-XWZoLQ8MBQtDxd{HqTn_GydSJmU}N>O>T zeQLcx>84Tm_`zv-=l*H<`>`$f`nlb>`0~T}?AZtL$%RAsL_*w;Up$UaUpaxV-a3VE z|8f%FzI6oOzi|*hy#64*e)cZ>O%&q$2iM}w+o#~^bxnA5b_ov8EWpv(xi~Q|180}p zMDb+__zLjov`E}J)EhTed*J#Z-lFazv3pb$Zmjpgjg7(BROX4*d4sW`*ach4J+Q9W z6$>*5Vtk?`?0=!&p$*6wFiF8DEoHq(m zqmh?kgf-0wOInIH&5Y0TQp7Ft7uh7xiiwOwpsz1{+&tmwCa`mNhZD!-dGfW5Uft01 z_J+SNS3mYfe6SzVf&(zrl#B_Y*d_}=$47=^X-pV48>6r;Edp~aA)-LTQJWHl>a;kd z27ANg=>xNGxRg(T)x=H+TekwtNEJ^*ab^ZeQ&Q2Knu*zEHQ3nHf-N<5*icx2RXLdw z+8Zz@KOYlqxpHkAN-P#cxw|0EX)w~<+z=lcjm+!ID$ zBKW5?Y>w9xE)WfuF(K(Z`h7#{*~n@9@zV6?*dS=CVJTDl>bx@MQ^rt~rqY;K)lG%h z)7VgGJ?rdg`uk7-psBj`=V|(T>UoaFMy~SoJpb?R<^SKt&!BhxzgbV`J@<6pzutNO z|B3(aLXWP1$Nk?y??;yZ4!`^)Vg1kYM@FExW#K9`7ZhRYh$?JbItgPNtI$|ff{M(1 zv{sd3*_=i!o?eTFybRdw=}5$VT+Mh&ia^GK16|2DJwG33msa51iYh$2t{yLJZN^L2 z4#itLN8y9Leth@L9r*D0HvIkJ75G5F`O=OioLX9nV{`NI$h34k*kzFWNWhu- zDL6IPh({*J;g(u~UX?d)s`0~}Euq-g8G{`|oUx(A8H+ReV}Z4gsJlT}E8tsE;EXxx z1JGp@;H3{nm(>wN<9nk%wwJ(efcQW(#P>&eIA=K6AuHf2Sp9n=J!~+FEfFZqG$1SO zvdR9V&xY$?(`Exv;o3=ilwrrHRrvT4OK<6#)E8!)d=SBJXNf;S`jKCn2 zM~afm%EXuqE1Hwz(HIwwY9CiL_`6|LVkjD7g3yv`M3bm4Yk)6uB~;CJ^hEk#FT^@| zAv3{%Ap*JPvSJJ=DnM>lIsO6Pnf{t8o!3^TuHEzA)HRr{ z&#xM?GIigoysAs3v(YJIS(UHnXV4oPF7xT@{D0f{nXlyPzLfLcdOAN_Pgnmgf8;%( zcO~%Yy(jSW&h!5Qo|OObCw%?Al#$;i!`olJ|0m{6T8Qy29hlWM5}g9RvYZSQW#*tH zrwHRlwP4MP(HK8EA2})Uusbv*6{qHBin`0hk?AHJoS2NWOG@zMqEZ~5mLWjO!MQaR zcyW6JUc0Fk@7_5YAMKxoH}}lNhex;K$7go~uO7zFFCW9#&pe9H&OL(9&x^WyNx&y* z7he>W_u@YM`qKTl^z?1`@bE^wzi$EFyJsw3yP*LuZmq)$o9glS;$rL@m#Fo^?r96r zhM*i67l)mVLAasPM|(l{&2>JaN`pn|xnq5aBUTqWVpD|&Hp#ig*$$YLY=@3G@gzI# zIo`aNK zO`?L#64IJ1soEnI!dEsQ05sV^Kx6 zsLCYFkK>K#yypLx#(UR0@2RKj!ADoX>*@SX{+;&(p9ZhX z|BiasgYJ)@cj>2}aY?@VFTVc;-+nLuzVda>6zEl#RAchE5dya+6iU0uwq>ETpd3>s 
zbYkVoPK+3yg8~r>c26$J!pobB@cOnQTv(eUz{|yjRh2lue2A#MN<6)?5>KuulTfq@ zuV3GWzuek|H*cMY*Y21qVeTS)aqMbbd~zqgd-5K9_4salef9x-cm6^AaN!~R{Ne%p zc;P;Q-JO!>CVckjM!b99BD{U~6uf^=JKo*dgumX}iof1G0xxW+*3{;G?U8t(BN}%P z4HVe9W1FZq`i7i=*jh9gH`cmhTe$<)77h?~=Y;E>7 zMwwGln;=k)4ntvl1j^ZIkra=y#l@IZF$B{ps<5=N86Bc1IaaR8KM;Eg>Tru82a^KA z&>9tmasg&xhA6vS0i8(nf}AwuXQd%8J4?1XD6*!bLhiXW+a`79VMc8}W|U=NR(cv1 z<>q33Sqa)LCX@$yqd3?HDS;j+up|gD6H%U@39HSFOi_@Ta<6UmL$x0gO>vJ~t%>tL z;=jlbSv%Aw9o%tV0;D8(--mL3 zray9iA9`2ccTeZ%{WHH3{OJ5{d%A+20-y4W65f9E^N;xU*I)4U#Y_0|tM4#v@@xTJ zK4wiHht9Fhs4Oc&rY#f2`Q?~7Z5mdunu;+Um1wA{g59YF7X0P9GW`A4A$WOX4xSRl z_tc_7JR{IMCn4^WON;Qtayec<1g~8?08!51g4!zahK$Gt6)0>B7dTke5ft>L)8BT5Tzuc;P= zSMGxAnu2k4Lok+?cwu_RV2p^j!?5^X=tyHm{?VV znRSg=KCA_cN(-^LNFbGO#kh!ItP6_9wxC$7h>6EcvkAjeB>YT>MNVomiba_fi9##P zv>`V)2f2Cq$QLP}FClEPIT;m}B#aqSfax`*SX5kug_&8HWJuK3*J}%LMMta;%0k_c z7U+!Z#4zOBjL6EgA}c!^nd#YR8_}V$jR;`#{Y~3k2AarJ3muT zSN|4QN~0_I(f@ ziG2AzR}@!Z>B0rL=ISMwKBW~c_4TlOcv1r1zNTDT?c}-D={P+%4Nok{!}F_a@Wi4b zoS0|B%Udf&kqyI}Hw?ua*S6u=4MTBmeFI*;t_^>^Yck%tVX*6EFV?5s8Jq54t z8jn}*9D~2?nTU7yOu}F8?!p_ljl-)qjK*u%kHcSX>BRYUe5rEP>xN6CNBJr@e@1_lPL%ncnV^G1%P}gKLL)U}NzhEYInS4Ha%!UFIRc z;3~)-XqS+-C9xl>Bke@t^+i=wAJoPTK!ZU57;BI6sD8)~w?}HI6LJh;C`wB}fz^Oa zfeF{9GYeo+Mcr{AGJzM9U_gL`seCtW_Y1n+;V$6w63BT9^n4}6?FODF!UTkdkRas7 z#v&^q2sH^wXh=yxHOJdUNm!c{gE|RQn^H_@%ge>sl44A&9)cN#Sy+{ujGJpRFh4C6 zi&7)8A|VQk!a^`BJQ$tfp%@ww1Dl&CvcdzAmlO?iWEd<-iCSk%T2>ZvGP6-E;44ed zM48o!(zH~xhyq+%R*M}~4OneW#k@!#+*fPGV_j9)T9$@fm)=N?@Ia>e-pWe9E{_pej=!(&wS6W7g06qNy#7jZPWC%JF`xjJj7ItMXU6 zfBn7xpJM91uH-$zN9SkLJ3r@t;{0xuUkQ8)epiCvZ)rb%FTcO<@bj-<;o_xF@Wtov zWA2Pe=xiT~<%_#8YibJ`tMXA*l#8KNL$GztGTd|f2F#t-in_8wf!@R>7|)Cy|@TR=H%hnygZy(T!8Z%Ih>^) zk4w2HHJtW{I?f)|@eU}B?1#dzKFA2}2Sbn( z3T*~act-8r5H<<@tO5$Iir*227sf}S)?!4RD6V>=0hOsHl$w*!nvsgB znMT}Rla6EUC3tvLDRx$8;HJ_HEQ<@r>;NB3@$tjdCf^fdG47hP2PK>L=*eGw9 zefpy`$A}z`3(UxdMc~;sVUh;D?xrpclz`Ko@pUg7uMK9=dT@Ua1RBT9sW+4nWL5z% zS3%*pID(0gU_(-KM>!anicd|o;^1HA;ecYMaf}-q3%6>86sEIILX2gZMqqNwadLdF 
z#(lnwo72g-uZ(r^UO48CQ{*_HnCIzgdKvGJ1A;krPMzZvF*bG@>tLF?XO5%e)H@D@ z=FAG-3za7qH&83QaM1GV)vL99YBiPr(@YP#p8oHkM^|9`cYY=CT?u->K~8tvopupF zT>4V{GyM9~SNP}8-(b<4saQ07A|`d#qp~Pje$&!kiqbKnz8u$VT#UWDHelwYT9oIR zVaGR{JvPUHr&p%p+;S_M_p`GYVkV*sY457Ja#ZG;^f*gtBTZp06l7 z7k6g_2Kgc~*iReyn-Ls@yx?FIhlOcy8i0sy>1!aoH+_ljH<;}LlhpWEX9HJEbL0QVYWv&CWpi!*S;Ty%QfbQBx1Nz z0HOywA}%%(8M&Fr%*;S}b{_K0R#C6n$hBr6Q^NETn*|lI0ho~zg5xc@cyeeC_El$M zT%Z%$B)o6UwV*651=;f1v$AqAYE0*Ek*xfj<-ejd{|>8`EXK;kb1kQh+_fBY zW>jI$^hVenpKZm-`6fKOHbcV96g)gBQClhbzV<{Mn2>_g3$t)`Wv&+DJ|k-G?1~|H zW_2xI-drzUgBLbb;(~;#XBKDTaZ!TLt}nnF*H__H37KENt`aY9E5-Ah%ccGb36=Bk z&_pZtjkn;;qGBmiDDca~fr%2rw#Nwc!f^Yr2;ABnfn9t*?x=X&C<<>!wHH?9_1D7L zmAUp*&o>< zeUTN?-CLWRnt*gsTULQx>Tkf~doI$n>1NTf30hd|;pqc6cLL7~d|@{`G5Ac6>*e|S z`@k#E8~(vQ0zF@tLjqA88HwuXXbGz$Q4`JmNMw6?Xs>c?wOT~^8Bm&TLQa7dxmgwz zgal)__;sb3cyh`J91-B{t1rPF#aXz&AP?`fwBu}HEw)6OF>0_ECdR~KyoAoXGt00z zFc#UOIAi77+3Be$EzC!u)rRW)Lgb{f{%qLTikmprL z1|dv^MPnLgCve?4!cWZ#p|Omb8b_7KSs;WLjg3+nKO2p!xp4hFWh}2C&HMx%RV(xB zr_24{WP0$af8#&$|JR^*=|_RyFZk*EpYhA3U-9GjKVrtzY4W=rE#898G4)u!XcD$< zn2&WUrs0Mii*U#7OXS~nE_Ur&3%f(pjd*l+BA(rlEvhXQ_l=FlT_d6eLdke|aypLA z&cdN-795#n!vkGL?CY}O;M8nUX%Y@^t-(th1#p||aAbO#Hml>Y+39#n!qqo-G~p#K zN42s@lwq!@$!r{%k%>oVWaGGm$B)g;#=$8j0bUYzwgd_6!fD30idyl{Kig8L&U+6`s4SY(LG<2!2^(kkFt1QbRIYZH@<8t}Y*{59aY zyL)OXk7M)b9GBo6T)#*b|y~cmE%-S zCH7Y~V_viYGh>o4Pk_8MECP$9Op&*%)4Fk(Ys!{M<4 z8`qN4hnsXY9gJ|LN>gyM742Ahw zmGS-vMWt%l7zuHP)ZC|0Niuw<5qiwa?}}yrZ!%ph@E5h&6 zPq_5mFSz&}Uz8-@{ZIeIgelWdR#kzSbEjhYDuLaGC0Mm;9%jxOizUmZVAZ;5Sh8{= zmaUlvySv*XaqrkLJT9T>iAAEqCK_;XTDrh51qUW&;L!9;?3-Z1-th+9(-DUUCZr1d zQgC8^mMFdwJh`+0Pc13JLtSP(Ea&!(HR7n0d3jSKPHM`|j6G6L18GMh9-3^!WApPR z#7z~bCgYK5CfpZse65uV8 zkalWHZ%ns}%CkD6EvY}6;`(X5w0V94z0lsMiE}`CRDTqQ_eEYI-_;GH1$ylv5oq@gLYr40rkheRDJKgRwj@**XP~k$7d2^F7#d>4m>3%> zqZ739IRYhN|*=VD4b--ad)^Pj{UAxCwk_ur3yQ~PWL4DYzo34h3HEjfmzG|m&HBBQ$N zxgH!Byrl9{z+eL+hzK_R_|wwTe(M!x-3m}_2rSEE)ltE+Jmvxcv^U>;Q-h*<7YA1; zQFRbfszK6NR&DutsPqa83pLfI(p6p#Q&G!S(O91XDvirw)z#H$^-uv*_5DvV1wG~b 
zUi7ZM(|_j*dh|a6AN}|6lYBq%cL03{=6$6qjZ@)SfF5>e4mjVW`cV##U=ch@b) z!e#TYVcQyPx@Ik=%$DQ+6Jj>w)L(?ImHY zz%D2Vp1wW^l8`ncBmkD^aFis)qbN8KWu9Id@Mc6OVrhZ}3u29!V@$e!hpg=2`_W>+eoae zt-+4+0&K}n!`gye+}Jt-n`@ddNq{vZ-H68OJk&_IOz7oh+K`=@jm)fEYk8 zaj{ROez-U&A;*pdHUxr>@FD29OcFaSxHJ;a5k7(;3O#9Y9DfM!GDHK;ySI7?MZ<7cE=qf*lw z@0lQLYHHH=Ctgi-76gac0~t@Y>myh`Txr*fZXU2PR6mJHv$Y>q~HAv4Bn#+L1Zw zq6*VQ6(-`y%q;BdG~&Td6Hd&@!RZD0n!>w(TmtTDi^7f?U)<0XikpT;;>PB1+{<%g z<8a4_AlyGb7I%({(9W+Z^u@e1S4=esK!2o1d!a~xmnR@F8IuHZ7MSRl zfoHO$AYOi?At4ghdi(qq$|@)HI84PyRGy!|9|D7d5fDJ&1tTFM6lqZr$d8Uhc~~f# zLqjnpFbqqREVwSO44X3xFgrN~6U`~;uvsvxsRk1pYcZ*z4wH(CF{Qi`Z9{4>qNy2; zwbdxEC`MsLf!td%5+z)Y766x~B%!mm98-riV0Fg`+&r=g*9|Gb3Q=)ubBnMvH3##u z@-b9CUs<{VmFX!cN=-o_$L|U3ENK}?vt}SKI~#eT>}t}?m{wkl&BKS``jI0U z7{Ug_P6)1GLRhf}n2m+!Rk+FVdQ^KH%SX5{^i)cYpyMkenSbuwxtg*hTvhl>BRr`j zx#9?~N8=blsxK-^=5J_d_$`d3Vr3fZWN$C;m!FUKrHty2%e||{&1aTe357srCk7$S z&&5VfRmf+6G<_Za6HGx*@9I0zdr#2ybOpZu1b+XNDC9f7Ch}JS->*Ml=$Mf(=Gc%Y zA#UBs;TS(-8WyZvjty6D$Cm4E#JcS_VBNJhWBHEjVR!3rLN6Hyre|T-*koMW7=r7X z!UcZu*xhLrx8a_#7VH|GjN3;>;_zG>jxWl>{SzeI9TSH=or$=2bSxg8B4O;9Xi;wj zpji}PiYT}k9G`E+{>cX1+8Qc#8FBA;i_{$}A#D=wX^X{mbsnPr!m&>j-<_kQabrsi zR*1UmjI+ZeqdjJ4I%96G8)jvBV07X@j5BzkJ;5CA4MvJIBt0=q6;G@z@FeD*FVCCb(ch>&v z`5o8C^Wfs3o*o+T*m)5kp>0TDFd~D)5E~o{b3~*-H3|jM5vYre#)!x$OpA`kGJ^#> z3d^v)qyqEPY?zmxiTN4X*dUE-X>$XX46DPMq1BjMU5v_9Bl1&C+B3PT;sVrGR-wMS z5tStk7*be`vW#>z7v*4>gvDKzC0JZtitF22u&$yK8%u^@Lq;)HW)-0+J|0QVj)>|z z5OIUOkQ5{#x2*sr`ErfId<-wmLv6YVWdc1;ZEMTQkWhCBuBooWe4_;oz5$qDTPG5S zT@~Gc)3g%*BTif}y&D_d#vil-pbBMassYf{d3DSmx!TvKu`XpQT^WDuG*y>sNHnJD z>r{DYDi4h+jHbVq{@zqxp3`4nzu)tkG%e)$QP1bl5>9Z0m9kuGhgqOl&8jdhqXeF|1@T#t?0w`0xL z9ayntJ38f$lHKnc0D87xQj-vkjv3F7uZXOKq&i4A>wrlXdvuxY zF)7U%Es>(`!Uv*0bRcTN?NJ}+gsK=9Wcl@l&8IIi0tO(}yB9Kp*i&ndoUlR22y;eu ze4vE2$*@_BNHQkFl93_6OGR>uSyOmM*+#`g!&`vI1q7(-l)BRc&(BZ7+CYB<1_mI= zKLD{I;YbJwMtVdPO5+nzogksB(S+eKv6v~UZbm{PmS<*RqbP+X`2|>2T#B_>1-QDr z8mp=*F+DdO%gb_QzX;=Osc5&^&}>LXd0YZ2Bvh>(QjhY|Mhq!$L{&+tguj_+Dk{Lp 
z(n5?YE5MTGI$S+sBz80o!}f|=ska8r0<4H#)wQ;z9xEVXgSa8=kQE*)b?44}H zE>V9sHV0sPwVS9rKOCAZ5S%GcoER;6Vq|N?4MW4QrOF4_HU#0e5z(S7L*&>`dy&tE zia>0t4i`vxV}^vb;|%@Kk=PgQiTyA#z8{80^+sJpZ`4E&KuMT@&A%Tqefz=cB_V~A z9n9`_C=m6R6X5`pUq9ps@Ji(`fR(UINr5#h6G;L&syl7DD6<(c64C_*2L7h-^w8^m zjwcGwSAgeF-~}Q$I0#XZ5r_;5f;ltml%Oj+4{fF-OwP1mxrCrg zD@w4ct`=)6t8sNfIkxAQVUfUao~XA4DHg0qv0`nq4cFR=u&bmA)8j43j*UWLaUL4# z<*$~6zaXpb45AvjdrgngC6@W{;h(z?GB=yfj_ zsu{T=&L+=6B?=S#u`1~B2d)Q~KK)9j&#N!1woKE5M0Ii~Q}ycW=RTbchsOKUoBo^a4ID5Fl-Pa3I39(3%-+Mz#Gp5d-fhDU}V#@3}Xl`pqwzM6) zdncse_KpPHI6M+JkBG#Ltr56Q)Y~m1VsYmf6Lxl(aow;4+%VjP+eT*y=u)w^Cj%28Ebg|fn8RFoH?QouJ{!qc|& zG>k6JLPwebbE4#rmn9c_LJ7*wrbS;5$lcTX!;+x zGJRg=Q=rrvdu3JKe2D!=rs~z#snXbJ^yxg-o2p;sQ+ZVa#0X-|dy0-9ZB{P@!^_(9;;*3qu5eB|Wr z0^cA%1aq2Zh84v_Dllf^1ax#wKyGO{3~A{Cz5gG3{{i0EeV_T_V_9W-Veh>+07NAT z5^Nwrkf;O!u=if4aAr8Y_b!ck@70oQS+?UMS(a?cj!T@fX`7qf2*({cINXwr{SB$KXW6*FUyCOvj09s5uJ~w`TrsrDc~p2N zD!Vc~F_MFa`ZDksmt6}a5t<3O*^`Zno+MoMrQ(b;0V}nevB$Cz3;Ok#)ybH6n=q~4 zipk1t7?afRz_=+6!<>dIwj!ucKttBuuxD>TN5M8UrQfBL?X{{ZFJl85Ds!o3^r)f2 ztF~0bWHO`Dpo6)}sPwUwhauc)f27uXK;%W@9@} zlf%z$g!Y8;eyeKl6hv^KzN)}oUN)Ss7*$9mfE?6?~Z z8F#}`mIBRPpMiPfHdNfR1Lqg_^J?Wy8@X)xKW}2Wfq%{$^B2rt(w@ltYD6oT=nGup zpSzi*DTrFx5WSwXxBDdc$&C>mYE7Pm9Pf4@s{2skqsHy~k%&H?`hGOj19Q1t(%aTo{GHE61cm%&^I;?eN8Qjbq1_^eol`^3A_gf z^6~IcAs%5qJFVlgQk>~XRGN!i^yT1^KObk^IXKvqtW+xOZ%oAjdpZuaWa3C`29;k1 z&biZZxib~lx{~q2Y&l+>(-4{jqkRj`w#VV&!9qMVn2!hgX>YsIxqc$91+#F8a6QR@g_fN4a2IVx zYvyMOz0WAmt*y)3ibh=)6`oeKpCLD(06~C42>en~Zs?D1 zOu1O)0+vvxYDB-53s_(v7p-g~HKS^;3H5@G4p~P+ma8|2O#iuK$o^$LX`&+gR$--v z3PxE+&LjFg6@2QkZjb2mQ*X%V?IqNGM2A3JJ;!K3N=TC#wN^KJK!pFVhkX9%lG#w6*{6Gb&{MnFM;>{NdQNZ7Vhqc4BekQB~NaMZ#5)NR9pWedhN>oHQk0ewa5F;Kn@-6dP$&bbH9%)8;r-2`XOMyk5I zVWF+9NhRpyX=YiUQM#Mer!uoQqPaW`ZB{BgQzi6919Vh+Qeh#ZAy?Bjnkp+$R$8n8 zOu!axXQP1^U5=-!JW+=E1qCQ5E`qM45RTG3D$fFp8ff`Dy*NbZ&C!1D_WH56yI%p` zc%UCMJ!1&k+YxN6#~{_&J}SE-?oK?|*NZ2;ZhXF_1y33(aiz|PlLXkV##-$6dT5iK znBe{<8tO4*HARM$udaa0YDJ*U4Zp_=Pr!%jRvT&@b(HY6a5R`P+-$>fj~kDMdvU3; 
z8Ky~)>wnxRxLuMiD)ZJK%+4ajZ&Li*Nhgk9iBQbmPU5*u9hoPF3*492Y`RZ#4xxi&!E^PJUS3})arm{R5 zFtWacx=u7u)wtb`t0DVQU(39DE)wdz>|Z^X=yOufRYKi|EK~Ory}cT;o|@9^MQzvk z?=7Og2TAq&ica~iqf@>|B?-Ns{QXbycR&3Z{+{y_Gc!oyy}DuB7A0d-lCdd{&W+&9 zDJzAotqrw~HW&!Lb@vSwK=OX7z59lWa4D39t37$l0^H~?!TCTgE(VKm)?a|5&MX{j z&%u&47Sookn5~Y%Qe7&R8?vy{l!-(3G@S9J<9cr{9-An~74GYpBMA$}4Op&?p(0Gd zIbRlzw8i6)Lo$KNgNm^wi2&t3O_O7j$>wFX)$J_8{r{cR1J zGUVmvC@L;m;YF)Fv9%vpTx2OJWq$<(UP(zYO3G+&HAUz&mSNbe!H7|dW1T)6_V}>U z;l+ti50+_lN87!a2=$}a8%BSy7h^+x=j~p>FpLwvPV8xH#58T|erqKawHXuULU=S9I80WwHqhEqAxdqPt-Tr51fR`b z1E1Z7aFs~`-xaqTmztXJfTtZ7+8eRgZox=nCAwQI@Y=0ttFK2#y%SDrJDLs6IJkI_ zSET$eH>x}e@+Z`Jf5)V);48U}<@S!g`Bk_{hkkkZwm?Tx8XZfB+L4|U1_SK{my7zA zHg_uYq8l5_4X|g=9;FSPT=;+XXMd(N0+u#@($?vS~*wQrzo!Ic6#y*Up3UvEKf3%QD4+` z)KOpLeL1d-wI`oJ-jkI6#u75vg0v}=_K31vo>3ybI^=Uo$PF#$^zSKD=tbv02R_M2 z1;)?t51b10#;0eHkeY^#+qYrkj_ueO6NArg-h^$*Nl2%oGg34((R~>k8?f$7khYX? zx;9XZtNlg1Nm6mRJ&98;4;JD~ppf&qIN>e8DPJ*;xeBnqH46*2WX#)Au&XJXuM4o; zn2DXW3E0;hkJG+PTnguNnuXnUF_}O6zX|Swt#IaUK}+^#*mJg^A!8$IlJ9~oeLWho0qR-Eg`tl3@Jk{0; z>_p*-D$LK%R~}rgEiXl7aW3jgvJp0wVbW&CSZx(fb@$+$zX#W->`ps9xZF2@{e3+c z3HUKcfQ|GIqQ5tUFqPSGPcOpG4tS_Qms?u#VrUT0y1EoVU0^=a*^Os{19-fv8^;WK zT;^x`LbnsYcW?|Z_Bzl*mFY295TcE1r#)^VxSHy1ur<`8wXGSgREnPZ8Vu6b9`U*H z5as=Krvv-zP1xCF!5*g_-8^=8T^)Q@8+^3EtyMO(+nTU2Gfh1wx=xgxw7dH|1%PrZ zswp8Cry6o`in|6R+ORoz#hK!IRaF$RX zHZB7tNN=U6-%2&0lQkJr>a5u%>UjZ8hDZ_@S^i}=>7B;RQ%lbCxl<`&@i_0{$0Oq8@47S zQsJdxOCqNfExDQs4ej$*ExtpIGv7Ki2LB%BTu<3Jl# zpd*VQEx_)EH1wCOpBz6+z~IE+^(BScHvTbTe)(N=hhw=0zt zt%P1<)<)Pe)}uP*E|}uw@r`#Ae(TUAJu;{aYxCE`M!VKxsep~D&TKF#gMk{Vj07G( zFw;nDTUuI*^wbQ3E#;QVQ`MbJrM6apCmCIiCn~S7uu!S(HR&qgAcQ>n5{yaT)s}k9 zH8tRD?;tIE5SOUxF4A_M4f=6nydU#@0gMF$80rdOq`M0vJwXg~2QkzgLcOsPUZV*k zWdzHrcH3wzt6BT!$IAlRscxxW;q16y~-z3#KYcFsIkTo1TK&9a|AH z=+NI#2anl+*}-li?kB6z69p$XBLOFjllze=bsM*bT&$vUqG2NdklqmLP*Uns_a)@D zI?9R6%R|8>WIt+ORhbt6NzV+KmyjDkg^wBnPdT>Q09f{?KHOTiyWK`gzhD8Qz***{ zSB5$?l{_a`wif`(vDAp3S7ED$C_eRb%eHDy5(%}BuAGOg7d^6z3`!8)e>eIV|K38r 
zA2sEBxIJ%&-al|E;1lQ#jE-R&VRz@IEx2pTHr%m!3pT}5z~tpaTU(2SygY2r%0w37 zw{FRvjQuVG%~y!Mw660FDVV8C!U$E@n1vv!O5ije)3vEsY|O?2t?Wc~Dt0+aFjJq6 zsk(HG5Lh#H$y}G~bACX(g+PnJh+#X14O=in1-OSmoUtbHy{#Ck+=1STE$AYwd_RBfNy2BX1L$vp1kVL%NmSp_Jd%X5Ou&HT4cylV}a%*1?)d@Re;x zy*3GrmLfD(8)4MSfC3hnm622A$z~PMo61pIUX0A_JR~M(AU5vfHhC)aKB*i}Rd@ot z;^JZjn%dGL)KwH>knq@TtHxnU=bi$BzuiM_m!$GX{_h2W% zGwtiZWS|2>olcDOby973(Oh2(Z(S3Db*&ipb#t3>Jhtl~9vYj%C7%m>jYT+6nU2?c z8}NfUFJALov1Ba95FuWjumjbJaWHP@G&Tt#Z575HZpE@TH`T*lPy0$0+ErVFy*{t< zAnzypdvVHGfHS%@+-#`8aYGJv>iEOdY=J*L5pyj~y!*t~Mq25qsUYW9UVI!X1XN3@ z+CF(L01$g8uVv5ySywJ>ftd;bnHM-p$a?}k8LLi0_8~x2k0BvSNkAt1RF5OaR7X#i zWopf^08+ZHNtZEIO-a9CSw~c#+ILqilaysgj~-Q)iTxD&E1}+uGX473zpiwPlXWEJ z_)>-_@5#LSStP_Ns}@$)mGe?Ri+l$4bIP`B5efM&qB9zP%HM$N{FKw5@jbD)T|Irc zo3PukJqDX%6R&--UX@ ztuFme!j8a8y90GxuQ6`}8gkd8F7vZ!NL!~o(ae~#4poKQVYil|zE%&T4EJuThS^w! zD%zPUvpmv~pwbp0zc3HU>Dh>hmm#BW8#(2+_IS}17gCNVHN9eOMd=yJ%TQNZh_F1w zCp{Gd+-9ZPiqq|GELv(X&M#QHmrdAom~-2(zt@A~{bAgko5s%8di0x0&{I{4P=g+W zJuZxl_R;qCqlb2Oq;CRa;So#+da#%F>r{UimA4JMY720>sRWO z;{Q3jh!3Z{xNIxJbYVOui?eaQhqh13=%N63i*w2lQxWB-{Nk=cL_#Vx$-KHh>EN%9z%1_xc+?AC))xyZ%hl#BD#XN2%06UXg`R9PIXS5e zFrbF?+ZE8t_UdRbq7tLQBl{8SD=?M8PNPG90Hr)qJzv>gRhBX@e414G!jEf$+%60kWg8Cw!k zu{n|Va&Z|-ELF%O7jpDEq*at+U4Koun%SecBVE(u-QM09Bq zIiH9SGpLD2pgaz3w4C137d{ZW|og8)276FcNZ2nfIVB^-d+NqUxmoEP-dHg|!fDwbCRd{~k2t5s2A@UM4}9 zc_$1R_n@jQ1&uZ%s;w4i^i+D5T9`~0LeGdAvj(QhVw4o+AT2!&F^TEe7MHp@FzCm? 
zyIti8@Wj%J%8Le`6kc1jIyCaj?9u9#0YIm7vT&-V8kZUxaf;Bp(9ncS&KArV$}rtz z!F+oohO6{A)#<{`PzPR{?!_xpVXQP85Vn?MH0;LI*cisgW-!z{fxh4nMh1t`EikF)_xLi?#$@Bs$2sSFk>qo0U%6d1hr+G`42R12vN8X)sG zZro7HxnwLm`JSbWOr@Fd2oI7_;nU^w4*R1bhNLc^>`X=mhWUWNeN} z!sgf%+_g0hcWsNumShT;at$&n%285Ng=`9yb)JeicnLO-CJ7y732^elZ!b;Yd?GqZ z65!_4SCPz2f~zDJ9Yx#GUPxf&ZH2ou4xaMu=&B^dN;kq!i|Vh4M~GI;Qyhm*&UaR9 zM?kZcP>e+%m10*#0)e>|Em@nC$_Zz|cG%LW>X?>9!Y!Q^mLRLkS`RB#p8(H9D{D!O zSXo=n9caiQ90@@S!B|(8jHc>x+7KPI1_LTBR%og63`P?zt^pQ(1?^})ao z4<2_oagnz7NOJ>@cC=y2pu;l3_r^po{^H(K_%BzF;h7#M_Bg9?XtD=0!~Il!{TK=j zVQg>$A*zyKPXK*gE)4iu(d(|qkh2#1T&*}q^|w3NiUC~#F6(H&DjV=Xr44hLRG*a^ z^t3jhv)P7#!wyGXEgEZTXmcCkXt2Xw--<3u{>h>YyvBX}cxDh^3AN)~b0y}giZE-{ zVN`3&j+qH?aXTr9Y3uZHYjhN_q0uH=0=4`~;uf=)hPbeW!% z+sOzn67Rh8j#8l_`T_dhCu3o!<#CoK z5`>9pE#$hno6(rP0S;Qqj?!4P6>a5|_LA09=GzEG58>HS91ADm=q%if_M%OEEz`|t z$&oSdXkjy`@Uk{4qodUFwZKl`R+V-q0ry#DzyMpe!0#@??+#QGdNnduUiKY4*A1xQ z`_=h7&}b+`L$x$O)+s8_P-TVAC2fiMBuBH`lxA^gTQ-?D6=NEg1 z@p5^q!#nYMKW$}SJ5JB^V4#DR)K&u@;pA>?Mpw8Co!y=2 z^tQv}w8P_Rh2PtTu*-=-R~sffnz6IVhQ~X@c%sdZCpIJ2)lj#MR*JHp+ar3JIv>4_gjj3!J=yO+ zwchO!Z7Wr<$+nW8v~BeA+o#dT{G`t&%c8%p+rN*0>X>Ws332PY=J&|UY-)M|$*H;6 z7Mn)3mx?WM=~z$rCFT|rdIsc?4JoB1gkCw;RcCCbV%m(VjIFR_Y=bd%Gb)odLYuH2 zhUAUPdv!V6(3CG=j6ox9ZGF}T=6V81rW-ijs6eYO;~vG9)@9sHm~DZLQ^`gF7*(5; z(aCsswv5%Xx>Xx%CFlflwORKNx_7}uOIw{T;Jb&%-N@r^q`IT6&H60txp%`+vJEv< zh>iL@SZQ6Q-JRKNfr-{qM}=qPFR+Z4Cu5ow7nUG1JsXMf$%u=IU#+c;w8{IV$_v%j zs^xg1@C0}osyR`4HB?QlwRLdu*JH3ki+u(?&d~0@+2+N|Rn2(OV#mwXE%~c0>u8H=OzertXGi+u( zob7gmI=uv+2QH5b?QR!bZVv)37lzy0u)y3;u~c_G5@`zx}hV#lH|D@A&^Oe)bEc zcjVmMZe(T^BPJn>Ha8O+wx{4Tn_>}@k%w%J9)%_g;tTSTTv~*6`sDS5+y>}VHp7^< z4TjXM&?RkxHgTge)f0RMu473j*l0(snHyl1+EhZRI_j|sP1RQOvD7a@?GK8H}0%-ulvQt74N0SlK~3CO1W zO;nqksW3$u-bLuGM@#VzR23)EW){L!Q36AS2Gs;!RTZH}2nq1C#!3{HmLWYepKznh zCG`_x6PY`Z5VuWvX02Lz@k!-)Vr$iMJOQ5ABC#{_s6`JI)4bP((JBLGt1Gd~q{V|B z9$crjeWs zvC`?m)xka-^|fQ(YQmD)fc;g~*hBlJ5E6 z=x(=hDqPfkNkoI@_K1dxC@3lGlaS@n@KKMi?o&b_Dk1NwDWRUX?Dvzv7X2Ayf9iYc 
z_UiQZkoDxf=zV?Cy7Ha;Q~OwpsF3fG-_tLD&g7qU*RB;56lsx^R)DzVJjA8uV{1|t zlJiSZY_358-QV`?ENn_o!8$|AMpPzmVs0k*1bCZK9=9G9@f)B?*no<-yHOT%2lPpI zp@#4h@Kq6BG9qy;q12GGg}~d0`s}T=pHzA>a&h`+2s5#xQvS9XO}X2s=I()2Dk^AI zErg-~Pxc|ev(pw=Qw5r+{&WfJl&O^0Rb_IUoJ~}kge|RbbvpOYX&o)Mu_PU(g_$sD zD_|w;Oa?Oy1|y7SBQ%v7lxoV6onL^2qzuFmcyaXlN%2WYirazs*sVxRh*MNvbR%T7 zc}ld(lX5&!j8ZmawOY~A(t?1KD$D1>va1E-)jCX?G`JcF;DoIi2Xz)a(B{Ej+Tt@+ z7M$RAhpVfx*V&5GV*_~l$UZ!@a|YK(h6p?l4m8)}jI9=5AjrPv^y72&4%}y{#Xf5- z25V~&XsAc9qXQwYUjbf#`|fnO5%P4xNq{*CxsDDyraT=u)*Zx!fgbE@szWF-9!oj- zxYuCARjVDr@)A_#Wxzv=-QVs&xQ%L$unlrskArr%jn+8SiTnD-@y^+6q=-P{A6B6! z{|WdajX_oDN!+elqHk&yED|bk)c2zSbh|o=o=T|4h=xWq7^2sWo>#4;gbLW`^NWVA z>`yX!dkI-zT^4OkC2s#L@>+dQU3UBacdvG+g| zcMqp`qICN@l*fEVc`$Sp?QnGll^t!Ul(Siq)+zS1DR&bbMcdF?v;}tBQ7Pk-&i~T% zL`E?-6K2(fomI-_sPbfcsjR5aAuw}kp|hzJ)9!{o@eUqy9na@8(D8T%+HO-$EKIox zFyVPsH%q$luXd_1WQ(^cvqqadfGrYR5iiH()s772)ti>Jb) zt>p(-MnM)^OXy{$XCNml8~M3;C?qJ03yV-vR17T@o&Zm&yddzTJCfV&Mz7C@J;5N3 zbo#NMO6)?Y3)lPlaMIz#C3^?Xw{{>{T7rO7D(DUHQi-W&ze6J&j=X6!LX{y0xs=3*gCiHf+!sBwlM@V+MJO~kxo$Y>f zx&s8B8(wES`e|i%b_Q^git$LB1IH?jIF(g|`?AXLw6Pwi_`P)Jq$5<4hsBN-47Cw- zb!PZF97-*3lY?;OF{DY$H?BQQ$rHIDRL?S*>ts?dtQ(N&cc4U%XT z^+^#85p`Zd)>roty*~+gt?n;+8#UxT6)rNB3J0lBQI|!Bx*v7_(a=;wJx27g4rsH8S!J1SFmK$p4=<+QpLNgJV0-v(pm4j41Gpf+zCY6&aJ zy1Xs0NreRGt1{QaL?tN7td=0F&R7p)@@HX4x&!7^+FydrlyWEG_E{KH?xv!nT~6CT zi+wj0<{c>Bu?_{>*73Mopv&O#v*Mx6NrEOf4Z6Z?=y}r`#M0_@sHByZaq*01lQOV? zOm(!OVoRk$LaeLW>*Dr?$nu_2c|mKNm63_u>>L#2<=+BcSxG5$TCHMh1*B5bD9><^ z2g~&O{h0Q6aIhqOGY3gWcU2 z4|QX)yAQ|4CUI_f3|GUw_{!`~yx!=-hpsSw(AkeKG`8V60`%cPCstf-nD96daJL{B za3kz?!Qa}hK+j8M81w|u)!~J|sfFNcgRk0%$r=kDq@YXfRmx71_)GNLnAy+2dqssXl<^=Sc4tcXLgIoi(FW%6Jm^0LQAeONiK{Aj?72H zNR8V86%7k{PkldndGvbGDo}-{MD)BGDsbezX!ywf)b*uiRqZ*UE>n*cecb5tk@e&} z<^AZ8bBkUU{aviZuQ$}MI5Pbup|=M{Qzf(%8pUNLNalULjbbN;@GGd$AtkQ>nY^bH za&rj1ymXWkC}r7_>9?{XHxs)2EM_Jua#B&AolNkgLX($*itGfK3R6&3oPwItG$k!X z$uQ&*P+2>m$=nV@P8>{mT#u>Gj#bi_OIym+XT?C5zJ&@d237g-Fz3cXo4ysL1Zyz? 
zTTF|s%T9zYHwC(a3}^{EO;H|nylIV8cVyl-b=rT$C)6 z6yT{%RAf+4RpqI`Q&pZcp^!FN(xKf?*ac`~dkMSApdTxP-FRTnBAz>X01qxN;LK1D z)!Q)kQjv`Wy$A+9=;`ufpr;$X{k<3+9l=;{KW2Idu+%?eK z51MLmiFSQbTY|af8tn5ru`lF@uhs-pK^DKS95fNcUPlL&9u;4!3q39#%T0CZbE2Xw z9W^-#I8{@FZw31Cx0CzvfB8r7WMvgra&xg*T8Ja9jqv4UBFxX|E772xs<5q{%CDml z{Wcp;^pEn6=C#E8Ual#*{l(@g(31tL&{KgG4Ir5pV90yX_EwffLq^?JLWP8YDtceH zm*=VLsc?}{;im2{`n~8-x0NO)>a%nu)aB9V6Rr5R0?rEHCdS^h5+1lyd!is=9P}$X9wMN-GV>*Jx2{GNFX;uQTekFp!Q0+AqCUb1Svw zWvC=T^qO+$%1fXnaI|HG(3BORyeJ3dg_#5qrDL%GDGTLb#20dYC&{rxDl4c{y z$j4N7D)`jbWfkSf%E?yB>jZQv^aOY+^rCxS+!{Sa8X*_Z{}-2_e6@Ls*jlN)kijkm zc+zQHM*A2h#C8q!;_U1=o<6!l=&cZb^Efloi>ni~tUV!2_I6=%YyguZg9`M5-68Z1 z_G5@Pc(kuy8E{~Z;A`ivP_Rrx#n*>N`-gGf-lhyVFkEfGNV63?-3}ZIbz-8v7Vd&P zv}b0Zi7K|LtQ;ZQ*|5ilA)Zsf*MSaq6O7hU6s2uNS7sU>Yih$gEdhMc+J$GT8*sd& z3@0pRtnhOk(VH=sS%hPzW{g#s(Le?0Y&2uO!-)$M)1(QnDsn(>Z^24rd=N|BHO1mw8VU|3RA*ntBFZmlbA{-kZ6jO-#kJ!D(;JsF)v z_Adhq$a=C|Qti?4qaXc9QHb*BNh#it2F_po>Q|NPP+eEr0!l#h(2)@u3WHp_wuIDllkPDRl6rF_D`j+Hrk-gqOPZ7i=Tu6Y zH)ExeI)c(tWrW#60GaiQtTY&P%2d)M#N|3hgFKgl%Q*d&c|9RpQ4EtNl7@;R!mR+> zQprNXwU}7~9Ra4VjHoR=FEBYTX@gg(r16%$Re>k31$x<(umU)NUbM=KhMv^es(ovt zyPC;OD3upV&kKGpO|+$6uNQs18TtsS{@y;!3=ZPN*eIS}*^4inIe{1UFXQIaD9&_u z;RcsKxU`4^!vokiF@~j~VT^b8U}T^l0|R}$;RY}`GKBE(Ag3dk?Hxq0z6s6zZXEn> z=AG@>*Wt!Fs=?Dd)}DY@=?lA$YV$%vGp^M%U?#s9)!7+vwcF9zhb1Sy@QiTbhGq&PQZ7aUg(2EDGE{yQI>#a3n zuD%Xer)GHfM*263;1j*|cLbhPY21!Exye;!B%qN#uF{cTdOZBepZtl^NLY#pq#K&_ za**#;fGD9pCQ(A#_Q~THWq$%Yv7oX)Y2+(Vl;Nf1wfbCJnHMmgJb6-iEhCwzjh>}} zv}`Ycl6^|ZC@%tF0k1Temgnt?N|c9jt6jtdgaU2pJ)-s%mSL&nVc{|_WyjL{Lf|Ps zi0bE%8eKW23cPbT!hD%8yE ztbYVNft^g%8e6rAiV8fniHZPEZc@3x8yjf@eLkhd{OIrqh6jf*G%$!cLhtP8D4yT9 z7jK?8ikA=U#Y0nLxZD%Q^}zu=wR<R zLIVRBCfvsQ2N3EB5?T&~+8kKw4B=$YFi!OhVrQoxQ;qdFZfnG&%}%^z@4!W46-Efj zuKH>W`r9!;@OArK2zPqn^SV%1TMdg&0}Yj|H7OO{MP)cfJAAXN8~3(*@MN@-}%mWZvCpuda}LR zKUd%-Ju;*Zt862oz9;L-cB&1QeMJujppNDuA@9k+2C|JFRq1!nRr*a)x$Vk*d@)t#P7PE7-qoQ|p_^1e`24zQYpm!8Ya0yXBxAfpWxNHLXl zM7i-Kbp|7}OlefCs!U0PrJBmKiu0CL;QcD}lGUO2oDH^&BX 
ztuKrleZ6>kaRE>BhIn9h4#xvw>}hjhv9%2@O$BWFQrN5pIPA>`27>6}`r+OlrL1s> z#~A79$7HA%yF&w5^3f){+}PRCfqS{`BizrE%`LdM+KO|wdYpASamMGz$zV6;I=tw2 zy3y(OQ~hbQI4{{C^|{$k|xCs60&{t&`~nLfSiYfs=P$$$v)Kq z5v0kAKv&&P-c$Fj&I=#~dZG|zd_c9FQLMO>^U8YinPu=)^?O(Y->*0168xDwr-gI! zGc1?m&Ru&5J90n-k=}%|a*d++gcGH7f5jE0DAtsrfDUqi61|#`*!Gw$`jz-D$857wW9K*{74j@C=e+- zk{e3Ocw`z3bRpOoK#0>}TB0MNFwXUKhPcJOu zDgktk^!C|X(M`n{9O&Vc@C$|!qRJbg!W;5;W1_1Uqh3G8 z9c?)5cHu;011{O>aJkBY$J^WSDwX2JW;^bsnmpDS#B_%TLq47-f!65_qpPzQes2(d zmk(i&7o&b3RtVU8M@R5BZ`3c%&*5=e?F02zTyiwyg0%{lv<5suP?HFR8!s=pk0q&Y zlvbc}g{c`80y(+q_wpB4ekjyX$BX;tKp+E(%HWy;4;i39Kq4LeB}7%JD(&FGg9^yR zUP_1^RYzix!2?vqCa_&zURKUSJ+2Bqfv5~@puQGGDCZ}y1-4SjA;*jkfu5X`+E-T% z+2()zkN=@GgOTH}#s51Y#`b5wjPw^$E=69Z3yZr^UZICpCoMQ+EX^9uM?_HtU5|(} z9a&YZDaX1}DZ^T=sVt{8)XE5>VvV#iqKGui)j(gVRZ^OXNGjl3g`5I9HDx*1lT<-x zWZtr{tEsfBQxu&jxrmyJ0F98N>eJ{&Wk#%RL=g(X)r=@R6?Q84O!Dw<&P#coKu=(& zrfP4aRi0E_-yB)!B2!6s%M%V51`{AQKRh~!D;ld!N2MEA} zt`3|T?!l#zKAh=r;uKZV(V7}u^?GoFAbVlwB3_-G!J9p!c){w#>x~|z;qg&(HG0dn z@K@ImrXFVsBMgs^=u3ZYLpS zeo{fR7Qem_=tbI&{v1F3MWo1OVqzX8#X4v;yPH1ExjhRJYWDzjgdAZ2V&qE$PbADlg zqO=O>oyBGrt#;N|`>%?9lrppO3JpriD^OCQMVYp8H3?l^B~Oa9CiF~Fu_CsYc2iH7 zi9MEbK!K(}RV=g$KNWfcEh*27OZd1v?~|Y>Dlac5S81jqDo;5tskP>>f~}z*?SxsV zJFGxYM#u2d=DL(;&UT=KHnp4c3++zqao7ovPCPI?fEz7sxLMnbr=4Cr*w%p?p#UDG zih6Zu6z}#=;RpUv{Q2-MyyptxQn?tL4Tn%^Nw6h7PJ?%Km;}5yo z5b7bgNBhtj4xpc(sk^Nmdpf&tttX7H0z zw+gt3iCBHT>U5)Rt%TUcef#z)5Kti~&^mJD$gLV$sT7i7p(I2Fit>`>Qj{RIq!Lm~ zD(XqzlS&bRp(v_9_=7)CYSCqVQE^g!sDkcxdn+}pQkf#_ZrHHlmI_oYs~WPMRAxws zMHOhvHj)BEQFm&&qFO^M%2RByx}Mlvfv^Bk$`7$a{C3>6VT+P? 
zZ`_KzH*Cc{8@FRU*V(uwmg_UOF$v4{TVwb-lFA@{+vAkn5|e0iINNm_;G#wJ;fEP*BQNf=;!x zk}@xC^wKjk5F7t7@KoqUS74~nlO`&+YHRrmAS$oL(F!kZV1Vk#Ckl@WsDp~4ofgw0 z&$J*km0;mfgo;n1@Kbu5WYDygCBQK z<4=c{@T2ZgJZrP#WK|tjS{>Nc<;8H&1%H8i&2dq(ixseXK=w-e9ycj83QiSck7f*~jR z2|s_k1LM3QXZhLA4EEsRu|eD*g!WMFUZo;`s@94p_+#^?Jd*II|64}QgETIYlzAyd zQGh36C&Dx(r1C(8Ty)nib(u_6H5lD@RfVRgDpf&6Z!fA(Rb&z}uiDj5vdgj$)e5T- ztwy7dCn4*r`;l|Iyd|JFuB5Q*w(;xhy6&P63ptL}|7n5A-oo;R*CCQj1I&sudV&mXwtuRR#);i#+!t zTIJndYbz>GX_L1)pnz;&S8r3EEaoBf1biL>L7F`{+uGnHpxgwW+u=l@xfy<48TyO! z@DT0i8~z@=Rp-X<1}5>t&T+idf=D+%<%+d;9QSco=VW_TWl= z3(gRt$Ee6w20~cu3t%$r!)O<+EG^q;sGEwf8-am7wA1=_QU&^)4!8=-vDaY3Q!N3! z>FdF}3$u7-z>hDCc`4bg-a?kVr{>4utkcy7mL8QLE z9Uih=g`5fz6-3ccQA5^O_w`ADxgB=$zPwh~lTd4QZ$GX~)%PUSb5+-i-iI1;4!57D z?BkOlxfZ{U5Tz$7kJpS`HvCg8?>a%?Sx{PNfu^(yx(W+)+DJ3wz+ zmEG19+_No((u8W$pd;tVJ2W3RiR9`?1;nG9kJLR6NjYa z6s6ovPwQK4Hp3znFH&xJ8}!8DimgsbOZ^q4r$X=cwt3R~LWQ1`XIiSO(9~jAhHvWd zxRrE!JxaMzYfB4So9$?8Zb2IX;WAfYn5yy$ZRV48cDzfNy;awNH|t&aJVCUV_S~V( z$3(ve7k19zbYCxS_VnY)&;VZR8^ue(0ld&Nj7NJ0aDhru+V@>2;4XLfU|%qRVYd^5 z0Uv^X5Bdgr5vCO#=5X`DpdT;vcHy1*A^hhH2l1B|58?B@ z?RaL$gZsz3aIoKvNp~amaGwi}jW|w7PuEy6VXMJmuS*$f{z!B%X*6ur^IC)7uR8>Ha%J*wAQOJ=`*xp# zLDPh?!fKQkSHYyOgQ3zwi%SG16^TzG*x8S!UJ_%_V*(fQY;^PUMj7EMCehAL1&=cT^J(V)Wq?BZ(Oz)q9 zo;sR}3O!MI@@rtLuZN@EsX))^>QLrgRC&?>w$)D9wX~w6#et6IR=5o&9JJNp)y^(_ zzTQc7H;gx|4m@SG<7RUkMk%Ems>{*Y(~iNQ2QG^d-Bv3WxR3h=NAc#~19)xsUc9z% zA0C^T!hHnVW8q#rJ2Hj``1+{^%;UmV5#ll@rnIkDJKhm(zVd|_%HPjz+UaJ3b))*1}+ zyFB4&$AeA}ZdO#{b7m`k(;LRuq#N4*CYI9W#pRP=hXJo#xPHGvvFM@0#Da{7E^xd*L{$3{L3E8c<5bS6AHvYgN6%45^e=T9_E6zJSn6 z%PGNzZ3$Fc@rvyf1$OtQZA#v?F_L!^Xm=5AcPqIacizM0gw-ddlnq9w?DLaSvA@bx z1tS$8m7v(`d$uM}dBzijiP#>Wf}}KpDz6Zw<+Py&=?Ox=PS{B~o?4k96%jdkxkyM# z{6y$QS7Jz06{S3H6{NMbwMto@z)l5TOB(^_aw#LL2=K((`r2LaJDEKGZvJ{a-q3*G zu(jcjdM5Eobu*r6cH*R^2BU^b^!Xg<>knXDR2pq^m%9VrmS#+Nyf`{OhGX;NxUy#s zH>bvMlNR<$(2q-9KHQ+PyU^c*m7XBxXnm*px-r(*gW*sZW4(Qt7#YSe)!i`Fu>q{~`f+i508cK?;l(}6c=gCJJhgit9vYp+YvVid!SGIewW9}L3-sVAeulH{ zZj2f&SZZj*l17II^%i`kz7;=59!6L@V( 
zx(O(03@b{{Mf)d1IQgYAgU8-gWx|8Bi%;sR@SEWg+{bgfx4r?_x&4aMfl(^F-tJC} zP^FD@_oAEN3K3ufp&ad?G_cZPDi+ufzSZpm{L>h$hk|P{c0H`;8^v(WBaFAuuucPNxu&B$_>Foefm&>}hA74UUU!6w3 zr>-9zvcKqk3G`I(MKAxPh(3RbXl1z;zs``q3I9fNP4fSr(v&RC9)qQ_9Zhvz&{s6V zpsjyLbiTaXqxH}0d zNvX*Aj8tsUrV6+eWyX~CqEmsIs5r&W#?i)dJua8mGL?+1w_4Umi+wkbea}`&DF>8o zcVIIWUs^^ktaXj3w$@XwN_pPLHMf#db1O|IZ z#Kp-;T$!50;Xdw%3hvTyFD~^4aoXp?TzwU~n@s3zucg}S!s7e{rY8omI5~n{6C*e} zH;ao4JMqx&{rJ*_>v-$P8N4yS4=;zu@OEGr-|8O6S3M!T*xZ5VyZUjOR(z$!jsvZB zES6W`sHqw!Y8vp^^gOR5@eRDNRduP#20IL*_KXo*t3prgt+W?Zp?ABz73hi5Q+i<(7AT`DS6QpjOv~vI z=n35HGIbJu?XYv&+~Rg{(yaZm5HnrR5!7wfG9;&_-D#SS+_oSl<2b!!n&{&NJ zySwnn_z0Fd2}fTCM!EtR>FrfUS#f#Y=p-~lVs|+WNP9r;cd#>n;jSR&`}%QeavBdU z@5Qr64&lMsNjx$(gsbj$JjQLVI9jmZ*?<{O6J}|h7YMxNxd|Lvn!};_8SEb$!&Omw zr_SN+OV{xBiL-cRVGmv&ox!ugK0M}dE8WjN$N2}_{kY=t;bLz$4tYCp!t24Ma1WMj zjd<<+Mbb?a(kj~I!V=o?;*yEznKkIG#it#&L^!9@WBXBFSP!e&P4KxNX0F z893YgWJM)(Mok2I8)NebyaH@bF2<(#Jlsw2t&h!8z<2kKG~B&CMad1E-x!mIdk8Lp z-Flg)6%}bStP&$&jE3HNX+KEtiCvaQHx?FGpt0Ej zOO^FkpeGhr8YW9chEjX`&)8e(Yb!lZs%k8#Z?q9|EpQNQt<6$H%c+2m3eG{**U~D} zHl+qvdSrCb?h3F52&-kPv_l>z4zxAnT*!wb-Zq>LxN$WYz>D+q_}sn~Tr(9K-#+gx20=9GjiOAsNbQY!t_*r*MSOyLazCJg{#+onAkl!LfLyS&fs!bo)uuI}1Rz(t^^ zTvw~VrfW)XE&esdt)hoj=oMyH!(7>h`Wg=^D{L_88W8RpM}O}GEY%ih4HZgd#=0HJ zW!Mm#kBxB!*pgU;jS2aadOUX^ExiKbPf!&7K3{KMsKmi?J??_j2 z{dNIk1bhO#^#rDpG9Lv%6>O? z*VZQ423JQ1m7a9w7lr0UzsH4z&UPH`_TuzF2-hZtadBt>=fWYJ_xkbl$OPWrwGYot z&ft7+4`$tM7$6iwZU_3P@Orqf5Fr}w>Vlt2u#4OEg}SjYJcix9RB!$u?j4`Ta|^rh z_Pzsnd2k$Wb&cS=lY8;b#4fxtI)`WX?585!he!7xP^LEts)zUQ$8#r7;@+K0cyvX; zcN~we?876=yKrNA5*J4X@$k|-9$8w%rIAq_(Fq0znM6yU^R|M^~r|fxd3|!a-#um;O*#vAd3%DzsYknC@`ksmWP< zdFOt7ZTBHO*EfOZ2d41S#3J54cnV)Wae;YWv9T}hKZLJbxP*6Z+=~|u9l`^P3wY|# z0laYXIG#9s05|s#e7l!$d0`Hx3BWURv)J9=gT12z*g4pZSt`-Rz96PToj7}Fg%?t! 
z@u#pZTGgqq*Pyo+pLRqlnq&r;n>>WvbPG!IYGKs2pw`lk!dwFy>O7d6*^4%3J9LIs z=*6a1B0j@_t%>Edv{ZWW#n?*qw<#e4z|Bdty@`cXeR)_PmrL+5V{>jnFk*#84Myhg zA^7g5+FQS>`b7P0Bpf9pDljzyK&m}?HswaD#!a-!GL>Zly-20T$I#ofJwbuq=56t) z&>9tsTPcGNtd{2q^kk|G6)yvbCMEtOdn;}9#M(;d_o}LDg0r6BYea*M$@#`wn*u!t zA?P6Z>;$4fPqDT>A3A-V1fQSa^P`7qZN~4wzK|1#!akfI8pNY>J8{Mx!b729JU=vr z7se*=^xy#Qb+zM6V?Fj(SK%C0-AQS0*WtosXD9l6RE^ys^n`-wrRr;K;5pS=P*YV2 zYqbV`n;AQ;b$GOU5T9FE#w*NMm-gdJhfm|H$1dPYQcer9nu-a2_2@7}n9 zw=bT@(*)O(hgb0Y*^_wT;yFBb={z1ga|#!hcjFkrw`XDu2Nq_P3XtWgQ7nxTy8aG~ z^@VWsz+PTRqUk<{-Wu%I;$LGZl^MJ+=ckTP=~XEfS6kWui@p^FIR?~NoS2|eZ)#FET;qYOcKYA4VXn}Xn&EVwzz1TN9iKWS5 zOpWxRw>yASCy&X`s6se_oWO3i9)lqZW$9Y1#itj)(!P=xra&)0vl>NYnXar6=E@cn z=UJfFHezUC7K1~>${3(Zla;ts3u05v*p^g@9jQjdrke?@3T&eS+>)#z^a#L|3T#c& zU|YHtTT;uhi4fed3O^;|^CK!y07xsmIjMjuF;9V@)Cg~k&8CIUxdlEc7u=edh0XCY zzgjs$*xk8Wp0|Eml2WUyRAj8y-bw@M^_)*g&O~EN+b!rtTU_atA)~!WuMB~n3Os?I zKu@){0=+6iuZ9Y*P88oN^qOt;u+#Q7*W2J|wWF2U*5**`t%vaP5PHEt2$Qs-XNP)m z&g;bGj&?joYx*!1)9HW*#~pTD@VoHX{3uSg*5eg!^ZoHTyf;3DXQ=KT?g`==E#%Ff zKHMA_#`)ep91R9BUR#ThQ4gn~909AIugzGYLOf5^`G_}wr~8NT_~-=oQh8pUp2i!O zF5`>WZ{VFr9>G^1yooR0ypAv3yoL{7coOeE{Rn>e)@%6jSKq=nUwZ*xdj1(aee*tC zJboM(j-SMl{VUkNcQ1A=%wsQMy?1&F)5HDf3kNVhF^mHT_D4v!D(z$<)fxi62n^St zw-%pnM7NQY;%9+gL6(*FwhsCVyHc)KR#=0Q!fFKkL%go1V5%}j8Yd^DTM?I5jpS?_ zcBGgIJ~QGd6XPi0KF^RO&HgaX4+L>M;KoHN%9m-0&)KOS ztF3sVt_5Fg_2X-fE_|WA3lICdaMI_&_4!FWbZ8%5zH$j~+;=a&^vHww+LMpsJFh&4 z-~Elx1@vk9xjV{feKzU&k z%(@OV)C4GEI#FwJp*X)9uJ&$BOwLf~R7aqflwm_$sul6+wMfZnKvGs6lCqh(HpFMu zU`K`-+a)ti*p^|$=Cn#|PS;^8b8~71Hl~yjfW;92GQ|RKCj2C~rmqRw|@-14LhWggG!GoZv&?rq@shiR#Jw*x@+SOY>7!mrP+#FDl)0D zReNQK$`k0RBdthzo-UMJYE;vu6CuZo-|MnQ*BMr^6qlBXUI%8)IvneCW7gS>prI0h zoMQA9XfR*fi2JGbUR>Hk@D1ack$${5GmMAFdvI^BACGqW@M^dluhR;@6B@t|{e$?^ z?rHqry>s}Zv7LC8+dMiufyWQ+!Q-b63f@3;zC-ko#3lQdY&B#8MR6#5`GDUUo645jh1&?h6!7lo70WhNbqf- z+S`y)j*SH0CZ+&zGhc5?q5_mk5GuczOf3>~^@z`;HI{aOakRg&w7Ft+rOYojLjj*C zzD;qW`XbO%fVT?0^_!(na4aouf&x96mS_y9Z{i0e>;!ZGPeHK0lA`uxs&+>cbr%gi 
z>2s?-^v7T{QtDPIPl1!Eg(*YC*VNR&My1ze7vRy}wza_FY@x#IfVZO!ogGf}cwFe^ zd|#U#GkouGYYPt1YCp#154X2sXK_AGQX$SY*)YOsC%@=mV-p5iT5xh?3{NdBrr3MHHJOrCX_d2L9| zwj(jUp3^3X@=MC3`b(=JtZEUPX(jY5*phC-_N*#wV{R73mtnw`4E<_Wt|~tbtu8@G z>$`;ta7Ts)aalU-NGp%D0VMP`#ph#dQsHWupHzBeDA3y&E0r12j!(++sPJfS@7WZm zKu>ZbAt+ti1ctdq6=-O-!(3%iz!#lTgDd5F0y@>+3izbCia<|&fVVskM0u!@fRoUY zvOWP{odUgPrKg6o-GO#zD?*(f1l>+5yjJuQcKyvZj5SqZ*<8*lj? zc&yfl>&(4o4QA`jT-S=lzAhY{9>b~eQJkhFz7`JQslhJ1J`%)hK@Yw$F@%?UJ8`|C z8ZS{(0XmA_Qa(}Ie5rLjsd%JV)Buev|P@3Ne zO)=S^YlpegiIRdkG&gh-dh@U~HWGR!tSi-YA-B+lf)Woha$1m@)u@0^)Lv3{J#B6S z!A1CG(*oxchRhu~HOSDmBDu5yvANX*qMkNa%KNm~k|yPUIzq1;TT{!ijquwlwZ934 z1fDd2E~M%!!j`0bY^2)zm!PMpJ*6f$QCTL_^qeBpH8d(C9jlcXl4{wW3|TLr6X=QJ z6Y#0fi#)`;QW#*w9E7DT5HWsO&`H4Td@~;Auy^J7(s;vO~3OC_7w+=s;cH_sxZB%H@_=d9zZ+A7}*{)_Bswe!cWjNl|hLb&B91DAK z#BIk(yBSXgTJc9G4g$|UiT`-yAigyr{@hM{I5>v4yZiC-@E9H( z9mRp(5Uw2Dix(fbiLbx*Dt_y|ckw&lejh*hCgJw(*YMrH+Hg71Ux zP~}needGQ2@Wrox6;HhQ0?uB!g2Ts-WB=a$*fYC;T~jldbA46|x_9TjP8c{@MmF73VsBtRbD=U)L8D31$(KIwS)810) zW#u}Nl-7vCQZI4}oJwZowIC(C5y?`E`1#|H;J5eg!RPzJ zgj+Wr3%KxkTFs{&ZFnIV#7(ywPfkqW{J;S2TiJ()&!52?Pd|xozWo+{{NcCpyTADn ze)Qo7_};hQ!-oXj``>y8?|<-3eEXws<2%3kA-=`wH>d)?^4>dm?)BGj@xB{4eDWmr zuB>2*aGjf)!o&Ei#-Rd=^q)b*t-|7NcvKDLcX-4GYyk&1^P7-(xD9o~< zIJX}9atADxE@(=dVXf*=px4~m8c}*hB|cW-wBxfSYnUGVj zKrbeTfXkw?%dlX3PBpXYR_@5Q5Pn8Q>1|2XVrv>zp8~ydY)`F-fG>%*m(bgq#7r)r z`ipdP+Z3-hUrEKr9m&|l6z~c3Hmx>a`M95MBDTe+!C?X3zuNvXk=E@z_h zqM@fgoJgnDKwDOf%JLEzId9hMQDu@w!B*7OHlV4|j#hgs+FM)D-e5y#lMUVVHRvbw zyrsFg(rL$Eo!f=K+!@Au4jn%9RpY-M9s}Nf0yw(|f8edh54|@0&E6^e@%$)0-{r;? 
zuM>xS9$cKB$AeT-=LdbbOeOeqR|j702@q^8cqrt-vm?Fu;;vadLjXR}AI2jSgE$fL z;`w8Tae87L*9ode&Yj2WPd$lmeBn)e|6A|jN8kB2zW?nH@SDuyC31(-}w%{MT`5+yYJ$)&wUQ}-hV$1o;Z%31nA`4G)Biq(BI#Ku7FQbw>x(( z-MXw)!6z9(UF2e0i?#T4BXS+8&|927LEzOZ&@0JnKxJ73dfKuUSd0z@dNN#GCB@CU z!creW*N&{bHssUdW(xFZb8|{uNH3t8%eNz?um#CQcElGn(%OpBvvO+Xv|1_S+nQ-% znz1#5(4*D8WpPtA5u2;1zA~kUhk!38tqj}KO0bz0cT++xHWPl*ghgqnOtmK|N>7@u zsJ2$Mw(AMKt+6T4P^Q*3G{90#iz{Vx%81JHFma>uT#d}EEF?&|o;(gRDN%V!TFL5D z*)#;7wzL>J8D2_T0keV7Gh6tJP=f|*q-RE3eFHqrP4G9@qldP3tfd~44K?U0%EsgU zKK#zE0sPUJ8}HPY0*d;NoHY!^|-LiWc_If*ruQWB9v+`|!c=ATE1dINTAl=zz&}j=?DT92GLExb_Tq(^X*vPxl_5U1axXyo~rbuYnuGG z7Hje8M%0flCyO&G^k{GMnxHFfQS7a*%+C9}71Ps8%CK>jqV%$gI*=i@w#zo!|1h@q`C%rNe7ZJI^m_Ym$rM-EJZFUX{sVaNvTa%1b`o_zO`73PbWSu zVd2%td8xfE&S@g_8Wr%#*n0YkW;(f6I=TkTPVYfSyO*q>(lgW!p{TM8Sp;5Ish6O0 zBb6XbCHT@fpThaHGA=9aKzx1+;_{lQ=xPy{+kn^{8+K$-@d^C0YZUnH$gROPLT^W| zs6RbnsOMD5`m}_ePU+1dHM>&Qw3D z!cmJacN37uzTk`j_=!xvxg4i z{K;duapfXje&%U><4a${_dbxayl*SO6NM+x`|d~Ip{*tOIQ{mw-^cs!zsLL*zVVH3 z;KdhTz`fV6DJwyJMG3lVt1w(wjVUU+g~l2zHCb_@%Z*drKAi0Kd~yZDIqR$%vo?|oNMc}jloBL#ZjA@sie!MBv@H@^NgJpbIYxPI{> zj_=uvBa6E*KQ@lR-X3(cIZ&(9qcy(*i>@Fk7eU)CzzOXn*J!sE{|e%E=}1F1yH$oDv^$N;{EL9*Dq?AWScI zBQei`^inqp3?V|$iKN0-Bo^2e#TP@n8<*ce=rtj}pk7gX@r7JgU`0x4J(VF9A8qb- z<_^Lymg+Bt$}c8egYBsTy;7yuMoelkwk73Zb6h4BUnZyN*c9`z%~gQ6F*2I+miQzX zYHF47^)y;NVW(G`qi9!~qLdWmBV|Vn64q}(!kzaZW7AfYC8t4~l>=jbA!h8(VRd`J&s6|8ig- z@Wey-ZwoW{zr+3bi$Fhrat`2sj?Li@+CBJfw-@hrc<~Tb-*SBuMhU=hjTQcyYD`dB z&G!bW?uM~4GK`tN9?T5(W6#7Sc8^bDFD>Stg+(mw-ihV?dvW~85u84L9H-Bm#-+=b z@E|SfE6+cVuYT!my!XyG@X@zFP!yhGYncK$HKk(XgZI9PFMshZJn`rwxN!0$PVU}| z6FZkNH#&yl-d?z!ZD^=8!dGs={=gui7b&|F=qaERsi(@lwOET!IphJ~%JdfWq@7+1 z)t>zp^h~-I1$x?28^%T!&>bFFg2N?vRz%PYx zOrzz^rxi{s?ohy&kZ(tPUX!Bu5(=9MK54>IuYfPHxB=-rV1dbn9oc5uTiRUa4yr$? 
z;D{9f${+@`x&pl&silZbE8(;NTa$7k0}`x4?^b07p{IaPx~0V>q18fkY& z=;yn#NzM> z#;Ne8#zwJgW)4eJvsj#-#nQqO=6CMI&gEqsI(P`jjvT}BQ>SqD!g<_x;|894>PdY5 z_0QpJU->fLC)89`_u+Rw!293&W<=r9)_&{VcktCOe+jR>^dj!RaUG`#&?7r{fqN~u`4j3z)cl*N<=BW)gRflSc^|DRxiR|$`C7@-!*?4W%;dy zo`6q!W;DX0cPRE&pf@xygMt20n9WsKmtX8fUXcfRgk0TWOT7y8QkhAG4n^@L6*-Vt*g}ijK zV+4R{I_yXn=tXR8C3eH#JhON8l+H8BC>2Z)-d@#3UfEOoJwS z8||zSdOhu}z7iD#p4OnHpD#msS~`sJDX5M~M)i&)R1tc*6lPW?stG+?ULJe}g&3gP zn=Z-2WOfox6MC->`0?eL0bI3L@mQ!hVAV*J^9=L`x&C zmzCl-8l3n+O9=nTJC5J7`SFDkBc3kP;Y4vM`gxq@l2TNblt5EljMk<`4EFV6gf@4I zz>^#q8phbz7^bQ8X2z$mFiUGo`?Ycm7$Gl&!IrCuV(_IBU2IRl~x3iPi0p~(3O;jsrZ5l@bXH!kju>EdpTud zbvuz-=tgFlk2aSuq}5F;aUr$Tg_M$ZLeD|C*{S&KgkFoH_!0^mk!NrtF0U4e`E^uz z7TV!z#1ecvsQR`G{0P2?)vX}-XmeAGuq`?77WAY{PaT9n;I~1__T=%9TVs%#laE$M z2h7GQRO*e;==CTwQso)7FzPCx%gBT^F&&L@X-e9XGoVjPgMrX%rM+#*%t3(rm?$j9 zN@*!}XJuf2Nj8p|N^zg70hilsxYlmR;rd!EH8o(er3K-JCKzdx8{^_JQC5so^)*<| z&cbW;?RdAL6My8M!1wFB@TRVoDz6TEwHkC#N!C_h1j9cX0m-4pZ$NIeHW)PE+ljJ&S8sui$|P9>9}N zK7m(Wc^PlK@jAZnh0o(FUw#{3{?Zo}JNrcf@wFFTz|)UBg8Q#t#if&{aPrV0EY412 za%2c2T_FTJ+;BKrVYl1iuo8L`Gs=Y|!cL&4*4nNLJ@tB9i?#Ul;uD}(QRt+_btup) zE3hd84#+bz1bW?}QA~`_!ECCQ9vL0T$@ieNq8r6!A>}SFOcydo9j7?U*%IV!*72m;0RT^21tE2wmKEv}dQo zpPGO@r6qX0z76kq2JwxCAU^CEz!TakyyW%cM0+b*iu0i@DpJZGE#_*3y22PF=!S+z z(LXSR!6Dk)sVR(8{-UX#RW{# zmXG%JU^d)~fetU6t*vNkZblt{38#i4N>8q<+rg*A8uixV(~jGqw`cL}E$B&wMk#q? 
zHM?kWorGRJ@3~=2PRvK3ms{vT0l_AjlkY`Vo)_5!Uv6<1vI@D3$}fY^%hz>N+4%@L z0#P3(@H&xJ=0TQLrtS#%ikyh2-Ayg0^(}EAxuliLnw1KU#6nu%>>9*mRU@Ua9vP)g zRE~9s%Q7LB;FDe(QnnYL$!T&SfhW-WxWRI?;)^^bEe0|1sjybpqqW%?Dc7@@QEIM4 zIW4JyYR^o#RVAgOF)0I$$(b;7UXz`H((E*ta&yp9P=runIYx@Ln5#5nCxNtVwJL*m zp7FbJs?&`FE(a#9X3RF!Vvtk6)d+{V0-Dq~H05Q&k+=g#Ypr;tXBe+|`|ws^Sb1{U zft&&ysMKPTu(ai7!&qDny~c>H&Je<(UJMNkQ|S$2fbbg`A6I}kF)@MZ$w@5C%wX^Q z0xldmhD(Q!;xLup0ou`{d-vk_ffXD(Z~(`S9wGc_bx)thc>+z~clq)q#r9sk!s(@p zxW>FpMR<hiT%i%%ug>rzG#<^1x}IcSR7QI;(#8^cg{2{A4MQj>3nP#7*`-0MydVkkl& zmGF~Hr|L^Aq1r2TBDu7U0BpvNT(P-L+>Q_={Nkwk5^`&4lkG?^mVUh!1$ts}<1=+g zD=;B3s{&gSbFnQ^6rX@kny*B++mrIWSnP;Rfw8h0o^~H}Is(jyCLeN}IhV47b)HP-j7(tr`n;RXEYoj1%>ZxYE*rm-@zVFClz}s_}@$grzzw zoK%7hgsP*toxtluZ{Hv-?Fhj~l{Ye?Ku>@-#p&$i1QrRUmHBxbU)qW52an*|fx|dL z=$)X=mEqoxD0X*04pQ|UIdTNYjvZ3~c;duyoH}^|r#U^t=}7^jC_{ndzI`~fi^_3s z7RxiVwgWww>+Qxae-I;04z$N>0|s|cc`jL<9aAUoGZ#TP)it{>U?Qj<$dOVyVp z6&som^7TEmwgNjp0VfL2OXU}VUM3Y^t|^2xjfdcCLuy3_Qp&{Y+7-2zT-=QKJR6}` zi{t`PdUh(vMkM9cQ0>v`u0k(0--!5(GNl%GdrBU*C1n$QVsoR*_oOB_7F)L`qNGrR zaG(bUgBd!Lkt$D*5~Bt>Dm{H!3H0fiFr}nJpOlQ!gm@HXrXVjT6*;+S$Rhl*(~?n` zMgXQ}pfW8RwIvns)iq$G!-EB1Cyvto9r1VKh|hx)onBn+3*+)oFIGYUEE0r=J6f@; z#*9h576+S}@YK*4E;ZP3)7F491{3yHSHoLciWYN?0=!V~0D1^J0p92kE$-+DhQ~%R zF*b%d0&ihz0!tHP*h>f=nViPWl|y*^#A#e5xGwD9hl_;Wk)=hfEbqZSf^21F1qb+^ zOeGIe2_9M@5V`Jwy~{YTdpA}HPigZfqa*L)F?LD2z^PG82K|_5b7G*W8BW5vRUV6J zslsqiuVQP((#li`xyY0Y?bgg%ti`7ntC!&~ei5;^0zGYU2g>u?m8L6YWQ>&O3H0RY zXsz~6>|H*LDvK5CN=p4Gl8Ou}y?iRYT-w`QPSdkGkWc7|)s=F+RNCOI@-C!v{mhC0 zRUcKLt_!*P5P?T%8p9~98$_Ntgmf*HAEB3_bs=5TK|neQ$2LXjCFa*7h0`RGIIXBb zfnGw6MJd;d&#FX99<6Rl5q6|8Q}bzcbEx>TuyqxDqWF}jq-}}ECh3`xo`-O-7xDmb zt-*i_+Q~AiJOz51GF0Z}!JL~1lZ>*GmWtx6G~{F^BReMz1todReB|ZjqBu7n+QL$p zWq?uws@`CMn^teyNsHSR#L-X?XS##9+8x3iqr&Y8&wl!k|K zj<)t;Lhorp?duzwGMM?1S` zb{0Eno0lfWu{1u0g^@vwg?(7)4`RaUK#2Cd-Bv@TXN0b}2;(Ee_nTJcHq##ldzZxy&}S_h$^py za4RF^iVFP5%k>ZhUX+!0)A|N!b%RuVUSv}d3jESbeG2%bTra0GNW06|RC;;lFfuEt z{3={XDQ{OQGbD5L9+cJuQDpHWt+Yh}Up#GYa(*45XJu9+F~^MLJTuaZs*sQ+?f6O& 
zn_h&N)I4mb+S|SgzDQ%`M5X_&JRvPEE)_19A9c1yWhiKkQ4g)LlJi>VwB@KM$b-J1 z00!xHmYIdJ%uM8^BqKjP73Br_(3O-xS4uc)D`BEqtTI%g)>w@y+T1DvtC^O$kM?$v zD(^&pFV6IYaop`v+U*^)*5fRd+*DRN7OVBxS!==lUO&D-k-~add70d*&Chb8#njE$v#(oqWBd2#Y~SIEM+mU*d0hT}i)NslZsh#yBs{LD!(S7N2sg zUXaRD(Ky|^bdf*UZr<-5N_#y`aij}coyDg>ug21fqesrr-rBIPyx51LJQtOYo08B^ zLUy5ySzOSGvXT%AivkMx3d=(%&`51At!=UQHt2bgQtGC{^C7v6wziz=PU}{bo#!bZj7rWV#K&`Zv@D$q;JF%o*!grBIsa>Qj6BQ~u7JCbv;LmDiv zLQftIDd3ZbfN$NAh??34v^XLoCw+SfDzucn7FuK@zbFHL9jee!Q;Qk`Pat5gtwL9W4ReIu!C(*7$wF!UFJAwDxI&s5l!-!Ukn({KZydDe^Is*fu@U;l93`wyPEF(CJu7(r_!&HM`~)62c@hsEKZ&z@_7P-DSY*!6 zE->e@Fc-=B**VNj&tQIL2GgSIXlM5>%wabnI?>mQvFAbMJA;4UeEyR;al zf-D#`MbMNL@*MjWrT2Hdt^|0Z^wboU7wMI|7Hjb-#wSA0qd>2`z;O$D_0^rUxei#2 z%{X%SH0o{5SZ62?LPLA2p}nmv>4Ls2jLOn5O7i?DW)|lAkxv*(lNJShlG<+M>$?fD zFx6Wxat$G5RniI@x{#ssaVq6|w7jNHE|&_9h`miOZ$(N;6VghWkzQg)n!t~0Pik=m zdU2VR1fGfWx_<<{?S$U8giHl`>j}N}+Y@llwm58wNkmbZ79L+0j3x^-RA(~8lu5?d z)9a{Si%?!z2puh^A+HeC`6aMq6~LOuX?`(PAeWbyD`V=(fTH>e4XX48R8pB$6Pz{G zCbTr#&}OZHx4H&F+TM9nEgo(4<2PJA_z%HRyl%ANvdN4KjrDlk@5e=g^PN@?UazXh zgN@CYGjSi4l?ZnRFiLAXHZX{Z;V}%4jG~Xg8<-x$IDt1s@J&=NNanV=ha+V2vb=;FOYKdc6afL62$RpTD`b##aczi zTKsE>)$8(?%Jmq5US)}w9HYf0^vVkx6zIv5(NyRiK6DDLEge`_ONiB$1kq651AFBV z8nuJ4mh}<_UC`zSxD(paTpvofPHq8Jp|%GV)g#EF(o3hU&6W{bbOFN7uK-W1ZMNQr z!m1$h%>lmeroDBeq`C`*Vs8n)1ggBW;%21K<|gOYBD1uC0JI_@+rVWeBcD)cy+SB#4-t(YP(Jyq41B=9DC z`Y<&xgoz>A+M!|ejtygQY7Aq9-sISb(v)NufwgaJ9LEW>tMj|?=%Hh{|L8HCpshVJ zJdU$-yKrXTL9FcDgT*O=Zj^xI`ZE&~+y?IOb+&AnAUofDz)F^ zK_}ttG*!Tsoq|ci`ebJi_7Z+Sg*k+NG5UIfypW`8zk_k?zEkXwawD|sqO-b zy35pd(E7SiU?c!derOwdQAF^i>)H`73Xhpy)JWLL<0h++RcceLZA@A@QV6|-Obsn< z5q2aNAU3&>z{{o5lXAUuY>r7%o`Wm)c6)3J(y83qI=ra0H7bpib-GGrv=zNp1HGmK zg)(Y#ZXPNKH%loMTu~{3R|IQrK5DqEzN`!_RC@JXzq-5>HMGSRJppJopw3o+4ZpYePBJp(!H|u9O@s=2qZUXEz?KYQcdTE9MEwi&Rn*rDd4a=rC=zAWVxKu+?Ic zz?tdo$9OP|>Ha}Xiqccs=#62Bpc|#7oubNHB&<%(FXAlo_{1bm(XQS&co0_)9>4)Y z^4i!OZcgvSdBX7g-u*beXBo@VlVflYQ-ed?@35jI$7rD^xJ;gXG1E`z`Ft4j`7zw? 
zg3D-vpLV_`BLfXtIcVZ{QB_pPU&=z17Z;$nFU;%d7yR!NZ0srofBEx|N%#-{AcIeF zar9H?qtxKfekLY4644K7R@YIti{4%$`n7rt3E7AGp6pxp6aBur?597KYM4l<^U=%J z;@^0vml!XpL(A7-EcHTP?4nq6Fk4_KZAGgs0IR8$BF2v8T}KcO^wzS>~tN$aMK=iR!zfgnnGP!KTPF)w2pnq)AXXiG>9BsH`2>GmG*ksdfHok z0GX9ketIuK=t92HOE9{TNAP8qJCJAePzkmpRm-e!C_OZ!OfMzhO6XaTU1md8iItXC zL+BZhoK=a~lv2bHe6cd@RDuex|$laHrvru--wcoY*;h1;i2l=tF6X#ZYkz$RvZj?aJ9b=vsG1?CL|}@+R)>4 zAnfkIcy~A9GoZj}Ngy>giLpT{GFsfx(Gg5g?afV%V>iJi3h(ID6!s1c;n?&vuC1)# z#^FOaF+Pd&{bM-SGlHAT2e|woF0UNGsoi_9m+Q@m!XxZP`uZ@);|+$x81L=HSSW-c zj~C+tJ!c18dJ~!oOJU*XXei-lrlmI()9RO%qP(;SVJbahMX9h5L+!N>4}L*iH${GY8pzjvQU|mL-6IpT2O#`-ta9I4hbw7Wc^f0uC=OVmF~D!0RXA26(K#a7Ys zE?~NSKN5fc_kVv2QnHRZuO8zkKaoZ?kx*f)Ml>`fZinGo{M(L?f4extKl{hfb3>Q! zWVXRr(gsI;0FBlT)SBBdKfND=10z^>s^=0O7=H*)&OU)>m!8IxyPn2_JDk(q69?1-fRG8e5TBT#fVsGg1h>g!Br;$&gbr)?QM+ zQiHoAiE1w{mC$4ENL~eBA~aMl9bVdKsf18c553gnYPBdWEI@8%1~TONw^V$&%u<4? zGA|dF0;)Y)I{}`Hmbin;P^O(06M7qM=&Y%NhicDnt%lR6M-$ahRbCFEnFd=?4!UWD z!(}BHwOMd!Vh9h?u0AwAfujW8VxSWfT>*>`Ktnw}7?H{Z+SvY~0SpfeQIQQQ&=Wg4 zN((wWG>DaZ3Ai4DubZ$7b#@}`XA*clUN43Ty?!b_uRH_9%+Evkc{pEhFrl)z1lp2P6!W|2 z3JUZhuv7EaHMSZmL8Uf8N!m>AiG@v&*F zJ3DY456?c1=XO1V7nh&IbNimfGY6i-Ge=*-BL|)iwiT z-89MxzS8PZ6ju+Sz|xPx>ON*K@=QS#nu91ZccH}6r9iLP5>V7$mfl4udXxsrsf7(l zFBY|DRg_+dp@kN=MzOb&@fj6}PcI|*N@#Hl2){gL4z|UoE704vs`fT*jY9#Ii$G6r zwm@fyJPJ}C7$g-M73HPK&(1<-Y6>z^l98R7iu{apl;mVVOBHC&&qH%Lx2Gy}5q=)3 zy@6UQ25W0LtwC?K1)~i%4AoiDUQ~dl^fdTskwf`~N`Kvb);b()YsEvfr;m<}<9Qc=Bt*oS9AlK0WuhWTus{?_K zcDS3H(N=3iBhS6TQVnCd1{FocC@1&|seHR>jpdTM9eV0zCZU2ug^Gj$iSOStjux|KoqeXFvN{6c!dL;8WxCpZ~n_#JU0=Th@{CQ&~^;anC*X;4`23 zjPfA%AN=44%KCE5|MuVh8yXrKlyxNUyz@?Fde>cdDeE6Pbm$i7R6s|c>stKVj(-Vy z`a&1oOS@90XSewXJvRn>=P)Asn5(iG`sH=?ftQpMsmu_y z7oS#2@D(x(n0dFLrz*XTTVj!sQ-BsHt!JH0>6xLCiVP#6XQ+gRb~Gz99Vv+kO6E}Y zWfOV@*_kMz;?w2kpq3W5iAtxHcGX3j*G*d+;CACSD~7Ah=r`!F)ZB>0hI-8EjW}$m z!->jT+}rHJ6?;4G4FvJn$S58o^e$7;Ep&Hbv^R)Js=5iPkb$0F^fQO4)COpOM>rkg zbWmz^hexn9JdV9%lUS4*-L^J#Rn;O;TL-_(f$knZZL1Gcy4y$|dzrr5dV*lu&^pDlIlP7HMf|3WQFaIH4>TAPTfLZQ6u8?zjW;#Jb=A 
z{oluL{Kjt}m&cTKRp_Mxm|kgSM_0S(-0LN7TE+`T52+q65|yBW)Xb3Sy?DxmgVF?FEzXQ1+=V1ipA}u zl?|Kpm};uW6u~z@08X0>IAm|ex%w7-wQCX|bj{$Mo*6vl=)|i-<9LQj?WxHbTo@U~ zK){7i*oT4sZYsI~3{t%f_ezcGFx49spA0(C+fO^zLu*Ex%XOsob*Ke^W=jA7|MW>j zK~%pN?e#W{@)#pFJKQ{eFyKaaryIke04;3@Rv$EGa5MR#r9w1ZZ@qu!~NK z=yEc3{>qgrit5_EdpC0Uy~-mw1x^xTSGRB9uBb6tCoeBg$@S~kE1;6cbNp@Wx2psJvW)E z^K0>MJ3dzJoC@@ouM&D*1$x>%Dbs6*q1cJWYEgL|s4;XP+&PKEhfZQ$xONUR&iy#i ze-0PMF5&9b6+E=(K|FQzDLi%ZSv+#=X7_J<69EazwV`b(##(GYnCwv;M?JKCE z+OyE|s*2CtGKcE61ynocU}>2~b?X%08-~7q5CsMwa%pc1&72Z?Sqk(Vw7528meiq` z_BI-NiCH=%W$F~@#ibQ9BhcGSTe~AZ<5!>;lb8mxJhPU{Mn+rF5l{lX3Y|vjol#s| zL`9KI}Q*qN4;+B zBP>o)H9g`E;*0GAc*i}87n=PzT&TrUp*}oJh&?hkfnx;MP){d<{as4)kAYskCh(*T zZ=`1cL*ZV|^Y!2$`Ua`?m;{vDoFn3Z~lh+wkj%5Z0+I0hZSY__~VZ&@Bi-a{;pzI zT`remfdzJAfu--TJcL~opD4wa7XC1%q$o;I_ASTy=%bGm^%thCm)GyV|30d!s_@)% z&nf$s_2gVE7K;K%qVAD)fw{o?Fn9)EBj(-r|AP7Xt1Nq+=jFQ3sUk-NXUplFEOJ6JCbQ_3B9CDX~L4NC_SYH zR|X*17Oyl?)*3Bnb-7`zspfp7J6Z)*Tctry=oM4#WmD-TA|ovwx!HNh&dj0B%|af* zS4ikdg@?98x}}xEsw+olV;y>#dkL5$ejoPO>T#~agPZOyJl;+_>kQypO9u|s)Z#dA z)F)=9aixC%hx&TZPn8$!4^qi>6MB8Lu~K<4jFEonhBk=MU@rnxfjyi`P3u@7gy|qv zUuPEq*9kXmsJqn;Uq>5yq--x3RNC3~dE5&0x_pEj=RN$qUZ#uI-s5z^+fH@Y+76ea zLz$Olt<5cH1$O;uEyKuszb#LCvx)F?m{=xbzyHyZSji&3x?QtGNI43%IoMI1WtT zz{1#d1bxS7ot9xE^vrcLsA`%=m3@Jhb{3}Q8QA^%;2%5zW5WpawgKpDy|lSuWNX`$ z_IvpRU=EdDuBL_4MikNBmYJPMFRVgxt}z0=tV)7Ti+7)W^-%5kaoXL9%ibWa2f{cir@tRvLxkQC;W)zO-L$~|E{yww=;twd z0|9gfyzn!_gzYe2PZNft{A?j-8@k+->IAXREvl}As?G_g-GTO&Hn<2pXOjc2mUeiZ ze4mQ1Rct?jS64%oM|EVDE_wu>^iwX^YLQP%pGRw-OYmp&2f#R#XVUqzo=OIayiu+H0>VHM#;Wzdy2n6&6z4D?doGjZ|>hY&Hd)Z@>Mv zVs~G8{5hC@x&Mpw#pVXhrGCGTFa+esZoVtYq2kP>Tka z3O+T|&l$Z<^!gI=T3vSgzSrW{75tmZczXO3@0mY~^4wOGW;Qb^p0ZjLrB|bOqLxgl zGkY*Gyo@KGcpmFY@@WT)JkV1Pj<|Q@>hi;Q{`_lr;Q~Q-@;N+l;blB?{Y^YZ06t6* zUOhmyxAYJehAv?+c#8BqjdtfgSgcd1ZkR`neGzre-LN#zak>+2ffX2RBQVh3YOUQU zuj!)VYiAO8TBoA)atOU#Z8M5Z4rr=f$S#%23?q_rBDJ{5Ih6!p8RFB6sP-bz+mVZ3LN5{9;*yb-Ukr!KOPgCq=;>iFo2h0vCG_M5%uLTfCRJVrp_fnK6=dX~AS)O7 
zIk`&tQ%-&^3Q7x6rYnO+Uyg!;EP}5XjkL%^&COWp4C0tSgd1a%I5#wa0|5_?_Yiut zvU|H+IMN@&nc+b!QiYB7bP;$V^b%-&J-rI-qUq*wwo$%Xk-xmJyd*D zcai~Lq^wR-fY)O@chJZ)PCwx?L1YTCdOUO2| z-tFrO{61;9y6pDlYw`bY@OzZ*lDrgm&7MMOHW`vGod=pwp4|d%z8!U@4y9pooyCon z<+FJ8l{c`iyu1?zT?oZlb~vjCabfpEcOGIC!OQ4$AA+@Z2GzD%R5vZ4$+Zvl?Ym)Wm_nUn9@Wj0sJ4%x z*yyLaa}jP{Dv+32in#OwrLA6! zwAYgwT&4=W#I!8fY3HmBO@x*KMvFzUw$fhD$k#cUSt!iSLtaLv(j~1pU51%TMGjSi zRBGfCdWD2uQAG(#3BJ6-Tq?d&n9EDyG+Qv@aAMBs!KvXfoERU%>B&JH@9n}Kw*xDE z0UR3Y#>wFU?56sg><(hECxkwMp3-(qDlPiZ)6;`+PZ;6eurh?xKz}cKLqT*Cc-`F2 zODpRm(0ncz0)$!@Rb1HFjy?jfyS*JqapChN=s0XpNHI>Yz27f>FMxzJgXPgKOeU% zo9}$*JNPgE<-h#OO)a)o+Utor`>VhDE5(}1YZVLxrThW%$D% z{!oFafJ+6FtSfMpvcrsw3`|Z=-s;UEP!iSo&2N75)@wE7cxof(XlP5QujM$B60)x3 z?aEIL_47w>EBlWA{GSy6()(-iFGu7%l>8WaWjS_~Gb?ghAdQoC1p>Wxn01Z(;qS!N zi;v)`C!fW-GHn;iD>_j`58P}SzzqWJxr=Y%;iE6$>GN;k$;)rzL4xnxzK3xCiD&T` zq4(&q7jR+Elh`?Q857-S5Of_tbHf7aMCmomp|)iS4&MRPx9ubVccHeGKqT~Z)*gb- zhZ3`&*|`e7HWV0~oNq;m$q9WO->1z@EwCsySD=@aEjG7;YOjPAR~jj26MESMUZfUR zpeOcrM?wnfXyuxyex$t0$eUamDa#FF=Ia6~x}uys6j1S1WM`o)Jr#w?i6~4>Mo~sO zigU71T$qOvf=^qiLAeYfp#}0JYMVy_;6u#5SOXab~f0s;A+82 ze-LNJMzE9iZH~)_y9qV`yg)<0+Ajf}fUmck>Mk5YI2=T%y9-@iex;3FFYRQ%uM_>j zkODoI$4zV60YB5()=K5lj1ET&Jgs&FcplvZq2EDu*Vsr!*PuMQvDwyuX3jS;>uc+7 zrHu-&#$-`OjET$RYGIk&&K8+nAy}4pP0OekE>OsNVGd zKla`NPO|el^KQ*dGgVg?Gj>&17d108<1jO7NYZG;Xj!(*6vvTc#}HG%F+1znjxjL@ zyoS&w+2Ce3$!@YqOwac`xB8As?U9q%`4VS;b4Ev(p{now*K=Mt?{iu4b`uO+VIwmk zW16{-U}&1TPUqCTU?g>E!fq?bwgO4u(lMqL@2hQFKZ~{(?<3PbI)~O5gKO*k%e1|C zA9H!@`djwHS>bP^@38>AJ1#tqTzuN-e%6+lHM}D=jgj7YKKot@-*%}@q9e}(3|n* z$KQq5Jn%L=c^BWIfYWZ&ZT~vGL(K~*EKs$hz zu60B4HL1H=uAubd?XwJbC%Jx>>TrW_)L6U2h-F6*NcI}xttXNqZ4giy_jEGL2ElhTE6H?Gu6SDy^HhdL^GmPcM+EhDX$YbzYFbQKiVTf z|b8R{)J^6SYE?{ z^_^H|7&u22H&0b3pPpuTIx&Xnsd3CsjAD6g6dO~MSecr_^oW$)AjXG>jX`}>W~Zho zgLKuR4)phb;R$+Y5gnRYV(nfFkLi==(Q4b?a4x|t(vQJXDH+bmoPpwkF&?` z!s}l1W+UP%$#(BSyk!Z^%}eOX?Z%X37X}Bl!s=J7&-`z6;%B8%VZK8r{;OnIZU_x)5mUAo$cn zBT2=VgjK1g96_UOPYFe98XN?lg8)xsh45(a>_)z|&1fgqN$7RvIiG7q*zH5i?LyMk zfL6Z^eK9`5J2i@* 
zICvOuJbeattS#W=`~)t|&EoF04XVDQIJZ}!>u&6#tEt7#ojb9yvCgT^v5egW+_B|# z9G+XjE{5eBi_2J~N}ZaWLGRoI1{bC=LuEHNG=fj^o{1nq*oXZdc-;=T zoHi^h&hl|m@7to6)qKV_T?q$dg2c_BRJ_hqV`g|-yiMhSdVu97fJ~U0;iAmkM`prh zE6mJgZ7;sB;_^D}XP&?KK4d2F%;RO6n&&sy6~o-L;%jW(rekl}{~yNh^?UrA@aw+o z;uGj>Q7;Vn0(vq_7-(ChFj&F-)Ee%*^bqd9=W&dWj}dyEdysD5g_iUh+FCX+-**xR zCa&P}`XhMFJwJ;FPrd|?o_hrzI`v{adEu3K&9&F#wb!Wz?|2$7rYgMW^viMi*o(1i z`5uP27qGVe5M~zc#uAm@{QA9^UA@~-dR-$&(K&hq>7Jd0+bUXmHjwC4+1>(@T?~(VPK2XRz65uh*b3*BvgTkGB+{=L^TMx^oX- z`~gCbuc>=F3nLsWUMVg>*3i=V0$|LZcjz-K=!G0g~WLn(x?-jk+CWKz~s#oEYpZuh; zPL&f{e%ssLW`wCGFicZTtH$Xq?xv<%T=l7a&1+tRm%Z#|hQIc;uQhsR=v-=`{0D#V z2l(5+{ad{0O>Z)qyJ+}JO|?zq|LR!ntEnnHbls1A>|@3{l_$#76h_;B#JtYB$&19x3{5Qh$)L|0EIN_qy4 zBiptY>FiF1K>IN_d0jmiB-)18F}{2ZS;pdt)-hzdr_q!ff-l}q__Y&wDY!x@ zG`I`N2jn%4N+EAY4MSY@%$Q@SI6OXue49dC)!eqDy{*j%bDI-M#0b4~z=N#MiL}RI zc$P}1BTB`WYQ_+i-WZ{`J~fTQgw@GIhjHZKLF{5^y0pH6xv4QMcIC0VBZrd%y|}>8 zbECTry9vJ23=OXxI*4n>jxqE6;!MJOs;5&J5YDhf4IFIFJf^U&NGl%u@2`me6 zRC7~fLl_#P%A6SDuhayl1iU_~y1`-e_VyVKi`6i^}cXxLi?bD=cjsC_AW%CSU zRXdxbV$JgNsJBCkpG}RlQv`cHl|m{SF}@2wLt3xPY4lccak{|pTEN?ayc@tXHxw%x zRamIctJ>bl1RV9cP@6gRj*vPs6^{U;!O2zYs-`9~wbj#ontg6{uHxERovW&<$^eq! 
zr6w!I`{+1btN55pmo6F5Gn=s}34u&7)v@Z@_Q4N+&{!wa{VBPD8b?=FR^k(%_=Ew$ z_rCYNMh&ugJ>83@x}Let<#OS7e&=_L`zyW=bKX2gUQ<)EdBt1y!yBI^Jx=O<^E6kD=BD4CWQyjxxNR$Dw`42)yGMpO{9jy%i;WgQwBjc@UY_J;)J!J^4Kt z?>LA%_C61y!S zr?Jj(_U^l0kIMw%o&%3!V*WBl=PzP%rtm)w51l_LW7#Zxz{rorty1 z5rWIyXE!F-P9Q(9hD;1s5w^*sP3ZGBuwAA!N@KC!%xnQS$uF2+RG*R8f znl$=b3@rqoy5Q-%5b!)sCsm%^XdgE>H&@KIXUtr=x#0&v?+ah}g7Jb2M(TN?c5`MZ zE0gNcHWPYk$EW>O2K)HqkK?h&9y7X}X}hL6Uh7n|`!|2{H-@^>e$RjY^Ktd+Rin+G z+5~Ffr=EHWFMjchjeflXtcn(NZZ)6L^`*Q_2j~nzwgy_4Ys_JN9`>y?F59w_(r0$1pf?)~LCiTD=4P)2GqF@V0&Q5X0J4hO{ea>R96ZGGc9Nw>OJ) z-wN_WRA~!`80M~{e{v_{RC%#XKa#BqbK4P2v?JWyVf4(fd6hyw2#dpSq?@X@d5l`z z)~-Hdd|5$HU9D81A>heVR5(GO7h$)Z;Bzy?3!}9uy6G`=G5i}OICe9vJ-)FEXZIh( z?F?1VQ0*y2(=6dMs_p44&b4>pj^Qy}qf%4S&|||RMti>FON%(Xx{ST_HLAH~PUq>< z3~9%y@J8llFiPN!PN?>E1Pddh1l};_2l_D9-HxHIR!lIAo#^kuR9_#43GIG{==}s= zhdu`towCT}8M;c%33R5S%L;Za1YL6qE&QzNi6YReGk=t@3sc>xJ)N59g!n!9-7dHZ zI=Pds_HVQN>_u#Q#>|zQ8-5V=;{DJCf!f$!~af4i|wfO-1qrwtegYEo10eCIoj zE@}s44tVF40q2^I4ojpcp9Buy~t7&WINhW(mQ$@ z?E?%ad-kKXdmqNfPhfIx_Zi{rNrtbx z7w^FO+!f4^pU33L8LUw0ojvz*>^t~8hJp8DVEhchbrz!wm(V+T60Jj2lzn>%yA?y} zwT@_$8g&ccSdes&?V$B&GI(8Cm>QKK4Ju|V2+@hpaLA3pT+=T*FQah5h}PrD!XAS$_avPnm*CpiIL7$ z3=?`Ioo(n#x1dL3dxWFtXiA_vn?pWFu(eXn(3z?(BULBZC7T(-CXG-&DMd%% zN!!r!qSMMWLe9ml`MwzVCnk`;FtI zj1;a){m9<@<~JMr>u;p4W>R&^t|FH@l5$fwHxqcK>0DBON@Sq!Zn9zs2}%N~?oH5> zDJg=KoX)$-A2MxIx$R&6)8oYD_ItPZQnShd=ybW4Sm~H>c)zQ4DH9PbyLH zFD}Qs1-&2DRP<8#4%Ifp)mN9fOv>oW=JJvIf&UOMX$4E#;KYYhZ z+6RxKeds7UM~)MIhtNUz4fGwxNH>+u+?_Z=$Q@j`j`hjQSQ@{GRj%8;b}tU@ydTrU zr!d-o42v_Dv1^0NRESfYpPapdL4t5->H-F)&Y@q42c}M=ZTOG@z1-klBdk@IG(j)h zzky`uBI2zxXd7C`#?d=4yYm!U`{$AFnnbF76p@x*#L~S8B|D7p)*ESoJEU6NFdE$< zhPEEmT3rV8{E;T?J$MAYgwPG8#}6!*DpEt-fX4-&)5e#+k@HTY*+yF?g^}KFLVOf! 
zvokobrjf&T8Q?p*wiCN&=CL|DfrZW<9PJyzsop`H=kk@gMO>U;!l98d9GsZK>5ZL6 zrsBPX-`c_g=4WOw!H{Wqd<^}g!{{3sLf_D!5&HBoyc#BG$9lRk+C`Puna6Mk)tj2o zcsyuzdC_7em~9R!Q$L#GQ7ZB*QVdz8TBYO+Cs2|EV=~@k^dpuxMb*Vl4UZ#6?*_q6 zWnt>qs&Lj#*cl4X{*20#3wr+&`$5oC!N0;+HM`L^rFxR;(>*H$6a)m4C!c)MXrA)N zfBeS=6m<>(N1!n0wV%$V?P|a*Gr^{NP#K@@Tg`N|ED)K>P*bx^Pdz$h3U38HDLyGY zm5FM*5|XM+Pg!0BWGOw_`Sa(EdwJjc-e)YEW?oBH3|e!YK9`wTL1maGu+8na?1!@g z@(LAg#v>@`-SG%|@?#VRgj{-*K8()nAlkFN$Yt6Y5veaeBc#p_LT~s8!_H&qo;XQm zbrfyA`_VzgH!^sfA?qnj(#QHv5C+FFHgtwBFGJbM3mED@iV*^Dbm$~T=(B`f-@plU z^&Ugt@M&~2+#Q*@#PIe4dZ-@zr_T_2htblrn+oqB!`oE@e4XP5(WDw&LNC>|i2T4R z)(&04-V@ghrPn^VfLzZMqJ&-|H-LCX;A@93nkM*CMp99mCwv2XYPzDH8FNdkMk8f4 zT~V2y%D?0);BRV*!%OI?iH6VXLcs4qm};+?;cahcJEjS}g{es+>E`aGWvae4>|0pE z+SnvkhsUwfH;A1*eOSunvEJ2*sw9X}HPuJb$5@(VmDR<#HfpwIXk|qovM@Sgi$Nn+<-S8%?Pu zG$rX&aO$txL>Jr)$b}<_g&4jDLx==|2>Am>f(0L8r!p@UxGU*`ptn`wDHokw;qAY~ zeh~Cj9;T2|=7c|v!3PuWhrN}g|iU+kl)cO9) zfB7%pr>+D--M;W_T+zq4$xGe8dQu zO)Cz$Wdh&Ub%LG%{)RWaVROwb`%z7>`wqUn2|i;5?z!_3bmvFWLD;o551=j8hqg2Y zPqquKdBRR(=XA6qPcP{jJ7%PQ>Yg}3HFgR^gwy!!c?=8^UcE;!N`OsJwT;n7#xJ0E zW9fdPWAvxflf<+%vFA%Ss9eQ~Aq&hgx5kRLpR=B{1H^fR38 z+llPJE(|Z8ruti_+FPf}+lj8xU0B(F5xc4OddJt1>7GKSYl5)rMhv08W~ja1jfxC)M-%kgTG33kp;}6}3h+H{`29X3$DUMFM|-Q0 zb8c>W3QKddSYQ~sNB}PI*MFA4S{fRm4`aHw2P18H%rg|;IWvtT>uWg3kob@)E(ncf z)y9rb8sYH#@EE43497ZqG2GgPfwn^1LDeX$J6H!p)=qggjZ8d(CIT<*bRcK5qnDqh z%kD<9-im;w9(G$jd|@BL(XbJ&M(AMyZWD50LN28K#U8KWez#Pe%Sf~!*s1JGu4O4a ztuqy#fTyX{*}ueo5cJIORt5NK${|ye6or$5fWlF!Ey3@(%Ee^ndcjC#W2Rb@nab&F zU;CPI?+R}PxndP42&u2Hl%i@@wN4<_vfyW$ny*L&3#`iCqB1rJz``)>7Qn(x^Mx*GiNa{PL(!shT-8+42_=Q{CV^e zetlyMQztn;b_Ro^RCB$DF+}ibx}Cr~f?UUbv=M%ji+4ihg}qdHBlDMd{NeRJz6JV`{1% zsh~(WWQ-4?v2q6n`Y@?VhDjye9Ks}hVzA!`L1*}48Rc)>XPmxsX%TzZRTYG( zsI8qNAg2ZfF-b^H^bTN@;qXAd1HI`ydQv%bwd9ab7s}w&d?ZEXnQv)EN2Ueo7(r%l zL{mcpI$a*jHl;8Yj3HTPfvt20YHQ2k3A*7^4+|< zP4$`SMmW0(IaA@u1w`e$`81U6t#ZWo9y+SI{#nGE5~_d~as$t(u<5q#$*Dwd*rYFjJZ; z!C3n$6qj1nwUqBq`>VWB_o91J|77zXHC1R_+^$b1;7aK!R90>CE&I2!g4W|-ncuZB 
z!Ig{G`407LdN(@q1RsHy&$l9-5&fWsSw%^iZ(5irKVt+hB1OQel*y^fxKqe4UAQ>~}eUOJn(0X(Uo zFcp$iQkDv^C7Cq9*VDt$v%iO6SKGH7I@+=r=$c4Ftk zA{AkwYuIp)`nt9oo%_|_wLP2XA0kr#U8=>XXfVp(=*mzW730Y)LLl$!t{6T}1xh04mdVwx0?+VJT|M{s%3i(X`` zSEFP>Qm*wnm!<-#xlUz`DsvN5iq)LLTiv^AU5mp{neI{0lTuSfhL+9j&H|AMD4kCg z4_YThSzM+kg(v9hK1^84bq;f#zBjYH(FCrhQm6vxtp>^emd3xCdSZxn`t#ART)Ydp zG~d-6)m>f+ud~p6t1W|cJKyV$B;xr9d`UM-`o<1X!5zi`!8bB<8sk)Z)617JF@F)m z3`>WnFA#WlV0z_R0eZ8S7=jXJ1YQ3a)gR~k3BF+}Ks8a(xTe}M2~s7vsf(2M1#k?&ujGCYK-6+w6>T6<@Z=^Q8UM$z0lgc#%WNOK2* zF)F=q3*3G}&qL_B!f12`(V$fFR)(yhCWg0r2<9=gb&>w<3~dP_xte*XHdD|O93s8| zngU@o8wm-bXk)0E&u5X%r_qv2p*fvEi;@szQs|>Z9--o!A05NO*f^Far?ALSc7m#I zgs>YRSk>LDHIwH48KVh_3hxtyX1E@az=R^U!+(lL=U1V5~2qY^bk~*Is|HK z(Nt?ey4H$hWgWcROVO}>8!UB|2J{>ZfenBWS^}S!%1+QT!RKOVY=TeFGb<}h=ruH2 zVYM~FzPYT*4OMoijIcQM({ovI*_Ao32F7M6D^pXF;$@i>m0UeGREDS8Qkl-7>?z99 zA~mRE)w@CSN;fYR^(TMwC&s=4mkC^RUguIfITK=NwL9-YXL}Di=qkTUF>1H2G)8_77XdOPtP*nK)K*7jd~t$Dh4}%(BIflY z8;K(wBJ_eGqzR;SDnU>YPV{6d&d@i86ul#xGAbRW1{k{b^f7epMt`mq?ae8)Qo*$n za2dia5sMSoDGys~hvhR~8i0On8`P?qg16CJc(#$%>CH21c=}Gp}!& zc@Ev9Ij?)Y*?kn>^A9rf+9piRV@(s-w%)6`ulCz|?OXQ48vj;mrzgJwy}Qw&^81bs zhOv3H^|z5LNd&SUG$!g$-&6%_x(=Q$2TJDFsn8}5Q_&sA=*&s_X-q9%BKap%35U%#7h6!cD`ZRj9+Cr@Bv z^)hlp2ap}ui){Zca74)d~GK}%nghkL(Bjp&wtU#j; z%^p9}z93TG0Ad73b0mx;0VUYAq?*vuOeiK|Xm3fPzcq(Gs<|$Ps~rqi^9)m4s1Vap z?n?!!o(>vkH%tW=qWkGSx`&{1%Y%MHsrh{#W51~K!YS0YGi+5e8kZf`>MGbOD&a1# zg0Hd${@QvfKnonU2G|@@Xa(R2Vse$wIjHWOo33zHz!MDJgqvA`A$QQd497h5fGR@> zW%bxl?+vy0JAC-?=EJqs1P0TedHH1mzzyuRDO33vnyAO z5O;R%P6K@MndLhOG={7M+wf)>J4(Pw2}<4dk1^yO6*Mnke(f6O*6zX}fj2UD&PX&c zvTz(&3PidMQPPmWqavBPhF?lju&TkWW$#QpL$zR3=AA8la?N ziV~y|f=pe!LIN69oGKj5CL>OIqrK5+HlzJQT<=wxn$=3xrpg2=IRem0n0Xr-;phE? 
zY)*unZh}nZafPtdwyE^w3Q-lV7nkwrngQO6l7kvGy}CC-Im&yEFpLew;)o`wN~!MD z*jY7P>dL03CEss)xKw~lNbsK-u9+`zF>o|rT$^cvLohI{7<^lyVw%>Q`?4MTOP9jromUl~efHqK#~&>LSkiJ6rPRCJf9 z(yn4@_uW|7c?~nGcQCvq1nCoVw_{X2EU5AXy#c~cl@&e1RCKDPojOP7_8FD$oubk^ zhT*w0gx+EFPoKop#uc=W9Y#yR2FA+EsI7Gm(nE6j7pMZ(g;`8_ttPK zIaT7Mb6K?Ha!6%!NK^S5A^&FMZS~T?3Ns;B3`&`fGl5$SLi0M>-#mBmwM{c2SbTl0E52`SFTPjJ7vG0@eRC?)`EK@^ ziq9*%1-ySN)Az_95yDU3`sI7Bp{YF$Zzcq5%7MC!1+{IpsOhOj)j$Qxhe}a4Qi}44 z3Y1K)pTXD?fwXd#kh{dt^eW~Ez198qVP4Q%yMy4nN(Fcklk>M@dj2A&<}PBK3a_B@ z&Ju8E82Vnq%qAE|rq5FKsjToQhGtKrkLu2FhR?a7{l>uGg<9KHGy**>kgoYo_Dy>>h+@|96<+RqG>0+ zI}|||rzxtqrhpd-hN^Lf$q|OO1xrIQ>5M-(_QNLTd z`jM*EO7Jq1ENE(EU1`l5u5N6tot?<)2f+9~3Qc-c)~REU9z9xkrVJ}QTmoJ(^kn7> zytT$uR{U~HiEV|K%!Gud#oGm;Vqoc9CU}a^Ywn}<=6o>-bzO6RZQlwd^Sq{QT`o4Q z-+G@W(6*kp80zM6rahNpy=DJ8(_cxhZ&rVWGk08sCly9R!inmn1@#?PRCU*)tgjq9 zhIU}v=ysHhmK1z;H`U#)+cCfU5?1!zNoQzEMYeYEe#~xA<*i)CG<{;>0;U$Y-|`jA zE?y?|ZZ|3`MyAgZdJJ``?k46gQ0bk);P?p)sj)Mm*Qcg1RC#?<$I&rz5SjknNcV0S zl^4xQIM6naCWf~HULY|9U#u72s4}XwaVl?xCzORNK-dLZ4B$BgI&a)?JAqfI#314A zel*y;bPrtK5EfT9@PZe=6g&6qW2oD;`Esj5qZOUq?Pyi)EW_4jhMzgN50j}JhNDRW zFlH#fyoYKr5<`Eo8GY&l8w?@Ous6fdG3>AtXoQuAuyVT)AiP2hF_o1$Ky?-+^g`TE z6(vDVgC2&tT&IDFG%A}aGQtd>n+UXIFpN}KP>UOxP{ItIBbgMxmn_mMryz)ORAsI5 zTpoFWjq3$(wK6wanhWGyC(j#@E4b2X31$Y+Hfyhq!d@;LKLi3_tF{YzoTu_Te*8q? 
ziMnB0csBIRm%kWJCIn1yY=wdBX6nqetzatNZeCC4E(VWj#p`t3&5kuq+lt|)?OT_( zp09Yj?$KN>zMrkv*}806@wslVfN6+;bP^cil<9%LsGl)}{7T znQ;a4RDC-SJYaxubpAZX=+n!WFvs-^RE5)v7cnw<5|eZ1Fg8OK$?$f0_njD;AqZzr zAWe0bXIPu=T0(}NYM(<)`YL6GA!ecn=U>Ixn6kuob zL{M+{8Oa9hUP8~M(b3$fYj83=bRyE!jFpYu_?b8VEcP8x#udHng>haBSy!ln2#7W* zBm%94N-9nG@tL0ma49up<wGwu1t=yLHKxccm;hH~s zWD|PNdN;LDc+AWV#eh;@TZPvOy;XLp=`H_{b2VI7v4?t@$Tv1N{-05qdiwO_>1XY` z-}^TH3IBjS_Z>iWFV$U-4W(UHZ0ok-wmuuS4cc(qkd4rjTTn90@OJ0nYuI`CZtOkr z01lpcl+%X{@GTHj3%l>a@}9dfyLJUr%M5FmZZ~{&?{!QpQu(c1!X%fC_IfeV^bvya3T9Srr{X(=!Re!D?cYHAzy|WYD}1RJkjzgZqH?@=FWljF z!YvOC0;Dijp!0+a?hY10TDv<&z(oq6^Jst|rIsT6jNTX?0?%bYuQ`*)?U%3Ojc?TxNp)p?-@+nP|TfBE^%FhT+LCpl4!bG_$28!KWpH3?zm2%_#5I%CLG9x#hU?$iRdUAE0$+vUg zwyZHwkvuCEn9orO=Fvh=wYM6;lcx!Em0{)xRJq`0syRdL^1Cpu#cgeQZp(6?9JdpA zc}|ZVJ^38alRpDJnfW4Vs-6+*YNqydYR`7d|6^V41J%AzZ4Nb+ZB@bl8BI^0pjY_G z{Vt*R_t^E&UQ`TQP(i?z4%ksb;Fa`q+TXZ2brq(AH7Gf7@;)3mc|Q)Fc?d_(KaK;Z z9>&I@d$4|xe&AmEeOTCeC*}-zUBdLzMFV;&_gmV14GRo`1-!}m3)s2uKJ46kA64FY zEKwaw?Tyc#ArwzyaO$W5y^f*X$aXJNYZz?>7w4uVNXYOzo&Oi!wL5}Lq z=}$1MO&Vo*HdmBz3lMY#g=eIdYV;V;vpN;tdSG+<(AhhHCtmm>yzCXP!uZ6L@zM)+ z8HR@Hj;5MQ72In~TqEPKLc$DTl4?Ol03+y{sa)hO1e{#$_p-?(^63_2H=B19m!k=4 zW?5SCe6brfq;xaui>|H2nJRnJF*&NZEImWO<&7)Ftk@HDtfXxh2~Tmsg?}VnH-KD zqtd&nnc#Z{dS(b$9PZr;-~QuGe-~4Uo71h8D*stcPuGo~UOsD5dOIK3hq6H{q31x^ zkPF*K+*EZAlnfDi@?k=6#7VD5$*GHv;Q0B6aQMuFICT0!0`NiXId(sGAEh68fc~HX zx#gWyee{L3t61NA4_2rE1;ROoxpONFdzY_Zag`zQ@>NXEp2x)OIgC!7!p!pR7??PU z-qC~T9NtUcV<^3rj(Nlx)&>&;^nL{5-G#6=oP#f#C+w0`a&cqSGaJ>NDl6=qw>mW5 ztq(O;FQKQlc_CPAe(n=0fY0uQKNQEz!ZKd?l9%D~)jQGI)nh0;qg;z2t;)bOeuUgC z6EiXp3ryUvP0EijXd}#&5Ww^fLP_4%`bse$yV5 z%!KG%H^b#i*Pf4aS09JqckKMbICA!3!tdb%_>RyI-;cGucN28iu_5?v+==x)*RgZ| zeORQLTOkN{A9w(}sPLB8?!w&SWlYaq!16i)xq8JoZg~0xdPWYQwU?po;4ZZGts&hx zk3>#M1cnh!4I-TECF~g9232X%VgS!Z&^1!oH98r_5_S!CDm((NzR}NVzyP6*p{};6 z8(N*!13SarRHhvVkDkEY_uh}Cl{Eu)tx{5iN@urPZ4*S@okn?Bh6*Vwm81#)L5KSZ zR05Qyh6>zVX9CQ~RxF4al?Kmtvxd}ktuvIJQI5Fj3a@!>!v#6IQN~DD*_~8dYgggk 
zQp%l2sHHMgWi2Tr6AYYsH^ODJ!fvUBqrMjQx*FK(s$s3I zf~BUy=(}4}TaFq_C93Ny`JL6E&QeXUA^hr$_JbND<5_kCc$>b(%vw@Y;pqi4mv8xh zj7#0=@32+f{Xg}iq{odc|0m9@-g^Yw`W@IYk({UPh#6z z65ChfD7o|g7vjqGCvoxGlQ@6J^Ks(BBZlJJcl>_rF`#!JHuhh~o`d&c-{A+b#*lVx z_uT~D{e<8>*n8+f96t6K4jy?FyZ7CX)s1Te-zAJpoW#)haZE2>z#x@gAC+F4(n)o% zAd#Oj%JQ1BV~Dj35qN!6dY$lw^Kkf5RCi4U*wIa>X;AMvsyj^uz6N`c>%*KX(Ll&h zd<}L2Aliby!ExTxDeT^R0NuU)2K1!l1fZ^-ZUb)lu8u+_0D&XV(Qnj%5>`@VCftl= zsWO2~+vz6c1X43?Rk3UR4L6h?&z~=NR!Wk}Et6-cNrje7DV=*9(P#w0a1h>LfFZq? z;CI8~a-iNpWhS@TP*-n3ZFManR*%NoI#}!KV5zM|eGS1?O^8)hpuVac4RmXfH&mCw zLa(bSMQv>B-F}YICyHX+KCg6jyNEvi~?Ue|-Mm zGkqpLf12=7Q)|i9`%mDuK@Y=PKT5{vgx^({qckmt@JVq6G@-gh(cMn!~-G!C)JE`pM zz{c+DgdM@i=?c}|{L*DiOrOE%F>lLa&*yi^1wtGZaBBSO8vw$8Zbh8wfoqJ&R4<&ZzRZpN-IS`(kLxc41_E z77I&j=&;Z`+Rvy|pfZySoH@cvO+J)?)lh|I zSiI@FhN!}T$GX6%RF$4l3csxG1TyBPv4Gbw6T3Q%p)-$}cXr4f$S97`AYDE6#%Io_M%mr9^BT~@3*Zux(Vo3`~w zsJKz}e@@eLNyVxkUJ6g)M-QI^$s1x@PslcGqff2((OxV%1k4knLw^< zj_c;p(l&umQqYhU(fHRM*rPX^IM7ODJ*Orq@uJRaaLTX@{yRt58LF31Ai6 zR!ZMdSC7($MwHu~D78DVov_{0#j$;f;VTth$wm`O_9n3HC{^B> zJa*hZfbEwiaNC6$l)U=`AH!RJ?t^&wPyQ?(dGfWm^S&43+|?%ty@#ms?!lpx58>RE z7hx~M-1R+oVR7ve=9VvDVdWwg)IM+VcB;FJSW-K^WsQ$Pl{cl(_86xp2)(1|8rXwO z$1;`OEP*#^)YuAmt=+RoUj|~ z)ScZGCiG$mMp8(mT9M9nAd@4++PbLFlr@>kPL&u0S|Rk5Vk6ABtbF+bpsDUu3#;y0 zO1~b9Mq!LC5e&lb^BJA?m8GPip`oD2>If|Yri$UD;8g%vh2h2XRfJk)Ip?=}1-DgH zqPl{|RuES7it=idQoZe{u0yG%5x3P_QSSDkiVCrWpu5fCMwvH&+iWhB5P;il4(uQh zcksSS8?C6YHlVV;7L`1ofmaDDy@B^@F`SSi;1u!-gpGury}lZDekPRz zE=&@70@oJkX;wIsYkuoXWG?H4Qx3f^ed$a1{O3P!G@meELQ@@Ts$n9PVYxW*r>RhM zE{%Pm<4tAqm9Koo$Y!jm1_@W9Ql)k>yRMn0YpMB6PfyQ{c4~6vyi*@#V}K+jTcE2i z^w0g=&ut#5k6iSDZ`qHj>2LIHeD^z2dw_EJ?0)GTDBszPihW6x9co62{8$PlrxGZ+ zJ&SF34PwW=v-CBTeBdLW!h1gaN&L(^-jAnV`7;c2Ux7>4UVsB9?lIKfm3v;!bw5S; zJ&cuISFo^pJC)uQED?UQi{~*$u#HcjBm^&Eo<7F;0VO6KI>6~6^p6}wd#_a9BB3{f zSb7}cQFs{6 zdi`s~DXK4qlUYJ6S8z4RRr@?aPf(MJ6ZG1t>{N3r*d?ggf+2;S9wYryL%oFx zh)Rs=%k*jkPI3XOoWLrhml9IjQC_y4(;bFuzPxOQvAwFYyx=@W$5s(^TV3mG%6SdK 
zuBuGn6R<2)Z3I}O6Sr9iGnW^o1f8H);`CyNj{tOf3J`RAvEA##4!+Q3ZWk)3GAjwc zDuPQ&(n4=w*xSgdRr9q4x71V_02B~yT$URZ9-KPr>uw4?O-$e!UIYhI-4z2v8I+Y^ zQ>oXLDOuZP=2ZFe)PPpwQmC(Qamb~4jfSR>U)wc?g}|s`Z_Pnu4N!q_nS@_*2T-oR=$E>6_m4rW;U}70=(YA4k)3CZ(rm4p2tM-LJla zs>2CX9E)Jbi7>XGP2#rm&DeICYVTSG+aByg>Ekmfd)|5hdLQ_e&*07P`XHYF(%0k4 z^%vme1*yDi^t%o4ow@uZb{)JMOFJ)NY2zYRc3vUiZpYO88H`LG!^G?YRqs#uS{Oe^|}wMF``?Y+mAB48|7X<%KZc#;a9?S+q^*n&s|ui0xS(jQ67$< zoT{b5?M4-qjLHM`nN-1{$_}H#!>sh6*Vn1yqsG{7BNS|wdNdMt0;1;Ur>Beu%6OtS zO>;MZNB_3H=P(Y(@enfIqbdx~o{Z>hZ> zchlcTT>%RjR{om*-}CeZ)Sd~W`ho`)7v0!#C4?P!MX}=^0`S2&wmpv_?o&f3d&vSy zKK!xIQt5pHuYKds;l9USjQV*l~GvApLZ<~PoBdI^iW2)o^vvAX9f72jpd zE}zBJg8JMZ$He?8%&n=n##wX^?=vballh%&Sd#(D_6E(Buj z@B}jqX`76YR%Ljm!jrla?23VBv-#k32KnHI;PwQm&|-);rI2dLA(L&RqGMRwYC6MM zDm+ytn3W6apzja*VYOL#ehaGk@>fv-sq9UoUsdqstl$e&O;FWvzFPC8gxHSlhF5JT z)V33H+e=VeMioQX^cml7cpX(>4V7QD;hX2HD1VmMQt1_-SIP5K(W_BcS%t zH*#Lfg$k1fetx+X4GfJfoN9f8;KvYoc6wTG{^r9~FmuU-p0Uh*idB&b4h@1Ml_%I~ zupXsTmr5(nlB}%50+qID>D*?@b~E+5t}Pe*w66fF@V9uI zAXuDwO4rtXYm5tXcu9@9d&~YanfWC0`Q!ipC;kEZe&!?^t}-gT%YvHg^{BkpiHZk3 zD1XF-(kDFF{-P*KU)hSX*9=MNeHw3i$9wU}3to+LS0BdVGuN^I#5JrPV7R*T9A?+f zVtVB?rj}1(V&Md)7EfVvL4ML$9-Lw*JGS5O&Y|7N_pTw=vx;VduDNX%&8>yzDaq^v z66sMy5(Dr@RB@4mJCHWQ+6H@=prg7|`@2S^doP5vbqr^>0?*5^HV}*>5>;m77Nj$3 zULxQLa&lEVsBABLqZZW|2q+jZQl!@qbb?)jt&t&fJ!+_MYUwJ6E2oMp-F6$w3AYNu zt?~xsZX@)z(QiWym0fLV2`puFO}Sjhsp<8CAJ^4T_0>}K34S#MUoAskc^yMud5tk& z0Sm)tO9ewtdVM9;S2e*$NLAN2pu)m1mmzmGVOHx8pe`6joj;690=&Hd}Fjj+<~gqefqklQ%lNCnuy^>qYYt&6IVUPI+mZFiv7re-k?G}5PM zW{t<{W@f@qwl$>HvRvV;V50t>pZ)A-4b>)B`JHL%t|sVdKec}pAmz#psHw{16ut^_ z=5b2#ul+S7q|Cgw$_&kLSEG*=t2PZ1{?@m?^@hqcA*}(xb#9$c;jL-KYV?-~#!b)h6$DtnELK#ocEyvvv~WOUE&~a18yE2QWbI9@&ka(Y>7R;dBrB#`mFf zcsE-6*OBd6L8@Z`NvS$cW4TGhsqB`VQ3beDZoh&H!D(Yc*BPDz9x-xm#5=r!{Dx*Vi!o zRI?R&0R2D$zop8x2rC0jPJSm;c{MH{s)A8eMVe3-OQL~pA@u6J{O$xUfR zd-QfJ?K*+Eb%v`;M=-Ih1fYk}KXm{dLpu%d33{EJZ|z$(x{Y4KlDe8g`#9C28kGq(Rfen5p^?Ba%GwmBG7PWb%dc>> zo+{1KU^9letnr3WtTrm`s03`sbmR0Z( 
zTvUmmvOJlR5|~O(qoZjENrk{FZxitT?9cwpP;UaC8M3Oqqt2u8YKtMIfq8T;ZPT>` z$mc!pc{eI8e)ea7_E}|py3c+4_8Hp*)z`oN^*6w6ewJJIV{G~p8NbL>df)pN{ulln zS3h!cI(8pAi-nCN zm|i}JvH5)rS$7+Q^Yo7Iroy9|8`_|%+sn{(+2}r&Y@J3TKZ$5&6n?6=U}^}>9W!X@ znnOH4fpB^R{zN}~@m?c)N+3>k7wt5F=LzKD@n;FU=w?`}lu%N6RCffP^5HeA53NJM z^TN$gPGxtocr%()S!A+Oa;lNtY*R0ePob~*L zs;`B^Y9ahAXe977{)N4fZl&|PsioKO`?0vZuzUP)_~wuQKkwG9jkkIMk({~Uk&w|{F4ex&U> zPD;J?}M8O zJWT{c)!r6|wX&z5e%b(uz<2%nbwh2b>4~OhIN99XY_zMBI+GGpIIGGF!OJ|KKq^R? zHLPZtp7O&<(W$KPwXc0GUh|sQV10evSf}%7&~3BQVq#+AM$ND)X{2)1d_`ff#<5j1 z7@besHRi5NVBPweZrP8aNsWJ7&{L{`f56}2Pw?hnzY`DqQXlU4U<=NCB#1*F_hRp- zJlOpy7uG-P#PXM%nEwqYroLiB$>Ppqm|Uc~n_NTh=rX#77SY->gIw1%G98m>X(QzF zqll-K6rc~GL@y%E{ahZPife~6NSKAP@Wt9T)4Eg18Ts%8x)#{I&1iHd47a+P(BO!{ zVvi7Ve$-febODblk0EVcV*z+h0WTCrR6`PG(wh|$QgzBkLddDjodLc=#f5rGgc(jb z+-ksVF+$if!fiVtBxP4YRaefJLeQ(`dNnJl~_z`RY+8I;?UeFpG0R~I)5mNHY>odC4BxM-{^4k@MV6n0DXNqMRER!|eH ze)*Sw+1N*FQQ(nDk*QozO7S;;^EZvg&H|<&C0Lr_tN^QKC?`&wxDlG_z9uIpjqL)T zsRE_?l5i$R?IE&A8_LpQyA07Fyj=p39!d;c8TdS{VtA4jTn z6v^B$VlDlITrYx6T}BoZZ-iju)I+#AsMs0_H9O%Z_&I}V19XL?0Ve3;XmG|*Ptesh zhEUTGKy|%$(>-)I!ADp%x?yFw<_UxmiZvmoWSrToq2ScM&MdDp(>bXFe=gU`7oi0K zm1j{2RWS4^EmNqv0Y6(1} z@^a7hg z*f;?w@P(=JLO~B=AwQDg5aJrZ+^=->5hNJ0Ch5_56rOAh&bDTF_#JxN`8VzAhJ)X+ zwIh$ZR>Clw#Ps2PMkj8=L^wsTo1`Ea`6L}uQPOy!9m{NM*SmkMg{mi;F%JzsnVH7w9~@fUdSd#>Oo zf1(d>d3zt;`N2NC?GxR2*B1uxp5LCryT37mxBl)pp8nk-y!v;0P%<>N!Ekno;i}Z# z6f&I?NahK)7%EPXbB7CAjpcSf!RAe(L6s360*>=G)y5KZ4bC{~Y)a=I zp|UHu#m;cHF=!|}!A>DzYcu-&8fXb>Gl$BRf-IZfoDMMJoPS7hip~qkOa(Y!|g`xZ^IWMo`3t@Vp zJ)OMJ&aRd!O|`ZbPOWr-Pl`_=u8Y8PS8p>Ep9#Jg0cSYj7$N8aoJtwG7}h!oJO=^i zpn`KW)-Xh^g2z^c03jG)nCsK85aiV8^IGhcs39C{3B}sV zaw9#Hh0_MZ%h1TFjZ?W@;cAVP-wp$QUQPp4e<1=dOvr`kL58zlsX|VTkd?r5S*i{9 zSP3~>C4$yU#0k7eods@(8lVC|G7n!2zHr{2_u}1we%sF zkg{tf^xBN6Cz2!Z1UsrYFIAip0=QxZ@J#3#fD_zQ8_Q{}RVmyHpexkOO4Ydy*j3g# z>5d|G7eScnPOviutr6H-1vx4@s=7j2b@id`FjAR^7^XQ0h}s$pfmcWMR*mgD2snCa zMK#ZDAxs-lR$YtjDkv8OsRBzmE#tJD>jlRu&R16!yrzm_5SMGJs!>~2g*r8rsHUrV 
z2_dMqbZSmwmvi2sDb<_G>72Dxb95g87m&K+x&T92KiyMT4tG7L3}@Y{pintoLoIxb zweV8q`5E3OEDebByKuMysBg)kVt5qWCZ|x+Ka7gO2{aB*VOuJP(p(0v(S9^^QDt{! zV9O=pOhg&d3V1V7LP25a*=7R9w5@>o=OCi(f~C2ijuWVg0ksuy z+P)QXCM0$JV(6J>&TD!zD9g59=a&6=n*Ln+ixr-jzrknT_bA@;nnir%r>F6Ww~XV{ zKR<>~|J(>Z`}Q&X+S|wR$+r*TV{hxjN8irywnb%kS#|#JMTnuSKiX1Uzsox<3vI;SzMwYs^F z8xC&}euk$_sWg?H!dN8%P*!7QNKrZ`jR`^MaeapBF5Z-cmx@O1)G7$7GJ>nTf)JyZ zmJ@tjE>(^>0M$K zn*g+O-d0;%XhTOgI_E1Z2^E~eRw+3z!&IN)nx|9Qc`T}ss3zp<;HL|C!G?PH?G`xP z^|0A%5wp7xvHSV`MqqF2N8{WoN+*|4lBe1mT0&L#6mHA)qiT2(jkEKxO-;cuK7!c7 z1i}Lyh%_e{&PI^+1&tvqb3PyP;Q(^U5L%J}q?-xCWEfpkdVR4bdeZR0L^JxDljtVw z+6cQWA)3lk1v2#Ab5PJDgp7YGHz~rOX6WDjb^#!!vMYwm&Fp{w@Bh6y_Y7FfV~Wez zZe}KsG!+<4IGV>l7r=^5>rCHjy8hO4+_E2Y)1Q%Ecmnm6Hb3D;u&IwMA*ZY;X8ya{Mn9?;vP#hjZbtZO(bUCdr!iHZTAMovr!Q=% zyJRMd9HC}_ZnGJRQDI?JR1~16VG^U!I04~gXj*5$ZhI*eTv=hdy{r=3=sN_$+5+S% z8*R9apxQz3RXbd$b2F@_I+WMCyr>~)qzJ327Ui|-@5*Jvxm;IM4+{Y)0LmN8zN>_& zywSpWOD(+ycDi%3_SIcqP;&yEmm#d%nAXB?t4E;Gf&hURvNj-SZ-m=tgUw+<)agOU zL6zqY!`U?i-`ZZ(&aI=QeGKl|{n*h#rPnoqs%gS-Yz{T_>i!`FrbprJ$sv%4ArX$C zEfg&bL>>vFlgh3&MdjB*;L+P!xQ%e^OSWL3Ig7z$7Q@LD1_{01Yzpmpg0C&b?>~iH zHi;vL_whmcwt+j9nS6^+*A1=B!YMTiqb3a+}zNVSS z6e}|wW5UZkekF-D{B)<4x_|kuQ5+C`Q-T36&R`Ka~&Ew-g zKY>sF;v_!*?nV5@d)M%#_bhRKmd+?jx$LCw1iL~rlN6zsG)5w`dE*3L)KGZ!4u-2# zB(*jwIKhvKPT*69V@<0q9#qyjP*v|R%IZ`>QElNomo*lx2JCTq)b1{cSTapjkt6UJ z!gh8Vpfggl(_33L=#EDEYB$DyoZ)LJ0cR?^?Ofi*>1`^*BbThNu~4@tg*Ak3jBpF?jx&2=s4 zY0sjwHI4RcGrDtWoH$76DKjnUCiH0x0-?k}hj>8$-WH{&^0C?3Sxixhs>#a2!UE>z z=CHK1WUMcqs;k%=-}pwOE#6khY1ssm=GBl`wY6&Jr+kO%m7z3X>ba3%sIFRCrIk`& zUfs)coky<9jYl7S)M#3x<}c>hu)$!^$i&ju*JoVk1uuBPjnG=>FwavA!{YVDQ*&OX z{dHZ_wkq6P_QRUKQ~fOp&nWO$_{xVK$EV)155NAitN8r87V!D^&f)X#o5z=bX_>x) zFT8smpL_QVzVMzIl=!6X)UM7*;@NDdHzo2k2IbM{V=jiX1YSLXS7-B5(Rt`zBXl*r zhQL$!D&PrpDzB>{=xQ1qu-II%xqR>hB1X-u#-q)&5_Vl(Mn#2sQ3!Y%IjpNoKbgTtU^fzm0lg8XXo`@Zd5v5*iInT z_yZ_&QgQl&Xk=*Tpwg)i$FR-Phzf$ON}#Lbeumo&udTPE&O%t$Yu<)>eui2uTkRg> zbJR9)dm}+C@X6J$RgHt`PF~aNbim`V!A;P4=w3%7d<0&QpbPWy2kZPAD(6>WS=kHA#5(+hVUP+rmM)B~k>O|S 
zq<0g3U5O~VQuH)cAH7?^%Qf@6&ZDKL8_7Yw@FOEg_IDxMr)<(W0*?UX_uiAwqL=fh z44$%n~k5>SAiO6d%|>3!96_8o5HOKmDSbsJL;3$txp@ziv+$x}N$&8sPq^5Cfn zi_T+0#)O2XYCt?bK7OM&hj~8r;xO0ARWq;N8%jwfD-NYi5WeeO?=srtX@50gd9HiZ zva-1dwlaM--HTZZEmN7H2`=6DR@JHNX4tWj~7X2kqPcfbV_Bd@BDEU;p?E z@oPW3A76au27Z&u?u#E-#Fsw2gx~(q2EO{io%pQ}uHnn?U&NQ+H-{3Bk)l0y19%Em z8ysOn!8K?=9+lHk-3fLzR)&aFaZ+*`q(>pFV5f?S$~vkn0gsT7vNQV5Iy`WC0|-!s z#G6};vb$V6m7U7&HpiLB7!8f7>{M|<05(yncs&e-Yif*UBx+w*UaBr+70;@r-A)j0 zuPXSq>S~lXP?=EyGz1uW(@Utp9El_r}!(I)J$@_q@lnoSqznOM}H=XF2Xc5(1Y;g7`!V> z@aW`D~5+)q^XjEx znhB~;e)5w>_cil)P30y$i_a>KU@s`TW>sWw=S`)doUvO5v$vek6e!Ul?VCRuvUnjfKGD zv{tYt>}pKl8ETeJWvh_BhTiCp!xe2oohM8s$xuH?00!d(Ekk@?giz!U1w(iT)u7YG zaFww2cr^9G$9X?J$S^xZS2L54*NK?lft1gMq}PG4*9O1e25*?4jB}K4B2`ZqW0#S(MRwNayrOxwY{?qO_QSt z?plNG$U)Q{+Yj5Zod_RTMP_mm?ft{(85%<;RbXbY4~Ng5DLhwtr#aQEpm(Ez2UmP= zQ|Za{qW|Sz{-ps9xj>>WUm8AhU|_&V6D5#nUTNEyjTz@DhRor^hi~+{ z(0&3_7o!6u8&&xE_~VZo{b!Z_-Gr4~{c{z@76Vz=FWyJ6)cu@4f8I!taPi_r9655t z$o&_MMsfD++07#h_aso6P&2c{$V@e7!q2>S6LRMAE&I1NegFLKjfPeHyL|gk@ZW#) zb@;7cz8zovl@s{wPaMLRKfM=U`RrcKAHdf>b^zb}_&)r>ukOPieQXa(T)qU=SRqZj z-5sMkQ>ymBjf9>BMMp?b1ywTqD|REC74WF?Y7`a{c#TdE&*evuiYnfmMmi_J6<}wy zyHoj{YG*lBvlNB4Dx2}B3t2URwqtuCpPi;f(9?rjsz%2GA4A&G%1V@SfB6pnitnhc zL8(#^SsJjNDo?rMYJCBt49^}*!Wt&D5^0n%Of92|s}%HZxPYy2oK7`c!|=Vv&amA< zmCNwHir+<@uZibqAvBv%?N7jx$f3R|i`p<1ay&yoHKWlJfrE-rWtT3WJjfrE5Imgs z`NQz5p>ZI}&^=7|A{y`@Nzk_h{AdpOkcjvZjnU&lG$&(-MuJEt;zm0AU^Ipxm0c>= zjuau@$xya07BfKCmukUaDvg1b7W5O6eXaC-5`*nrr@_K|x)5EQN9g2n)So+xZAT8l zdSo90dv_u}yMWa6Jerr65no$GczqVT?z_|Y4ZTrpLda=(GXNC{jJaY{`JDbnreeAo z$TTl#yyPV>d6wc*XsWC$rl~=(z-TH*?fbw358MEcX*%DhKJ_W1T+Rfe_BRs<2u4zj z0%=hSuqQ)VhlnUtzbp{qVi@ix=UrZ_U4+k~BLYf~59n_Q;-bUsrB79V%Zei+j; zrRp`6IP359SNNMhcqjhkOON7@zj!yk@#|;sJHLGv-~8$s{O*^|;&*@T6n_6R$MA=r zK8*kVtNT%+ab@M^fZ76{%IVB;X3Zui3STQ~ZJXX`cqQkn=+y+ELRy=XiYCmLm7!L1 zCP%ob`xwJmDLhqB5OPv>#)xEuo-!G$K-x-Grk)aIgq?t=E@FaSg#kS(J_C5lkU|## z2|ObQo?P9G2)y#TI+WJd8fAOiREc4=(d{T_$Z4UvtM^io1!CAHU^g;MC!lL@ctPRa 
z03Kmq;|NkY@;E}ShCc{3eyYY`3YApEwNb*ZxgFKXJgTB;*pjUV_!@k1IQ&g;QWZM= zQMd>=xjPtxHx!32+{92nVTAiZDhdry(-aINMHQN%YD*DhafbKt1OXTDAsz~#J()y4 z97RX03GIYlC&8AB$BeSOzLdh)6#CLF7)fO?%8+*`m%%`5nxQH|*p)+XXFD=fFp>2& z1gUIlZ@+?)BgbK32y5TH8~)Xui0#>j;GunRAKVGg!8NQtdat4EjOR)3Cz+**%JvG4 zR~Y&kNgSV1H|p_F3^L`!+p2U-XbB!>{yTw9J!VvXr}YAYx`q{p!Ge}q#%A`;D2B`v zPdsr0wz`LDWqzjVe1e|3qG_A`+;q)s0PQ zv}9Z1r+ZL(Ex}n~d@47X0%=nO(ZMzK>%2uow zf#Hwfr>rhFcRZzuMGwiJ5 zJ(e>hFH@~D=W7}nh8ymrE2I_h6w)%BH-HyHjcTq5y7~~q+HeNdAu2^duc-}HEgdLt zZbMltgStd3tW7yX4K@ana0HWZhNz%JO>hx(Zh}tk;rHW>H5;A!y;Q;8rj(HsH5o}G z70DpM@HrB1L6}M|-okZMaBayH+6cWaD!tZd80qE+(&-3t*(loc33Rlk(8KR@tT}_} zmORFDISjRB2)r!%dRo!b*NrSyS9sShcut+8(z_GoCoiG#%mvi$KM4EbBM9Gq9?rAJ zVLi4N-b1^v^57=)^hD`h=TyI|3MN#wabqRTgo4zO2`RaO0VJ3e(N&D}4UI2R>k&_m=%|rYBEN zp^BW1F5v(4H~8jL#jq;|o>3cn19%0{J>yh)8o;K$k+5_bAzYL%a9ZP3Xncxxc_H1C!q}`r ztd1Nq?ODWH;#56e)Y}+>Zl-@yCKP2rF%;f5Dm;4S_8rE6JX$a48D3G2^2&mnLo8~z z!>XDpo~y=i!LOPju6kwI+&*J`ScS56RH2nrbQX62QZ&0~x|DuUi{cPsjP+R!`Di!MSS zy|#hq{-f|6zX0!81g3Xz61V?r{O%h7x9C8u;>2!jM~~8(4*<~7x<|0 zIQMKg2o_RLCaCmstJc-rR%`;5pqI^Njs4_OX99&x=aq8Odb4suWxmDN(>cs#olA)c zbZzr|QkkaWQVzXB(MYLrJ~fMy72lW6r)6{9)~N;|((y`+s&ff;YD0SR`}(S!Ih>Ga2o3=bd*N3Rb|pWk0;>uVO&YT=>Vo$LD|YVSMP$MSSwX z1^m}n?Zuz|{2Bb!hcDyrKY1Vi?-w4%|M}{}_`BbK6o2Lhq zdbOZO;8oSy=>=t1R#}ho>UyJLZ%u;@c8?F?c%l%>5_Sq@)m_Y}jqUD2rY(n5K8+k- zz;t&Oo&>?;sz!ya66N&_XRFJwy=)t{@3;-6JCw0lVJ&?-LAPxOs!PiVLRDxGf~wpg z>?$hDP*q*d{Y%hbA>?WZz^YQz)>omys-%>+!DiDK5mv6NgUura*GS0OsPwFa8C9Lj zhYEL?9z}&CfJy?e&f!6$%MY8BZ$l7`HjUkpg2k1E#ZRDxvZ#rrQBU<{C0y#739gn- zR1;j~RC|>nD#CDzfNX}HVX8F}Ak1P${qUxzPT#!TP80f zm+-e2loxq>I?=Jk0U%MZF@#T9^qS3|*#OR+aIy)uGl}k4Co>4NeE|UCmV(nycJS-RayR)DwbrE;j*3@VN;*wNG?5 z!QyP@HiFKVK}{fq>Tm+p@i^)eN!XHkI8vRkwsbMvZABf!Q)@J1z|GEm8sj`q#0P&o zf=G&?tU_Ui#ib+jXufluW7a2$dD zQAFB$jPZOs2*oxo=lVyG@9RfbZy$y_d+FWi>hD75KnMEiBLkh780f}$e>X-4doeLQ zh}pprPDjx@JB7}bIdrWpqK%&3xrFS#HMH*EKl^aiCpP6M@sz_OvO;dPlo=2`m$v0~XV%p81X97v*HOutm|5E6gYEsjC z?zzVZU)9!6ruF*FCRk+}4@93|jj~nNx3$v4G=;#op!cJho;rQ=h3An=|M&mmk6!*0 
z_`;FX_`;zh_{!14_}spo_~PMR_~P*meDdTxK5%XnAG&t}UwF+fl&Ec5jk<{$UJO0E zq3#sMsu6E>y_I3G!)SLGWXRT(VmO=6p}niK(Cmb;)32DIXLKK9n3bl=i>KnS`xzor z-Br7&%sjQ&R<{jh1XNko4(upXNOT*@jf9?skXBw(R*t#~f{x*@#*eM5sYDGGo}gEy zV{1wnhEn}mwo!F%M}vd=a^C8$W|&$5ht~qT%L0e9fr_x!XbPht5FL$nxEo#Y@H6@d zdk@1}hnwK{5Qbjfr_YVbfFIStFsl6#)cWJ7^CygHtv`m^U<{T}6YNxWu4s;cYa{G( zXbfi=)@B*jw!j`?ILlDf6_3Faj=~p7aa$gZ*aq36u=Zpk#GF><>JN`nw)N za7ye)>_kD~MhwnrX$#5h*@{A9%t zGl4GXX`kYSmt;Ll(4TKx98+ws-Yr}1A; zoxpD}n)%AX{q()~&0~A;m6PlE;;BV^?({TD3e8Ol;47#-K~E}A>dtB?JgGaYgMbKx zHX9kYA}dALUg#@pb|Etpe9F`57Pvf4DlMuoYYi%`6{vAkqt;!E(z+cClNq*FY)2`f zr@?qC7~U43SMb{M!cd7uLeEMnxLQF^p)Hp!1fR1M{xrdrWY}#fK}BN;D(xkx_LicQ z(;W>Z3|T9vC~Dwptb>=ouC6)@JPgadRC>|6MuZurdo32Y9F1rs@M?W_g3f{JfD3gh zcMSL#+WOHL2s1>Dp@ClSZ$d*b$>k>4g3Yi|;WdUcgkT1)a1I`V&qc_&6I6Wsaqz@a z2xq%tO?03tH;jhmgM{4yl+3SV`|Wq3_MsP`@v)a;+Zk0bTt>$dZnJ9QaeQ0tRW1LRX z=O#xmGsWfEA^I?;r@1^kj`77wjI2y!SiUrcsl{pBapxsIP<*jByQ1kgRoFA1v%*XP zIOcRK^i=j`HWSfQs!oba;jH#E!&;SpX)qr%d=(f}&~Khcu4n7Hl-|vt_k+S){UDq7 z{e!Hy(nQT=^cj={)x6KGl`5*C)n}E`lL?IGJ(#9rr1aE8=a&5_rm|&yxBMG?@BdKg z{ZIVm)33wt+<6y%|LPU|{_VHpk1t)pADlUluOB^uuO2*v|9W5_zJ72wzIkY)7^ zcf|^?nqZK+E3~_dBbm++b_`iN)rG9k>_nlg6rTFZwzm>?sTRbdQ8*oTJ_J-`<=c6l z3hb!bhHaGulZ7vULR$i_ylk5>2u}fc+^hrp7@Nn7f;JA=&*G&YI;$tmOT#GY<(BK^RzuLu@bq5@y<#^No!ks*O;>H^)@B^V&=bh? 
zd~Ge$D+bp;SEg6oJ?>ws^mKkx(J8#u@ygOHEACZc-p3cd@C9QZv$>2+_fQovZBxwrI8S{8R|~GDq=~6vV-VOEMA+cf|W;YPsSwxa~w%BTPcztV~l19%nmN~%1KPa)XVl$nEKYf8{n zm7`Xb6jX(EbzEP&19en+7H<{oZ8nr8OW|m(M@5K=p=ldDeT}GSEk}7PVK^H_YC_FL zYT>q5!B52*vRDzeyXZ~?3A>2hf-u8hrFxPYlzVJ-T&_dNQHP+t76Ciq=v2mHKj9ZM zy6ZU@+S&-b#zYgHYA;Sz8_f`QIrt+vhO;?1!&G|&o+H(Y%481pgX3tudJVlVc{!Zd zt|M^YLsWIUP_nQW?#Ew^=nG!Y<>OR;kHGfC>rngD8{vA@&%*osSE1qL?Xb?wB0M&N z%=jpBqf~mMeaJIZ?U)!v=hzs!#~7+k&7*yO8JUG8v@9+mx3Y|ml|_cIbLgdt8(f{j z@WvD-*QX5FZ7fVU@0KML6DFuj5Lq?=b)$Q&7fy0v8~Yi*VLSv>QVz6>3DN2 z57TrF6RuzW@|TUzFQsSRtNEF9kD3=mH4wV)=a&8OroX(tb*00A8dv?(U*nH|`ls;K zyRPB)@4OR#LdEyj*YC$)-Ej~8^x`%A(YZVDho`UL4^Lgf?;Sgb5_P{ap(j_DEK8$n zv+S$+{)>Vz*tOwO2weYTmP(4_W;A8;yCaO7qErJd%TN?;52j>Ysn+-9- zEJ8)+cUs_cHy{{jM8H=Mx1$_kFOPBZoD7e{4l5%30f;z)^caHf7(Br+9FZU#Qngfl z?q~|0Xqw?|n)7M+2|Q;L=hJPlW!h1j?}BA$0=DH1I1U~`+hfmz_4X^Mx^NAaYmec! z)7Mb4bO2b7B z*)`8Dpm}{aV*3vvaQG;k2M@x3l^$PQBg8F~@9{0>v;Z7)x)pjVhf_GHnpm}M)3A+Z zm}!Q#YX4^TgeWd2(=d+adE~0;eXc&Mg2~(7_O=`Q3T!IVyBYN4dS;8&*FR^vFNL?d zXN{s}Laz9JWuN`*XN~ilgCgDR8d@jlX>_(*_9L1eJ>zlY-}xW^4*%`7KZP%!yNs`$ zyB&XY;S&D*{3ZO&<-72|@3*{s<$;CVH-_Bj4>bo5!b?R@b@K(Jl2t1pkFxG4= zkwQy0ZzT3q`X_^^3c-{?WH3BJ-D+YQ%ni)DnI;8$6( z!x(Z=l@uzsQ-N!x2>rJj*Ni{NIzsyRD61fI)B zRoCJ~vO9=Ss~wHea=2sF1Y0H59aW;6%FoqkBnmJR2DrTNd))loUSn8Ae>lppHVHq$ z=c9XLE%cPpHqW2V!=B8bA(ex*wF^}_0-@0~h}|90#o{`~Yg{MCia z_@7tqz+X`P{n;IN;ZL|+Vo^2}n;UkeZYS(QktQ@H)5zomw?ZRhWkN~ksQ5OM1gI=e zzrISrqnvUorzx+fG+f=pwi9?pKiN$e3mYV7tuCZVn zYB{Z|K_f#_tCh=EZfhh6trf6)>flSc5Eu%;)?%u$W_Eo{l#$$EybwN!BofkQR$`s)$(*wN&2BgXTDjqznQh))#3P#nQ<6T<3s5s4#^B=}mW z08<1ZRkJUXLxA85^}*3Kfrj20)bvlmxv&$yJx5T#wjYke1l1jnB6j==>@#Z! zPt77VNx%({AwMz2Fn5I0adc12AvZx4IJbewp)&}byM~4{cc6@tKJesA;Cu2ZgdTeW zP1o*&@Azp%E?hv%ombF&@hmduPonkIA@m&Ihtb2kFiik1?pek42d?v(Dg+RPC+rzF z|H({9>7`b8ui~;hLC=JX2_W@W$Ye4%_E89|A5FDqGr=GQrBJaLdV-dELzt$0i$St? 
z@7^2xE2KVp^ym#Rsi%mUlt7?W#fi>k-n)5k=Cbh;ZJIeRrKjVi^oq9^+vh&_IpZFb zGw=tQYHx2Ty&uu^C(xgZkAU$M{L{Da`>%REK6mT_K7Z&KzO0IiLkIDVg9q?ChY#To zPMyS`-f;zgeb06L->Y|{#4cs$2_T@dyLb{U>6|f^Y^(asDy&u69U<4z)oFN^0Bll% z0I%1`q@k{1w{6>w5`ykFD!c7U2vAWCrGQe>Po;J*FR$doSZUZ*e?W`i|lK|`hvv0e|dBOx>my3sTh zKxiR|`jG}Cb~d9bT>)F95`jP+BCZA`eQqi~0?*?`)a`@I-T-&PhSC_rty~G}=j#zU z8s@YX*(1#;6P^jD8*!HhF^?Zn{%Az}VMGYO zus?!Okkh6FBFYb#N^#m^q{nYcv>?UsH=bxiEH{8q`!HPXeQ@;+z&$#OhM^JICKnM| zJH*ek3*M1wBnUi(v8j;>_(^p$G0k^1!aknVs%WSz^?*MwKWk zQ`F}&KbsmS3%puy4kly*-TV&JWJTv!c&l|L9CZy7(mF=9x3|jlenit(%STYHsg&89 zUo8IMl~3b~$1mbbhfm;3`}X6v4(`WS4;Ng`R=#xf0KR$Z6#n-7Rh0MxVMJpIB%521 z&8c#t9U6H|NdgLeW;X}vp{KJMDw!}E86Ik6u^m)#8f~nQ^`sglrR6A5lF#xgBZYgZ zfTz*Q%Bu|k*3{JV53ZE#obn;6=O5o<0FD5&G!SyiPHZ(AZ(5a=*k+}=t0(XpU~@O3 z!DWHPS&v3v1FWHX)cVR{^H)%TRm0mTnE?Lpk@N1Q((397zSAckhDK4nz#219U%LJWVSXilW)gdTrf z;+Z^Rt=))s_aW5VX{3Mm_UGXn>VkK02*KeAga*eD9~nbtY69u;QKUx4k)4>PkD+~P z3Y}xKNKdaIbnqyg7cQYn^8NJfsCe}Gu)X?qsC?-wQ1$o=VZZZHG@ibj(0c^o$6o-) zqYoqeg2#}4Zp z=!!#0z4(HbK&5@nvOWF1%}N3jj!!@Rv{Cs{3`c=SwY@4wRsvEbA~3hfbv{*M2s9>) zG_Mp==Di8X0;JCE;*XZjA;2nP(Y2%$S65e!doWG+t8E|s=tno#e7|YnaGl=_Z*LYt z>+>tTl~}lCKfG-+5C8N%jpM@S<)8i@zt11|FC96L-#&B_zp?iKKDWM(&+T5vXZCO4 z69;zU6MU{eM+NweeaBFeP|d8ys>lm)1<*;&$))hr@K_mA;)y1>JRa24)-!~yFvgda zvMVL<)X%lXYQuIaxe|?8A<)%Wu%oITJ1Xl?R$Y$@jb64nCPg)&S6fH$)e~ZboVxqaj`c zN0LAb)gs`xP~};WbUV<(`;YS6afYQ)yA4r4L*sZgLc?~dDi>^vm8hK|T&O`s zwghFV5;%v-;9n+am#bhuY=v{x21lGKlghD);EQ_*Qm>%rMH3aDpqKP0drA|cA%?n9 zhRsb>da-7t!YO())m{oQhP^StkKcW~BZEjs649)8zU^;kJT~7=P|dd=tkF<3skCb(Rx+Nn2m`NTzh?eH0Vb@xGhbaN}R{K{UGWYw_PP;`YXD4MD_MSG{(;k6p-E)t8vMgVC*ZKZ49 zM&H4ZwPZUVa00M`>P@4JDYMCTLQNg})xlpK`^)MYQEg-~QF{MwsLZ;E{f-~TQ-Q$4A=Yq>^gOf_o>urQL zV1p;@K%mJ@AT}TnvLhOB8fAIS9yeONKE!$6CITuJ;IY0sL_570Iob#BJnwnB67JnL z1P{4TF~*mxqXc!6W$^8Cz`^y7!%n!)`cXMugT^>RVct)C(<6Ql!&)Cf8bF*OZrmG1 zJV1y=GYHVbO>Jn3wGq4-vpZt%=`#9BrLxwD)!+*WHVJ z{{ZsC1IPf6KybfM{pCjc2)S|O=`FKk$SzJHGc$$e#d#!79759_7ZADU4uq)GJdZpC z-wU6Foo;#F<7jx`VbtCC5UQVe9Cc4Uf%?ZDg6o0nh~Ie$ofpqwoY0#)vKLE-cH*AL 
z1wFNeD*U3}0G~0@zrxyL00=B*c4D~{kNKj@isz-~q>QwGG0b-D+I2%A>D-!s#VcN6 zjEQ0HBPAxd$jx<9ak`%R#LCQYve;Cip`IWre-`L&22m5BQh$QDOu%~Z!3T|PrW!S& zR;=b!rl3oH9mLW<9GaBe!t)Odw#Khhu`_h*W(Kp?!woP zUclEk4&!&$58!LN_v3Si_Tf`hd7mNxKefIKzrMa7B}M~df==Gnp-@(3cv{Y(DcJ-! z0a0hEH!3N%?_jv0@U^rs0$E8ZA3%n!<%C{Yy%o1n!QEEdU?k~m48~CF4xonct8)bi zK|d-i4g%6nRj7oWHqylf=t!l|t<>(h zCUoYKXy=bndq)>K`uov7MEDJLqJ6Xny%Qto7^m`^8b)r80Gt^|W^M|JjU_}W<3mUH zBXH^{{O3<2aODyL*RLUP-@OPv@F4t;KL*dso`Uz4FGA?4=OOmMJ;*S$?LT`QvnLK< z`S32R9$dryPux%F898a69pY}z82>hbX97TwdFxx>YGgc7lM^-M)jsBSSuyyuO*!V& z+(fSaw(oo2`wX>b0!#Y{kgAMOS>Wqm|9ZUm#VM^ zR16>G?bDz{8c#zRQeOJfml|Mw*~?yLgw!h2`-4CD1LGX#edw5XzVn?%S);kG7^?b= zYI>v3|C!Hx##m==FaFH3kAM8*c=x;Cy}5MDemLXbSNR;j%V+C*{NZ7E``I`AG~R#p zPW<}G+wt|oXYt$1d+{5qy9m1teCptCs=mGW;>KS58lBK9$1iMVG1&QW3Rgxc# z3@w$nZhPqt!mjlDpjQ}Y)_Z4=Ng*RDmwk z*xaZk?CKb5TBy7PJ3-D$*fn|uxiCZ35S%>T5sVt{3di7zMBygzeDNea(FA<4B)rk4 z&Hk-1qt}QA0}l#3^caB{3HYhV2qYg>7^iVA3xol$4e_WKL7&wKf#XRBd`(n-T@JXq z>*1SnAh|b&-izI6oN7QtS1FnnLq=tZGgSv)kSdZt2nni?W-5b(ac)8{5a#vc1algZ z-ce)@9!34y2I`L-MdaFbRIlxWb7=!!hJRj$QK3W@nU*%RQuTGDn$exr0Nl;!$hV`T zvj-i0y=WikM#oSGx(U94fnE#^4Pa<$4E^+x*=Yn9??TUo)0jMc1Z&6lVb75b>^{7T2cEcx zLBqFhsJ%_rZJjZ!7!IZhG{x)m;+r7Swyj{2nPqvV{d3#5n!vVopJL$3%xlYv_tm{_ zy)NKCS{P%$m{XE|A)T4OM zU3cKuE?mNYJ$?pX-m?!MUSGh6S7!0~g++X2aTUM5u@fbQa5ig%vYIMuubUxcb!|1u zDhN1Ac2!uY98Wc}h47a1l{KiKx~nGOl&JGIhOH%aR+Krss3Gtw2`yWq1+G+<0F0u% z(M{mFjWH+c?aHS|*lnu2Mi14UmkLY3^G9GK!f-pm>5exW?h@#lQdCS?!Y@rw zF?>v9IL~$59-^xZEkYmvl4JT`>f^++U~ zaD*ygYpz0Lo?&c9E$Y(cXh>9`bDZ}OsiC^GAmnG*OyCK6NdtO5G*Okt{eo@+&O{#W z{z*jk9z)6S6iW6UK;WsD!gBY6NZk7{n)dF4Z*T~%jzL8FxL?;0;vGH6bSv>{JG$Dt z3~%jdLt9TfdImbs*Vl%j-cF1U^<#Lb5B-C^7#kbG`u>3q+=?lU0LzR%p7 zF?p7m+iqsX--UV4#pf%wn|+4=*G!*Ku1_yFmregsP3O?J{5RkI7XF%7c>gzl4X=CO z&)_Fs`Y7If?Fv45@d`eF`Yb+q@+dxYcprXaWgTCdUBD-o76iQ>7EjjmJ3&Hya^mBgp)jHrNR^ zXM&(hwIa}zhb!2OSWBldA4ulmCHz9^R``=?gqo?Ml3Z?TK`=%bh7tr_oZur2c|YM8 z6Nnu#cPGvO9kj{K-_2JdTt9i5LWo?^B5ju zXdCnUxIKtwU!3Ppqdw7w=H-L14Xk0u;09_=-3|9+uSEUbFGS 
z>|M`8$?R@ecOF5@fg^~IjH08X6aDRN=ZT!+SA!d^d*9?8n&YLzq5u7;~o& zVCnQetex72UG$wN_Tc^}A2h;SJygO)@xxkZuwr8G87tnW*q>!LUHR{2&v{pyc9U&4 z-TkI3|J7!m<=^IP|G~E9gJ>eq>G{#mkzDiU^JGHsS;yVXpFLYJE@=55cosDt`rn-X z1*dQM(=X!%zx5$J|5x9Er+)fnc*EV-@qzPK@hfL<$ES~-#OGIc<4XkJFV9XCpqFSS zVB9WLRuO3B6-INELMW@o#T6)}vMXaqtFdP*jOHgbD6eTid7TZlRBh!NP>1lV@e!I- zS%O`CB1Lu7f(C+1shH}$8b4Pdvk!K)DwJm9`DJt zBHlNI)~R`9#-`!Uw!=z=7tZ$~-ZlVNJcnp&AK{pWn=o`Y=iqH208=^mIPXhl;inq( zb9*2{7)A+0&impq1fn4%!V$uXiZ2vEz~e%b2I*m#swOB&flRfsE*Dy;NSj@1=jTMo zr?R|4ZEYk#5BU(|F$pT%lt01q=TP50gz%nYsO()r<-`F*?tdwETzUl7M_z@RbN9o! zd;m4WbFdt^fLcOu$GLmqy7N&451m4Mau!`(-56@mqo;@7+lrx{4vh44VYIIYBUE_9 zqf~o@-qhq6rs-1?Bbc5Z#RQ=@u{es!)d@_mPh)O<7K^(Uv9f;+3kOy)b9fo^Cw5|y z@LM^t4=bnkV)gW1Y@FSXU1tvxd4|E`VmI4@rK8 zFku-C6OzFYntA`paIm5$NbcZx`X#*~XBz72W-u_IF~izZ+vzd832+J*!x)@W zSUZWq=`pIjDNHZVV{T~w5*hMJyd$!YV`9)uX$xae}ZT^mY<@ zd#L{QpF4#8rw`(h7d~b@RE39)_RNz-;ekLF9InH}+=I3bU z=g{zszDUR@H;nQAW6=~GEj-Vjb<`c&jlh|+Xu5hgs*jyU#o3E!y#Ha8oj3!_(gy6y zd*C>I6(vW`qvqa6;JfExB=;UghA{7?(i`sDgx+8qx&}MZH$)IB{rk`$`sNqVziT(T zHr9|^TS9v0GBUfC(YA91-7AZjT3W#heQj|bYs<4(-MwHyZ<)S+cnuq>v^c&8dl=5{ zBlr#wdWX**!XZNMd4!%GLc@w5X2bQwYUV$=nV-in&6scG=38mnvp#C^e$U$QuQnZB z?EgA5k1D>GfBjj0SkskE6y5ZjFmK0n^EP#m{JRC@31BKj5zOS8*E($x_6o5^0eIgB zy>IcvDn{jb|Av#-eEpL+{n@wU>SOOMNXWD;UP^P&O2*ywV-UAUsljrmH2q+7_uLxmvEPM@B=V9gPl`(YDS) zS8Z&q-Gv$gui8=Y8mET}&x<-Ifkzc+CH(9jLeCc_@Pe=fRE8IZJrsq5VXc#}am7={ z__TI8)mi7lIx^EsNKP)m(=&{~z&O%tyV11w0DS8^k=%Cxo~c=6xUN3a0av~U&X#t< zu8pB?D_nG?ce1s#p)u1AOSS`+9G&}GvxFqQp8M1{FGkmpmqv|pR5i9uMrDe#IgRQ}7M7kq*t-Vd>YIRld>+k*Ps6>F_kH99 ztVa*Qea9tK-*E>@u3kmy!w;b9;_dJ)uA!xW1l`?T80aoQuSd`u?nL)+HwK6M&^CbiZT5s}SuyeeD`zPd*Rs3+Lb&8ArC4p=)=iF`Aj6 z*E>My4RxYtv={xO18AEbhi7F4q4O7EJ$*YYcU*`2B`=2KWiNpHg%2YB=-o&jKZ^dN zCox9QP8>dk$x|mWed;h47}~BMUBlkP8#u%e_vrE6IDT>uPM+R}Q-t1$QwQ+m3m)Bk zxcFp!-$Sc)T5%II_ci^;*>2pY{y)i^V;SAbRj6uV+lADcdYqx*;9{!pup8T)Bg2j)&4%46c zDa?HC^;r7s)7bMHZ^7~Z{vKTZv-jiC-~A=L`r~iGBj-<`M9OaaHYN4kf$FLX0WAtA#-O?`x$m+sNF 
zaD<9UZSCky33w7oLaqfNhJz7RSmZj9Z0|v`vmebpL&%TLVCKLH4DULOA%=Y&Yx^+3 z`e2FYVM*j+B|NRoc{CDe zjRbTf!{hpv3@i+borI$=PSDYV1f7?l^f1(oB(i9ss*J@mh!ULX_FlAf4Is#SX&IbB zuwxWq!Z)$L53%`WB)a<1KRANk?sh`26$5?k7#Qfl07KjU5pEkFLT-K}wi!%h?S%$ag&K|@mLhpqyEP|eCnl1KcCY#13J*>sP^#Izlm;U|i`|jk% z3jI%SQt>N%W3Ok;78bPK^k;4S7n+WF?&F?wtGS_QWAUo3{_kbFmicX%-(bN_jaG1J zI#rIzHE(`d2jpnu;ZK!*_~w0s$cyy-5gy)K5hU8v6W zqGogn;e!_t+;axuou^RNHi_DyB_vK?fqnlmH0<05_v#+_hUU?b>Opm)4Ye&Ds7bd| z5#~{o&7+!LmFIef%#DPq%NWC!Dl^qVr&8tma5|4jDuV<8+1lQXSh5A#JXISNX)Dj$ z*EWjoPTgxiQUqRWdlz~;JB=nPgKDZm=nW2Z8SV0hM~5&lIfnG&EW&3`pyuKwG(7MW z(r@|z!aws~)V<)<@IC$nyjLzEdgcsLSMNgj;V0mK{)-WP@r%*=_(Pbze99=tJ9=U- zPM_Lm0Ppx#$^umYEOxGwV0kn7`O^V-I;@Fa5vR z|Gmt--m~uJ#!Z-8%ulNMA~W`R#tPLt#r|yj!CPg|z13W6-s4U8+FW7ocQe1~asSp< zbdlnFDSU&4s)<5)YF1JdO6$I5OgAgZ%yXD0DERCl}z1{WfLax8<9Qf5{C` zQR(gZmDge8W3R*V$6t$;U;k+=ec=rl`PElq>eqe}i(h&pj(y`7as78cR8V>viA*Do z)ew5MRC{$wB-(bSwoBAG-p zJ>8r_M#&5~)wmgHg03Z>Lvvdm&F$^T_Vl8)ZxHSMLxy(_P~8kqFnpUr-`F&I={*y3 z=$=?W=j0;Vrk2q$yN>SVz35v%gznXY7~XRNBlOPsJ;)Mb`BCmi&r$K^>8aicGS{-*y%j;u$2&Wa$akQldl0P*fjb1jo<4MSb)lm*kHM}Ej4`|&;V}dKofsPK#mG=U zhDL@kJU@f31G~_C?L4Y4pF_nR_rv|v(+IupoeXhb3+H|J!FS~XqUQEHd9`TNY{zN6oJ9?Rp*V^s5|sWH#6*@&Du zZ%&Knzsuvazm71s$xTy(-$J9`XSS%(vbpq6jD>VvxiK<+VV9y~c<%3h8{ejzYyROM z3-|mT9(3bA4CiMrYJe;=kI}jH9hukq*0&3v@mYS~UJP z=9M`$`oXt=p2NIhx#o?~R(}Vr*ff0+<_DFl(VzC^YE86CqYFRw#rIXmc~dO$yT7@ffI#I)fe{ zU}k1YX68(#l1Z5 z(D()G_V-udqssMfggo~DZ^*qS+yCJUpS+IT`2Zs7c)9=ZUR5{s3b+KO^54p_ zNyz&i?f>W}qI`dSe?EM>g8BQr`G5Wps*mpBNAc+W@_iowQW51*tUM=B`xy`JXMBkq zqnukg&p+me@T(u5=U;!WJpWU^=h61Zd{UnO8TZNR55AXwu7rZ?O8o!&ColPh91#CK zJY@Cn(OZ8hCq{kf!%+WZM;>R$3>c{te#WDd>0&DQYktPkBNgG#xi7P5ewUOLeQrp{F$(U^1IwdVPZW z26wET(E+DYcLF>{vJK{BD-3*(G1US+-zOo%UFsu^s0k8&j4}dIM~hrGxdU}$JCGUa zL7KN6>9JlE&+bx6uYC7G>OvjmmC4?diJq z0c@UJ#HvIqR`EDihZ_k(FVYG0tpwXfLM)%>sVE#po{hh~xf53(Jd3Aac^OZ=@HC!! 
z?g>2e!adx6a0Ac1^fX?2?ioDy+|zjCm8bFKhn~kPFTQ{eJ@+DBec@HS^2$f>{D zC-EX*`@*}Qz$@>)gJ(bXQ9SqNNAS#tU&B+cy@DrSei8Rycoz4bc?wTFxQDw>+{P1k zZ{gnEn|SKp9X!v+@7=kLCm-C$r`~x7Pd@WBZr{5{C3y>1u3y8o8`p8|`VBmO`6|wy zyNDAf&pwQEIC<(kPMto_?O7Z@ahh=oCr+Khkz+@(@4$W>ICuyL4;@yy?j(*JK7_-E z4l)j??ZJZwarEd>96EdiM~|Q6_ddYI%hz!1G(Y?7MI1VElFuDy9L3RN$NAYO`MP5` zarz7%3p2VF8&)~7EH*u1`Ywy7$IL6~Yd}QT#}}oI&$Rj4%A2_R>}$Ak=UH67{S0p2e-RI!|1h3^?QOjLkx$_{ZtuVFVcdKEbzHyy zJg(k(8rPqA1{bg1#;Nm<L2IoWZrD$8m?} z;=zF@AgE+KROm;1w1YsE&)0iVBq1)}pdOs}f(<=w;O}SrbBoT0+h2^P}EwsH{? zG&I1#*ITrDm}RY8+FYGp51qvX6V;xZ@A3JA@CO*7AfZP9h67Z4Ap~M!Mg+d7M2t!& zfnc(U>aT^-Mi{mslSbzUW)$LM)awFBbY(;Xh z5`5@8@e@|GQuF;AL#E~YF9z#MRU!+x|h zCJ>7R(bd_GJ$v^+7A7}Q@mn1ZRRKz3S`%GBO>I32WN6d29HeLDAUmfJnOXU2yDc*p zTefDXv}Rk=(p3VCHFPI3MN-WPU%y`E+modTWE6t5Pm6nzaxH11WOmWk^lb=4VyI^& zon26(h8<<+7W1>VsRXxa8JWn)%ECq&kD8f-ikb%6|J``$wYPBL@!KdYuSZn_-&2arY%^vIUTDvrePD0-{kP3vAqWlUl=(><+M|qv0?L8m4k8< zx10Gr8+m+^CQB`|usIFuw`B0WxhSpC!xe6VC*A>*mzA{1htdW+%C%1DSQdk+UbGI( zAlfzrYp@x5cU-kyYw0w!Ha`j~8?Z@MIFtpxa(Vu;@_7Du-uOL|R!c4IuyLardn*$T zWa5EjHebnZzA_16D_@_Tor`q-zDJL(UArEcnb~mC;Y4{)+ISshWjVF5zu5%4g^o_I zQ|a5ZJcs73+u+)i0ss1RMAoGvz99ptP2148B@4}IS?J8l#&C8HmUx`|wiV!@L~arG zmXu+CWfe|RR9(@Taa(J{WuqO(Lw+2Z?84#O`*HfcCvf;n&tcbBUck(kpT_uaJ%`@k zCg8sQ6xzT10Lf3^MCvoQ(emXd(fGBe5dG?X#J=$qT7Tz7^nU9ltdb!rs@GO*ZTe=l zE=6umCJGB=z1LzCm6oEsx`rxFk7^d0I+Gn08WXBC7XEERt;Gqg$4|TFfh(1y^5O;H z38-ugr#1_(Icwreof$eXMY-f$A`2qWGI7hz|UMROPyszftwpdpcjmhaK>J-RpnD6k7AP!@<` zt0#y}-U#x;&1&e1VPFoRe{wI6NH$8li|5TgO!$0L= zX?uIrYbL3+mA#QC`M+O?6;)rx_?5guj^G!B<1g^ovkx$K>>zp%>_*qVUFcH zW)iKFW9XO~L(9kj60*WtOA_8_04|>c9?y))1#|?kR1#|P!3ykT)|Je@l6?W5WH*kESMy3%WT;E-cqn z=Iu)pjl-yisk|Hp0?oW(3*2iqBeZH05{xEB>zXZSU%wSyJXgK^%<+v|uybPu4iJV1 zSpn_m_wLEd!GYoeoGdQJweo5_Q{I4QDzvytk#*8+#Ide8j+~gqzK>qP!dITg6k#{` z+s~kv*0uec&oZ7v{PTB_B=p+8_yA2`zK_V~ZX)p6>xg~%E}FmoBvw5drjn^TE;5iW z>qZpjQRPXJPS%RjOG|jw5O_5v8|o~smC5~eCTJZ&n1XSbqX{$+V)}3hwY0Bg4YDwf z^xoD&Q(X&PbshBe8kmhHINdH<+JMTe8{{@bB@m8>2s{CuAKthZzJyP0{Ykzq8KTvV 
zAe@LI9HaFO6M~^Am0?7+1$t-~J*>rDGD)A$IcSfaw8t(&Par7AW{MJoEQsbftuCQg zA;VD!>nv9QInEH$%wBBs#IV5@K!z`gs@5Uoh1-zkiebIgPtf_WjoVB!VQ2Fp)8WHb zvjdx1qpxy>v5FQuYwu}f9e5n82KHjxp2x85_LJCn`wj|j-^Er+{8b&JDA{!a)|<~D z``{Ur9DfWYI}RehwF`UsI#Ff9ImIAh(L?A-dztilsRm$fA3ZIh8je+`0k_ilq5!C# zQ14ao!2|Mv+aSfrCIVeOm_AOQzZvl_WQ?gmNsbz#qjH zS!FLHU8yQjSu%b!lxRkNSq;@%4pmSY$_cW{8Xa;aQ3k;$YHcmm$vUc$wCobp8QidY zQ_xw1aQIWmDXyhT$WnPBMG;COT><45D%5oZ-YUL#D?u7%1s7}Wfyo`hIx54}8`H6d zVB3_oO+ixB-U@)WaG&aszf+EBU3xyY71yEK9E8!=f*Kp4rw=fKDoPE!zw9aAb6wE- z+L7PDvRGwDA;G70rC<-YD`l8nT2DCVVzW|wc@Lo{s!r6LBn}akxmhW=-<)2WMu>}= z6O|{A3-meZIAu5M8wq zsZ|@$!f0Q!37x#Yy}Z_ggy=LC@UHb~IJhAlM|kcI^ZO5GZNp)@%*$ovxJ8xxbU`Iv zE^EMxwMN{k)!~d@iydJn77tEg;e~TpAlN3q^E`UK^$ePR>j9bxK2dk^cWi2` zT$r{f#2dyR^AUDlgj+*MbSL0%^26Wgr<~_&l0ihN_#}-|l(Uq582qC?1$X8ISMC_3QYd z>J1}6K=s*_&9Zm2yvZ9H2 z;SA8)hP}KgB8XG@`KkJb2YWFyJpr9Tr&iK4P$B3|Cc;m0HyT;7mMRspX06y!QE!`+ zwH5dY^u*qZ3S3#DU_DjdMuLwp+bW4KHYAFRS613v0iIahtjui+dYgGOZQ^Ih z=m4F~gAM6eR>WyWfj8gjpEefw-9dbMzz6#%%V#5oGhKNRX|A? 
zh}x4?14Z%4B3iPXf~dSTY5CZaUyY&$C(Pkax~CT2U(Kj>wV>M3gmOy)b=)^a`gxzV zq0}5l8Kc}Bh1S~)SGo_lIP+ykM$yLxj^s9@>)Di%l=fg4o|Rx zx>;|)WvdIPcyAw^8^_GuqnP@_J@in)rM`Y2Np2Hgx~HHQ{?t`?KYbOx&t66F(~raV z$#d|0>Ks<3Wo%_3%TT?v`T047UM{zk*@XotrBszc$K^Hkw6_MTJqt8cNVW9XW`e>% zc$DZ&sI=OlAFN$>QpbTjlgqQZSc6=3VH!5yfC2`2~){M3AdOR zo`ehjHZK~-8_~b3AJINq+fFZBtxhid@$4 zn{BkQyiqa;x*UQ&&*J0bUSv5uSZ8ygetst!j$c6j=_|-Qb`^Ce9$?kreylxu70yq5 z85M-ws=j%w>Y2mFou^TJ=Oq+B^&u2L{R)cCUqepUDC!!!U}|i^(JNOKbOdMuHU&Ke zE$&G>AD{ghA6>D$qRSpKiv$zk$=}ku^;7Pxq=u4bTHF4EyV0?05pBzJXkVH|8v$t%(Soj7 z$D5JXGdrh%@o1?6f!-#i^wKl4l)Y4KVzh~yH*Hjw_R-*awVEEGCyHv5B)cV`^t@Rj zO&!QBsX=yrCGrYukWYw}RA^948+vKjsCzX=+^EpuZi5RaTt19Xbz@l9=8GWV3ov{L^E+flaky?btlU&>a=F)%yv{*Y$$cOsQi2= zCE%(Fx++TmrL?nocG_A#p3c~2C+G;vY(Ac2^AmbrwP<(w_#C2lohcEZ-x zg#$OQt4@a>QM!pL`;`p;_yrLtX|ahiF`z3dZiQotUJnT-gDO^3-a|F_5Ongiyk25u z$T6RI^u7;5{J58Xf%%X~hFO{M+R)necDG@kkaY2;FdFqr*@>l0c;a62a&y!TF3^-!`L^)p zw|m3Tm|Q3*sYgBmwqZ*?@``FvSlWQHY9lgo%8`*rb+)yDQHY8DMYLefSQpRFPV~qRA3ymvm~75vJTc zm{{SM*RF$$&?;<>(Xd()3C@8n|okG8jf$yz-cP{Gray6dCXVx zOK`oo64y#=aGl`0QAOZY>G6D(0oUuaSausRJRHN&;|mxk^b7$H*Top6{TVRdR z>ZEjt%!Oc`vBS5+3qyMa%ssWxPnpnoz6YK?Nz`_m(YB)-8S&QOlaCjzBKen`qhH zsM9%BTTX@9g(51s9J`Bvb93uKE}@ub4Pny)bt6p`o=E zp0OdEdg|_qh5ZHbrP|yGA{o0x9Q}k_dF1DhWZpxt{o+^evz~=;fYB^DYMxd9zjkc0Eq$s+TB}FA%GanB_j1+i zvUmfTS?NU^J5WH#Z6Mgvb1IZ_Dga|bRV(m4SwoyA5f*DXBOG=c9{ zf|%A*>}O`Nq+F|H8FZn}>_&d2mhh{@R$AO*!qyyUhgsV1JRPWWcB0PF4NGJa&c=DF zyB^wJ{v92KuVn__rg2#K+G;}(1r5voRx@l+=znC$t4gO za()DOk}^)(7!~+o=ii{Z(JZk!SK;>j6Qc9-5&sbJ|9(|_w6eP zee5dYZ(ql%!V;+o2)u`|%O#=n^72toP>7psb_}6|}ik6#~5))JrEteZ3lp ztfh^WA1i~8rTg0LauG&iUBie*B5EB{>9mlQ-CeY|E&|WPeR<3mpz;b+0j6kS+iXZJ zCE=WP!#(FjL#hO+d0O8c0eJVspdEF>J{Cm#ayNp_yt!zlQyzYgKSCQDr*cdn;)^56 z+OyH;M~n(Gl(Zu{OzS)7hOylQS10YLwF+8I8JxOmST)=?)k9u(RA$S3qN9W3|+-WqolYlDqg9Zq0fEXDmOHpg30*f)#S ziGHL{>_ExUbEvuT1hyQz0L|SuPY+6ijkeO>CtE(e!mpJ#Jba&o`o9bwee`|_^-p#ly@}Af zc?nZAVr8$5VPn^H~OESjh)! 
zkK0YOw(`1c0y2Y^Oxmkt{p8%7Yz4fWtSmw=SC!>-0#t7LoI;l4Iz4~GAe})9n{&#L zQB;F+Esx0)L)-8Sobfi)QpuMaoG8}V)ZFcQM;O&6AC5hC7gry=h6YQJV6LDus=y|K zB#pge}@f_0^(2M>6#k*VH3RhUG}lswl7=SuL;-dF8y9Oine_ zWJ_+TvbdY5Hn$ZwsLlyTtPiGO4|KjB0&fKFra6j)3Dmp0q4jpdkr+j&ozP1SL+ePP zw9bK?5)BH?GveN~I#r2+|QEf7cOgjZMu|cuv(^p?YymyvJ-56*@gEWfd^6V$uaA3!urdl`$Xh{|AN%3vR@rohh2t)B`__T7Y@T~_&`y>+*{(6rbHKkcn|ijZoo zLVVJL_9Gog>~Dp6gthIiZbV0t2sILFRF6q_01?`i2yLw>yrd_B7Eb_8Vrj!>_&aO} zFNG01+=#$#-k|dV#D}ON9i`A43wW_AP;D+ng{cBMyABPkHLO~ykp{C`nz2r2LY2Xc z3Zn%T1}nD?l(R-IA^h@eZWPc)XM5tvaL14nOdvZDMY@+jf^rkv%qxg8o5hdq$vd?}xE(2(HN~I7e8p2L{kIK81@jpki23NVvpr{`S&ZzykEXgQ~bUxOe3)4n;(v85zPJ9e=E>i zQF@P4>5UM20~j3dR!K8TCc|11SSqnx-1~5K0 z44c)=3X#w=Qw{LuFk9`=>5V8UDOESnqvaT-z3vh81bA7rT%z=3!T4NRONA;)Zbqf_ z#A@;Jt-M)t^9mH$MCD~s)#c^nAd~xfIl0Kr&Osi_bNV)MGDQk{xwJ~o5ETk5$)b9* zQeqlcn1HjYY&xNi0n|D}sI>Z!-(W$d#mDmD#a3EijVp>+*BIdvQAs7zX-zj#jck)e zt;zs~KzYCEc=$Q~mJ!t0QYh92Q148r&&{iQUiO?G<<;Dy?ksDp414T*iReX4UH|1 zl3H5vay|0PG)N~TGIC3hT~MkL^%nCUmMrBtvb-wKZH62RD+95*kH$g%W;_C&Y$cbb zG#|+Il8}ALuoR)i_0efbkGynHS!4x2Zk=R8Vs0j454p(AG&VuLF20@(e(BuBtCfsiBDZa)2DBt^;5LBr6nb* zMtlUkd>&|VQ5i~#%TdOYRasSw+B%t2t7l=Qg*91ePpF9OPJ+uvh{+)10KrE1M52UE zL~WDF6cUM~8ptTm$!k`|<^>U;`ctzn;!*f$kG#!%e@_^#3$0XpjYuxguD0>}B9(~F z@-yeD;$~84S?Gi{ZddBiAM_Ehq89mCfrx6yYozk)pzLn-5q42MG|^Jnm+WvI3&3`e z_IkS=jSF5>y3$b_$%k{ugo0=>DpGZ@61oj8fw>082AMmoC-@9%ZCZ`iNYI;k4lD$r z6*UB44b@8pFLD_zaiPJ5LL;}dzXiS!@(JHEXA{c(T}Y3#VO_LI4Nl&iXu*1KBMMuF zP}JUwlD1ChdIzEJ8${heA2hUshVc=Y$46ls8-;yh0!Kdl5y)z;vdG~-GyVrd1~khc zWC_`qaen`u@wX4-pBMta6+h!M_(gaI$622glG|E5IHL_Dm+hs*Mn+LvO7DSW-5ITcJ5e2b5jx)Y3Sy8 zT3JlqK}BFg4J*#9tZb}bw@$5JE4{D(C-m|N5&@p{cFMZQdAwOgk%+Zjv0+OKOqB&y$Sb<#Y+AP$Q#* zpVb15yA{Sjt5SMfXwf!Pm2S%?@CZtQo(y!B0neq?T4WWJ6Npr-`K459Mr4)L(e~C- zg*KqZ!uzRX3e7{iknG!mc<)ZMjqXRs=q~u^BpR$y1-s0`dfN1Ieiq?YR4B$=q% zye`UZDaH@ z##o`vQ*8?P4)XYpQSC|l-ubj_TujTuW9bEWoRE85fsYC>yAan2z$+O=xR_mvv!!)7 z!h3a&PGfYm1wE&x(fr~Ogx@*?|3}XtO!e3F(Mwc%^xnBFI0|^YDETZPC4^l$C1Yik 
zwA*RXK*ggs^9R|Su(`Z&c>HRJNI-^+$nX$Gls9!e(WoMkY(lEBMQ!DA8CNIM%_P+8 z2}UoW=Z%KpiUwehx?xWeq8+rr-7eZt8!fLMu0|s|mpc#{qq-PxgrkW!mh|8zq~V;3 zFdTwgRHOjcA3~B4ZuL6R7&If;q=#)l3on6ZKIumD-F76-grV;&M5%uxsyYkc-Rnki zUnTTYc6cU2us2I@6xE(V3$ym&Y?1~&O!eHdFdFMkw8pYttqHX{8J5$42BQ;Ni=P(7 zjcTh46@+dP?XdKssWDceCS=j_7Q|vGWkpaNr3HyMp{lh5m2K^WULVY(Bed^BsHZy8 zb88$Mfn|CMuAMs&J9`!juYVZlzW6m9{@7=*|MgE{$BQ4q_LpD5%(Ks6{K;o9^YZK1 z@y=(l?~7l>-uFI_-EY2yb~%*F6W~cjX<AQj-{WaX&d$ZdR|v{{vuRiUW78iiD21!XnZ z!kZ=4Ie?zAdDK(UmFpcS)!7vQOLYzuQpGh$k_v;HYKzyf!H$izoLgwC%-%TeKKEhl zI&xl(hbySjB9ou1W2I~4wbuFDp!E@at`-HoI(tB^|C}YcB>9`tbBeJouZ-o!LFgH& z#;E#a_5^{ql|U@1mR0@C$S!F>PPq<6b#|0!JgBk8q4m&u+nZ^>Q&e_@U5y#r3TjlF zp4^myGn=z;ehXC}f9GQvg}6c$dzt6% z3eV?Np3lov$QO&MaIvfoCn!{A?IsL%(fRG~N9^((B2Vl_?Ab$Tdg&Ba6%uw*Knf_o z3#ssmSR+QJq#Ho6a^`>fC+&b8`8aNuXa1wZqIs%l| z$5dxhicg&vg0DeiLVbgS3fPM}trhjsR%O)@czRf!9yFL7(C{}GdpxKL`B5JB6Lu-& zM1v@6ZA4W^D{8yjp&#gld3XTk!G1W0hiP#~VH_HQZDAhnBZuKRcNYG~A4lN&4Y;me zgXiuY7;auh>4nqCKXn?FS8u`g>Kh2Y{Vsg3eHiY$PayW#1vDPnkD+6GF@A76cAPwb zcfRx))j1-Rdj!1y6MBz&fGfnYXmSsJaX^w>(Tud=jRe2z-5tgIY`tiZ+6N_oy> z1ul*CviMX9p;Yt`cw%qG_LUMY&7DJ--+36NHAZ9?Q(X}tb!Hz5s*EVtI8dQ+vdqV! 
zw@0Ybs`6sK@VGDSt^YM-EZ<7V^Eq^enN;8+eu9x03~uYPGZ^E-EB<5GIC^`$k$vvp)iBC`R}s;Ue_B zu@rpqMuZ8sSW5?M|27p(?LA1fc2N;frKMWo=WD!tkB71H9!VcX5XSf!GQF&c;B2C$ zjeFrvc;JdTVD=gKdIwx#2`3zW2O`M`LNYg(kPAj5gl&ZI3$Of5poFC{J?@0B$%3K7 zJ!n7G4$Th2^RN?^{j^t0T2wV|Lt%IWG&2<_7|DnAfD=W%tigvYsPR|9WUqr+R|~sd zOYjkfRC;#S$QG(RY3Gw!DdiU45L9(lti_9s0hHOoD07HywxiNq2ffpvhVSScPQJ#0 zTCW!-s?sW#6?Igpwarn~Hz!~s7|cDi>@4c8J%)-amr#EBI;w9!g@)%|hwh~}V0i9jSnuA0@4`7m4)4Xl zsr?u^vIDcn_uvy>{VXIs)c+gs3UimxrwDsO4_5u`U>=`ha#gf~}>s$Y-Tz47b7(?PcXX1aqVh_IMxk{si*W z8Z0KH$uUuhRBP>O3TG)RQ=`+5!peGOZL@i7ImOikUj@Hk#`sxT-n>d_R+$2*Hiu}l zHQ2yoNy{lwLvS{;VvyM+8)#!??4O$PLFFc~mFIFB@9k`!w|t73JVpl3&juN?@*B2R z&b@-&!;o1ok}+N4H=&o2u@WoLD^!YHu4$^dRYA{W7PZHFm(X*X?6BAK`qQc#v&v!K zQViRMLb%ox5PZ3?uc6{wmj?fa3`91hAxW^c5rVx0;PARlm|`q2cJS|g(yL489i!Si zPUxNB_SDuKoXW_>@yuKtW1M7#a)xj}#R~9vK@m=uR^cdPKh@@v(TJgl8|^(Yv@G{v zRY`d{ib`1UjZWu%!_uy%7c~J1uLj za)TKq1XxW6?aRnG(vpPg(m~`7PGfyjGtzndwP6o!t^rj}7F3T{jsL6nhSa=Ihsy(< z%}iw*fsZQJ!5T8y+J;DH7XmFUi1+p&IXnbE0T>t?hIe8NM_+saH$V4TIOgY|pPhka z_f9k{EkM6>H;j7@!hGN~%ts%G`TSj2uRaC)wfpcsejTCX$I*8D5c-ep!T9mrm^`ru zAN|^ARcFX=w)L%`ZUuevf9V*J;Vq9n`2_lRETDCM3e9s9Xr7rslGZ!i(*qH2}#TK1KM0yC&X1DQXlQyENsw!w$ z!4wwDN_nNI)lebIxJiO9mr6|r@Rn2(EWBxDaIXwMQSO9=TRPi{`MxTaU1@t3WtT@= zmsiM2oCUs=YN}3WQNXJp?4&ods@|Yd{g+lWAd=|7=+q9lLoKwGHbTjOt+}E@M^J5)PKT{By-g|%fnPS&UMW>&wOU^~ zEj#Z6r6)-feiM2bncG$de-mtvz$X<`c6OEmUb+nY&dFxwvtpSGsbG^$Evmg%nlKDT zGp{`b2JaUe&%32k2VGVLbeoD`*-!%a`f_;Il~Cp7!@D6DzV%rM()z~MrlXM%>{z!I zeT3i8+D(|G%A4c<(uS?rK``#wn2v*6vv4#m2S>ML;~2qrG>u!vX#((Eb^*@j7UDE3 zy|b+FPUV;3SV1}V^4!j{%#GI9W5{g8s+xudRMxQ|K3v?Ujs>%tmevx8(sH)IKy|0} zhvA}K4RrUx-A3y=!kb`toR)JC(XL(vz-UJgB5hs&B|?N=ps5{xg3r(Q`yK^ta6r@{ zZE%G9(Pl!FwMDqG39+Uom0iU~J7^71`T2bCc>`*8MVv}8LaQthCNKnmu|OCJY0vW$ zdQ=xt86+Eb!QbM9tDRQ4!vx2G4Gry7u-nT4jqQJ zvl~^cB&rDbGOFFISQAR4lF&2^r`r#o&jW|uLM0R=yklr???f!sh|ZoK{`N+sdb<(r z?Lu_07pLz(p?YGU_~@I6@H=A*3$Tq4hI6xU&rHKNzX1Q@9{6a{T!f$d_(gb5T|nUY zDWr}ZLeJ6t7(TiO6UTOA>cl?0@wLyXUfy44T6w7Vq;q41dkT8;{MmbV(Yd^U#`zf} 
zW~Y#xnL>1Qlx3h7?#2{6GB%GY&qh^icbQb1UL+d8&Rxrh^7@(0X4Shd&|6UePSk7k zgb^)VW;SG*(X=gF39lU0i6)P2ql&Al(ZI#?C({VUl2z8})MD+jpp+zI~2r9kjP~3V3-`QvyKA5+vWA}Sw^BZ3Cn<*W)VL0FFpgA+NG8mc%q z)u0u{0>e5Jp(|tfs`$5w-`~qhvWxB~iGpe))~4mE_Bol6BH)s9mqF#bNnj_lQmE8q z`rRffxs4KA2{oQC0bg!$xl(?rN1RGot!zl>rEgn-oeBjxUJpL@NQq|j*x^I2GU|w=<0ShXCA3`0Xe34$CGh060Y;cWNOcV$+0n=7{Xiim`MyA73l*ibkMMhGr^6wFE|^3#LMu&(gd*Gz z#tFGNlD;T|KqUAgRC|7z_@3~=7PYIH9-cUBGag?Q<6lyHCP)@fJ!Bokq^&BFdUNP~-Nhv{|8$7aqc|rJZ&$5=Dx? zwY{Yktu4)jU@MwB+Hn2uofRZelD+luchE65jySDvYA97`$ucWL?AcaXOPO|U8*A+927?0K*~hLSoM_=q zR;=JBDU+n#xtf5iYcLXcdejnd4i776!b4_UlvYb(TO%r|rs@rL-srs9^UKvbC8G37 zYcnzBr zGh)m|=A1Yhg=Oth~{N6-2+3~orr2vy!Fp*KORJGEg8 zW@&eqH*Ldi+T4A#yL&ceV#n5P*umephk)NttAA+oHby26^7+GC^Km?_2uHRSVfWSo ztdfk!78g&rMvHnDOj)nOM$j0k?zFUJ8ebU3ND3|j$w5$A2(Ff?Ii#rcLWH1?01FaW zkRkIfALc&K63|I&6C&~=)s2m1=Ku#AKIs1 zgXa2ucn+RI9p7t|*(pA^N~aZ#$B;-R(MI?s6LGY*HlwM%1y4M6UjayX^!^WjhPOWY zHWgqGnrMAnhK7+EVkI%fN@KnYtqX&M(-@k!Poib_EL!$0pkx0c2KFst;^2179NK~T zBYUuT>;OLTr7tL`tqgKr>9+U)dO}!kR-VPlrBol$* z45U<;2?&!jjEZ_Q>ZqpjN@~=Ab6+rl27`@issUMLl}Mvyt}r`MsI#MEW*6T4+8?5C z$8nsz_Zpu5i^Z&-#MscxhO7vX3eM4sM*GDjGiP%~;gaSDrOIH=}i zOkg=JZFZRkX$4hk++H57ZWWKcj0&)rFqeFJ8+n6o$q+j$ZFtf#kwyrvfbUT<27%rd z0&(SeJ|}jV3Q^ihMODif$HF4%z${VwnY64*aSQkeQ~{iXTK1J8(34qO0=*L2MHvPp zZKw4OIt&es(8bW5QQcb|kA|?YZ17y0sthn>*FwLk0+uxuaIUT-^vdC8cnCr7>Js=@ z7b3JKACcABh_A^+Gc9l1dV+6b2KuS&h8QCow_=PTgPA7?!WlldK(H-r$;2$7H_vmj zo!i}8GqIOY+{5R0Z_FeVv$20u4i0S2!`>}nWUcl$or3it$uQ5M!j z-!PK>BWM~LM;AY%eRKwGjOMXfG>^`qX>b~@)?R1`Q!Q^WOQf0VjfyBvXtLn=o4erR zF}M<~RCEj~KUq&hI!HXUy&m3>l6cfh!zArYW;k7Dc&Jc}+Dce0 z4YaVdR8|6x>cqubN+n3JJE&B6!6?y9R#{ceffC+i_2W}0o|?z1WFJ-y?MD5rw@`ZP zO&DH&7kNh?$HrtYa$ARyGqVRdM=zrE?h7cp`~gO~+{pw| zZ7pc-=5K6C;<0--RpL;EBViHarH_6Tjnm_B^>m;U=} zhS0k-!W(uRgS)0Mv3m})`<5_&Uf#j9)^=3F}qnIrn;#1oUk~g^M%)tHn_O99=TQ3 z3VKDf!cw8NP4B|t8_(kS6R%+W$YqQkdK?FDzKAD2_GSFuKl~7{f9ZGeg+KWl{K>!k z7~lPupX0G-K8De~r_nmK3l3V~bN4@tS3mOyICWV##*J6r z#=gg%z~YG;7~g-9*K;ow`*sZNKa1||N6^gAFvUC3J-Y`M9{<+D8l*D>c*QkTvNEiO 
z+e#{0ITz9^yD43U^lTyQvQz>FY0KNNnf8_-d^8kAsXf^jrI$%N%X7PhYFehF$&eoD zex>8rYqeug2@izSv!XUVVK%%6KiSsgz4(+#^+E9)X#NC6GuYpf~EF zHRxavrOz^8H4=hUwB`a0Ok1mASxexpsfJ@!HC(G|sP=eWS62{xr37CQf~)foUX_K| z>TO7^%|Oe;c0QT5Ci8U%XmMp0M;}#R-zGjrm=C9CVuIynmhck*F4O96SCNfnTHkp- zHp%BD=BNNyRn^v@vbGk%XdDKc6Go?(g*vW!Y6W68LM7Haf)JHigvuf(5dX&&f8)_eco*MAB!IpB_eRk_x-O7m=}E#K*hQKHiJ|$pH*b4`66+5JQW@7~MXO>0Ps!-@Aml{Zx7f zcVqGJUVQS4pOuePBwa}@Ab&Ar^N4@k$PAAY*RP;uW(?7>egucQ5a?~Cs%NDV4Z`82 z%5z&`_gG=3f_6|z&n?c;uJby$T}t6SYRj|P9MJ0@&a>UH0c&Yx*REcz^5021C{cQ% z{3OeYSfzEeE~}{aath1UI5tsvX*oqIsx>k`(n8B>L?eMWJh_CUXRcuPp|jX^=o|(n zcOud{gm}j&D-f1F0?<#8#9R7Uw*5*G))R1bvbHjnUIC$3X|SM(Ac=I3Qi1hh=lMGr zJ8%&l3&+qra~Mt2hp_+Z^LXX+zlZ1F{RW=>Q({p|1I+=Dj|?w>>J^d2Ne zmT>K*cktAS&|B281!QbQFr@oGx zZ+{Wbe(u|N>fNtn-?gWaq-vZX1Z9D%>`H@Dd%3i_g_T;Bw70lY5`2hd7x?LvO_h{c z(u*oRwW8=G`?Iw3tqd|=8N@Bq%Vg$7CRLtfu*o3!q%$ZzL)rpmEMtKJpR9RPR9vdy zCxgCa?5^b3Q`1pPN>N!^sqT%+sv6iRj%0U?#3MrEUpT)#7 zpdtK zkdP^C9jqwZ`Fp$hI|nz34Nj*LOvfZup3Lc;-nb1j+@B)ohBl|6pXzsb!zMznstV;5 zm8`Wrv=3gGoj$5P9~!J;a}%s}l2mg&2zOCoP?0r`Orv#Tjuv)Efi6lgK7yS9PT5sL zQlh1AR9V;*U)wS;hOV(0^i0m9V`3KFbIYtH521f~FAL2QYrbW4Za;v5J;%|wv=7du z9f<5ZM0GO;XR?QY9U#~UJE}%AVQX*dfrDXf63b01+|&k3(@M&CN1_=%!q6RWhKtY^ z)oK>IP9SK(JnGNhMS06O z8vImA$tY|xzAfZOV{;=Cw6ii~=i2?d3O159O4ObDgL3ab84uq3G<+xb!fql*QUwoIF={Bh^+M z3p*BR9sMiwFllK8cw%d%Cr_3El`(Y!ymf0=RGwH{mDXJ)wcJSCyOKe8gJeqHoUzgw zLBPqtT>;*ftO7a_A8NE_1wE@f42wI8k?EZnn%ark-N&)x@CCFE&g1CCySVfGn>ha1 zU33i2(&k1K^mHZ{>NS$pTZc*+sZiU1+~Nu}&>DLP$C1S&7+gGtq3x$x5g$P7I{_{9|`9nB* z{dL;mw+O>e;idP!h0px{U*ns9`5*YqAN~X0{r*4U+yC}+eB&Q~geN}vHJo|&qu6)# zDd@uOsCC6rP;Wy~odsnL7BrZoosyQ96~Wdl8JCv_omG@yJ|BD3Tgkd2(36l(p+}0ZxTIWdrDt3eoMcLFXwa&P%jI&bL<1U`tz@(+ zwUt^%_{HXi{f}LwOp^NITi*~n@ zk9AWO_ODOF0AD{sSdQ^;=?od8ixA)qu;N`+Sy_SFx;p-VdicXJ*4_c=EVKgLnmv-b zDG5s;24{>mv$$0!i=H+87ah`6-R(b@{|6x;~9wt;bU3{9$N9UMnf|0r6ACeTe- z_DnA*YuvYe4|;a&MaSHBw9W27iU3K@?Lyby6X-i}0p8vF(RBO_ytKPELeJkis8&L= z(>~i6_NHDH7TOyVEw8?%LxqvBGbP(#p!L)3@uD! 
zbZH9X+ov$Ki}sdEZ+7ppQhRfUb`g5-5_RuCgxG8Jbs>ADrTEhA{MJ^(3YE}ytEvf2sQ;e znRZ6-iPDp;C_RDRN(L2LT!L@o)=Xp&bUKThH+8X^qLp1#si0TKN=RpsG-i_3IEqMP z7Y?3$9FO090mm=hQ$4S@pZ^%1|LEuO^oQQX(etXRSExBu*aVrKtkbWI<{?7{1p+U6Uj5woW6^ zIZrUst?W334ua6%F$Z_k41(>;aHjZt`vMY!J20^0B&LsD#oX!Jxcd4RaQ~g};46Rk zL;Nw---}=SE~d`jLF@8y^zS{3mg)V-*ZGiHX+mzb38f7-x~~w7cAx6Rk*qEmIR$Xi ziIr5AhWbZK03{yn%aEbc zvMN*)s0D?^&>MISWyq1$j+6{DldML(e=4eK)!1H9gPS*QLKH)S`g z1f!z(9YzNn)fQNC8erZ+mAA1Dj!M}t6x&MZH8MH~yjDKmI!?8>xD&m*_Mvyr0kqHWL~?uqjk7z^ zec&WIkDf(h|54bf1RM7sgR^@~*^W@#5S;N27Ao4?RL2MF;2Pot91FgNAaqfM+Nlf; zRE4IVQ5d_2p-px|M^M*N{gqJdm3jlH2}V)v38TstL8T{-N^c4ko)pU63AIdQwMRN| z0;rX?DT2>t7t7{Q1+U&7M2*!CEkUWVyV2kcp_bsR=4V%jqNt!{EpKc@VMjZ(^YgIn z+zsu(D0Bp+(;ri#*8^dhO%jBxD{d`=U_fYb-ZgYdgtabJWt^5oWaQQIA(UwVAsB7EbW}f z;{NUU*k?aUt`TVuzej!72mc<)y%i03^yCpVwlyNw97iB330vK2=7rs1;Q27X!kXOf z62%(8;`SvZn^3dS)fNzTT`Rv!){!^Vr|#Z%F3Ku>AX<0Fj(`hq7IhUFa8HlDZIY6}H*ErowM0I#Hm^`Q^?DX~!oS{}sEFHnQ8_(nK{y)G+KJy(U zdKNIWcoGNBKZWs~=iy8Y6L^!b#RlmlhT(0V!tT@eapu;Cv2f%T`WH^a8XZBja~XP1 z7aHu%Fb6vLSU0RmmfxWz#0kc(LsxL=t*_zJKlpbXd+y^n#N+JRdl5~GC(z0`aN}i` zX~MO03Pu?iZI58ziHoWSS~?*zXeC#YkIDi@29@r9OIkJoCq1m$s*O~#%%rCgdRsQD zp(dNRY@r2~VJkVvDv(~+0)AEj|Caf<#cE%s>1E{=p@3j@QSg{74zzW4V{l|tB^J=J z0+aPkB(Xr9M#eZ+puRyvMOc6>{2s|xT~Sen&K|lkw;g($8CvPZCr7NjcN`iUY-I*$ zww6P?t_+sdyq>EFJE}bw!RJ~d+Xi^n)>G}(5PVd9gr9eH9(+`M{xxX`tl5On+6{=T z+lUw;CK09Di%{)_X?a5uvd`DWxS!zjDFU*Q`^_pgp>gd-tg5M|(yOgOO>M2JnN{+D za2O7ULnZ!{%qd>4AK_>cp=cB0R8{e|UL-ntk?QV8bKel!21n68JVD@1p<|3eut~HN zaw)=4qK%eTLR4Pw@@@ieKiZaeDd_dk)^_hYjOHB&5#7z$e+038M-W=rO)E=VOX&HU z`ruAjuLOcx*1YVi18Rda?l=)jx>Tg83FGd*B{(1;Cw-XxP%myk%je{1~ z=7ZTAgVE-J-pq?@cf#m)!%F36;cN6x0ihoiVZyO72~Be|k1@W|F~r*07l^>kbL@?T zaQ5;A0`Hgj>HE@kAkg_0{;wZW-U*H5Au>qT&UohAe+2*SJ19JP5;;6qC5O+W>c-P3 zz4J86pLha>E0>WvbQHZy+aIdE2`UvTy*)G7zC4Y6dk8)%zE6JPU4A5?rq;$#U-$5z z?5sdfl%T|sV~1$JlZaD^20~tyOh9H`SZ&$DM-%`6Y*9%>K~y>d&%m3)s36zW(u}3; z%kcVrYM6>72Q~8MkTgRw&asj=M+Vi#I!PTxrKc<|fhTiqCCPvQPo*rP+S?=`1f=U!uX<1P;t~9?`?9dvFsIIF=1ud#5KTkvky>}~%y64m_Tz>FT 
z96Wy?$1XjEr2`kyH@*vPeXNXWcY`wK%|i#{Z&J}YvWSCcZ{b_t`v?5apZyDgM>QSl z#>CFEnA~*^-BVQ5%@Z&Nx)ErdLaLwVnLxbu%qK8^=mxr{k7ID@92M~_8V7cv&f3V& zrQ=|^awd8(y89Fcs3L=+3Qym|^4$+(`Nk{Qefu@coPQFr8J6|&{b*h|iTShl5$s<^ zji(jmX1Wk7E679#Dro6N;pOD5*i%Ukx_)yGO3Lf0+HzH|cg8m9Y@mwUx>XG>76mOs zSJrRZtU5im$t(($_@Gz;FSn?Iz$;X}FN^%k;$O02Vwwz**|d==F(2_%LQN6X+bq!VwE{g0p=Z;v>{J=hkX;GQ=5m;6 zXYH#5eDx|;tS-SvyX#(4Pw3SUdX?OkF!C6ggkhSRv+E_;0)$zRw$;zy?RyyVxP+|D z;a#(lHhCk0j4)v*V2nPDRrPgMtc@#C#UEiX=v4yH#>N!`pF&0?;JsG-y|BE zdr%i?BJ7eVBm64DEvSmLqB7Ko3aZ-bPz0KI9Q9EuzhD3rJ|AiVVbq1Py^#@tVL$G;*aJWNiyqJw(HF(1?P&Hb75!Lf3^OfDvC|r>+E?&J%%t-bVfs6o$ z8thjO7yfz3AlaAI+aLViAHaU=D$)+^L*|}+C_jA>4fmge_W75gf9@HWE*;ZC#+FXHI6XE1U23I+~cMc>ieXxn!c zNkX&#;AKpoy@#HCk74N0H4GlOf)n@O#Qdo{2y{!{ydIc?EvPoQRjR10yi!Oa&`hJTD^WHR;`0)^*Z?0tVfWKON1D!>T4@dQ%x&D z#iy4v??xSxi6~m=&0P+&THqxS4Fhy!{~$UChtV}Og6@%V^o$Ak zR^TTPY#$z{3O6krjVSNM(gwfI_GzwV{tc{7xy5wa~~S_ z9fo^q9(7cembL-7+6G}IN0qOqNP?i}i<7;CBe$_O!8n5Qz@h-&^_6HXFxw7gVg zweb{+gAo+dl2$je@FiRLo)%OF6Q~TwQB4I`MN3;p0LlPvBVlO^CSmo*VGFIag-Y1N z2{>4L+Ja%;2qA(mg5UkaKV0FC8b9(&{QT#V*Z5cX@vkHuJi$kmCt^?1NnQQ=SE0FZ z0tKfJq2}yqG(2_*`X}zg_WTR5KYbtOvnSwRm_ys_l$tL)wv%dac}ziXFIC?1_IcH| z_a>nyczyq2&+rGKHY?a$F}J_M-b4Elic^_}{i=7?=Hd-NYb#l3Oja%7W=1R#R|{{; zGM+{$;19v~XsC*;XDdB}k`79GXH{p#hRtelvZQ$bFZPzu+q#(wg*Gi?TdqpbDbOpd z&>+9O0VZbr&ocBtF6G61O7fDuX3;Ryv<}+_#=h4e(>7Pgc z~lcm$u1R@X=P`4sr77-a-q5#g85kc0&pgxwYeJC{K2 zVK@mo$HTA_c=ETC&~vlAc?mv&pTw%#%2L$VRzllQ1B0%C{#%cr&xxVlb|k_cv^PhQ zi2Bjg7{~GB$I#Q)i_YFYboTe7qkjOM1B0{-LkfPs4nnSf2+fqr&AkK4{z|k^^+_~y zKSp3i84)VRKxaQZ9kj3AL$LP_!_qqhJ^yZK?0~(UH73=bEhRm+-Bft3u(tL>Pvxgg zbU{m{*U&x!9}BW(cnQY&gJ_sJ02d2kX2&dQ7mveDyW>B94~9dRP_^q6j8uJPOl#K$tR2O^Mray)kuTtDY)5sf9rZlU z`cxaLQO!~$eEK?0TzeXKUiluLY9p8;`_birAN(aU{#0j-f$0U-ytn7!DsQymF&oZZV$7wx-s4$0MPYqy< z_H}sAS%iA$p$)X7#*rXAsnn?Uu0Q{Av`-(!zALX{^vFE~hY!KqzXL9+xxLq3z}>gL zhEM(O-{S3W{S~H<-Gn|$XVW->Dq8{#t|Ut8?T{=mo43*47M8E{;!^3Q7geIx>P7p= zEL`z6lr-2dvHcKUdHeGin%xVnBMOr@g^{@fXr|DXUPkW{fh_$mxPx>@9)k;nqi26CfC+2UliY$n@3f`g#JgK@C>Uq{>TAOH*xp 
zjm=F6(#^=CK?Z5twK>$#6{~^3q`IujD~CS445rPcRD7kdtSg6gZ6(21LqLhjYk;3t zS3Qbt>e8OxioK$%d4#MwI*a<%y!%E=U2s{Y~p(h~#bP4q8tIA-| z)WT-c!(!0FZ8O13h1b)Xz{2Dpn&Y&#Q6EMIyK!LuZgh0Cp`*JC?Gn9Ie2=29kJ0}@ zwD$BN+1^Df+=G^$el&LVA<^E=g4lx?Biz*kZ)-a|%`!!;9qwisb0CPVb=wNDa8UB#~*z6`$P*t^KxyU7vSSZj~`W8 zutnvGtu?WLOJ-!f$-~1_w$Ayeh4Y0H(^O%9DfzDmezNtt2Bg@%wyK;}KS10jhLa1YX+yqAV{x zBl8#)8#}ij!wZL~(vD+t_bCh!c=LxYVus~pC&72_-YfX-KmQmX`rLOgb?`AvvvNFn z`z3S|n9ZZx5$&Er)8IU2_MAfB^d2?&zf9{;$wf`!RzwJt?xmv$4=y1xxX5xkh6-B* zhEN-<(JsUZ!f5X@`j^k6ef}JR1N#u@+m2xW5{}a%fAs5riWlGeJ^cPZ{v69^?!gov zge5TuJEPVXS2^_xtBlAksY78oL8G@L+SZThJx6io#*?^o|0T4tj5fI9h;>gQ-ZKrI zhb~6vL+9uYTKhTdI`TLs`MYa$US5YvY-7ceFP$2hS!%Gc?8{bq-X)}MUy=lSJdzQ8 zB}uSsorIyN_*#t?)eRcet||kYC9k2h-O6>8@u#Bp%+l1sN>HtJYyfURk-s5G`94X` zt0}LACc6~+j1uTJ7elw9nD8rC1D97RlI5_$dGqgaSkX<0?a44V)$e{4TqimEf~m5b-$B6!v1r%qaTW z;^=KlVEgI@?JLpU!RS=c+SNttyAqA<9S9{;2&bATw_8<4l|Vw$ zP_-b)2(`(a+!lDbA877S1DxGdOO9k4oV2hGDlr*+Y)kUw(NfC3gRnF5HDf4F_v=GZkOx_+^-PpM-AVFl>9zAV^D7H?<3;@lIOW zWzyCHqn8q_F)tc%&iPNdHfFY51)bg>OFWbQ!ySn58d&zC|TTv zwDv(1_fMmWH&{LaT+&0k-8qPg)*e*0bfBcA8HH_)$nQ*{peu!v_9SYWV)(}Q{zO6M z*YKgz`{|DrlGF$~dF=gPDDXY~)<@yxxwi~c4G#_R_fEn$JCDHpB4TqgICc^P+MLH@-{&P7Pv)R(G1#wr^ny{YQ4;*{^(BrJfhttiD)TYdL_G!w?t>@TB8-A631N z_SWWi!RRzWFH`s|^;CLwQ~^fx4fJ4fd6A0Fpj6%?g=c21t}&QVT2-sEaBSF``EV*( zJ{3v@3d(8~{3HdE%p8z8ozf001Ag{Oz8kvM&Z-&tkq~c7dxwvJtU$Pc+ zsVrac(1zI@E;!vj1j8}a;TECdZtE5YGKFYMKROA7`Tb`xx#u*-cb~!TGqR~J zo5bv~>v-j}zlR4@dw=pTzr=wnFTl|}4hzBOObx>n=zu5Dhj250&(Iu(r*^PHx`@3e zE@5b4AKF+c4=x^L9HMv_qTQau;*rbf7~ex+CeTUZEh-#qwi0Sv;aHajJHxR){b8iR z$#88*SK;26fmM<>&dY;|$jW678-gsrE#Ux~f*!Qd)*jqGgNgnQ?3kaxSJ_upoMgAZf{3ZTbs(3;^9ei2Sc!doA%Zn3Bv55Ri(-k=;^Huw7448sj*|%GE!8nD+YqzU`d*}O?nZj4rGp}j3rx1Po9A^THvjOWDc#HSr(HwIF)6!xNpfkYkbi+&QCCe5xN##n*76`?VYGEbZJ%m)}5JFVxjRUjTeflQ$QSI+O z{{*&k|In4EapCEY;J1G8ulS>X`6(_w_ZCJET)^#*d=`5y-A4~W(lxse9g{mS&T_MO z@H7nWsG9gM$wup}A#_s>j_x~)(@(vD$6op*8mIT6bLlW512d>6^t^|bPST_s~@Hy)n?ZeT&Y1cQf;2N_B@VXeg@&TaTM1(u_>zp 
zMO2Dqvcx4Jlh1QoKy@Y=RRnrv<>jcYt%Y8%r?T@X*a_%lT3MW~L@n^e`&hu|mre~r z&q#$`r)4E1iw;SG*PMJ_#|-51y2_Naef#z+U>c++pU@MlYm^0|>{4F1uVsZ*QwMEX zH8cg4(C1dcltu7n)WWj07UoSgurREfSN4sYYoOc2if|J_xv2z(&3Q0y$yQ<6ngeTE zE^O&}urh3{U@hsnFssOcC4}U?U zG1k?B!#n4(fBPKzI$O|~jG~jUYi(;n3!}BYnNqd|EmVJvElp}JX;6A71AaLDemGbd zZC<|`&LPnAMhUxz;gE&6Su=Yg3An>?)%zJF_+0)dydf5taEdox3Qhvi&TtTV4#Llw zY^C(4<&|DsT1qX$NZ@H16;z2;R9SV&Zq@cy)7sC!2cV&HGjxxmPGH1>SRL;{O}rab ze0?P?b`9-leN!v+jXiwd5S1Pa?c8w~&fSOM)+>m-{yD@y^jQ?{x`5g}k0Ey9DHwL1 zLFvpP7*1YC_=PuMd+J44?>z_Hg-sF<_kSiYk+(^Yhp57z@{alC&++qL{ebBFH!}&XmwbiMd`^b41u0TZ$wp{MrA6NG)|)KvI;6x@1qPqDXP?> ztVX8>!^(6i>DiT#8%EZ^Q0;y!tT{zhsxjUhZB^q#^;W9Bng-R-UQsPmC+n&5tO|N= zv0%I*n_Ku=R<6=km246hGKd%*LF>RYBAsJ6b>n$F_T(El|KKAy^TexII(-B0{lQ=1 z`~Ut+yz~2iiJ|@HaQ2xuaOtIwWAEb+FtGDDI;MA{lj`csji=DuHvx^!uQHzK9Whvg z?dV@Pg0X!U@%T&c;`)a_kM^ZwNKWiRxPJzvCNGL~c9a=hDAl`BYv;W)xfgyaJPCVp z2#q7V@yy#_!-F@!jL-eS->d0uy-Rc^LAsi74@S11P;2P649vq#tL#toAeic-IOsuR z_cX$-BWR?$>z&@OI$HeAqu77;4o+Qt2CaQi zlIO6A?(NFsSJ2zttJI^E5sgu1X;@Hi(!nH4f(rO#+#gk+p4U}R+iobTgE7Ajrrdg1 za~srPW(iYHJ@h%X(B;%Xn_B}-UKJV$?wZ0r8;MjA&Uo8f*Dk zb^Oe_T9_K@VbN+})-+(1?;-Fal%i1jXf-mYYBgtBRxGja{@j)z24x=pzgU!sg@a z2|TAi3@7)4p*RABp3h4lO0t7sOs#9{kk=57t_W3PlwhQ_brEtFg3pxdggHq7(w0_J zsnw=rI7pclA+zm8l{Rne;1Bd-h}3bS73PI zB^YnsLGk!J>iZ|5?-+!ky$|(Wy{PW#Lq%UdVc3nY{`sG)*%s=bfJ{M;+ehG&oKgZl zd0fD!LSFaq_5AbwpAu8*ad{p0f3?ysDCypR@OOVhYrBm8nQ2TN*oWrDDRi*%>YSUz z?dP82tEA;nx=mKv59K3&M3O!N-}2twDwB$dR<^-d13j&+!J?tHHDP>m6x}`Dko9XH z_1X&bq`|t5mPdM5W%fW;{>qA6a>H%TEJ8+J1qv%xpeJL^)D)?Wo2l?*=0JuTn7kz| z3z>Y6v|UM}fFQ4*Gmt>NVI=`+ISaGQ{j8~H0p|C)JhCuVP_;qDd2`BE)?!JrBI_BN zphcGiw|&Z<_Rk;0sar4OtKa)aeBlrO27mNVKf)hV;r-6v{|64UAj+iP~(iC%;-ce)oo6d9>p3v+Q+t|d3YIt_6e%Mam4$Vaqj*{@Zvk)#Eb8K3t#>I zzhYwlB{cXtP-#m->tp2}?SUuRuX6pRsOU!K4kFnxfg!5Ork)v$EgZ(o?$f-kZ^7(u zMKx7rrQU`X!CcmSpsRd8oe+qLmb1=(zO;LN| z2kTmK#a+mvT{^W4?Qe!Oxrn-yF@<`Q0a}xu6}BE)Grz}b!YV)M9&)p`WC8YBt%RNj 
zO#v^Od@e++2K1)FIJ$EI$M$YVXEOm83#p#mcsz_K_d}6@nn&v8dt81G>|Qsmh}te8YqOb({d65=a;%zE*k15^`eX<<}LeCg!Q|Y_43{9jJI&LLoFtd&@G!lBo zL?86DK04_QrWF-9nL37{7yGdHH2R);9q!xD!%b^zefA@;J@-*KpZW+47w(~A@i+=5 z_M+y<<8VCr8f>q>joKGph4#6ZP<7#P6iqIoj%v@)JpxnbAhcZrRG9;)>X9*pJ^0FB z{2$O$6K+@jsbMTD5B@sj`G;+Pig@ymo1gyhhuq3*eu1C8{}Wo%U*daz`HTm>Lj{_%XLIi(ojoGCW0qC*YG|DzrS6v@H3>WoltW4qsrjl@qJsns$h*v)S4gC54{DoRo?(BmP;V{+T^0~Vh*n1W? zKJq!7AvktW+RAzCx$q1gy!}ml z>|1}0+i!ds9aMkXP%BlQQ?0aCqOqtbs52wfK7^*B1qHoO=L|w!bC^AH6>og`k8$Ct zH}R+c_ABgt>?zfzS8NEQ#6Sy86}Mj&8b5j&J)$C;hgqQ=N9W)I&R%&6dyid4TmKwm z7Ft^bdF52n`IS`HrL1tMzz8BqK_$sWWdLzybpQL4Q;N^)rusKX)|^U|l$NXH1d{Pu;3rU(dAyySJ$U)WSFmT-9#~iz z3Q+A7Qlj*>rE<%r1tDaHS zJHqV%zWRf|P^sHLkXgUx7~%8lF>@oKoFnnR(clmW>TtWTBc| ztY>V^DyG$y#dTJOvZx91n^#uQPp37Kj34RS@*pdERW(@9*f9h*uUVbJp*k^SWxg5$ zPg2br&AcgCDLPnI#Lfl?Jc%ePR$rJFt-VVnrVX$Hbn=>ajxS-?sjK+*5B>#z`M-XJ zfBYF#qJW;|qlDC9?4=#N_WFA``~1hS``R;Dx^M@h2hU>Sz!@BR><+3-Zdh2ZYP7WT zEO%mW9kid(o&}YZ;MBdhaR2S!#?0|sFeL^N?VE)u*bEJ=aE---a=jh-wX|~qDyfkb z=miPAK*tO^XAk3JU;jQ%-Te@L@9%zy8?Ss?&FRh81Za_?RFli-p{m=?3ij}YC$K<3 zpP&*QnAkzEkHg^Lap_!C(Nx(4tK<;OEv|yBhPEvyAKS8X7};t;|H9%z1-vR++REx` z)N82T?RHjX5e2yZ{(kiJ^`W=7_k*oGo{~kp=mbqvZFQ1my|`4ZKqoo)atXd7T1%NC zCz-KF$0qUmhdzSE`6bn3D{4>5iOx*mJyd#+prBu#U@xMQFtoKB-qub;I=kUyp>-2-exBSQ0T<+Z zCB3>N-E>nydOaQlX_X@cW7NsQZgIiKaNDRZEe_tKZrJ&_iGQnwja)%BXjR&V)L`LI z3bt4a+_c1QT4R~V>uT+TlhAYVSX_jYr)L=Mz7g0N{<$TzoID5Hj(u?KI|kp;3$W}w z3CrFyh+MskHY&TumtKeY`aSrceiiW--hkuEJ(Mi%Mak4Ws>UZ#H#~}l!6DSk+P3|} zs2UjK{t&+M{{?#T=c5hqf+Tnh_)TF5I}PzC%9mr_w&CzUO@`E{wtD zXh1w_Lrb%tD!`9Gz=he_88p)BNQy3%H;=HBaa&RVHB?`chFmg!$XXWbH)UYMW=VY{ zi7N$o6}%a%kxknxOF2r~qZQ!EIt_fC3{1`{_@EP`scS@KQmJY%E9lkMYt=k)Q4VU| zWgjb0LQj<4BiPC6d^Wm+a8o<2-!Ocsc6d{rXd9Wwp^LZisjvSb{*e;<+yCufv2^Mh znx}SSic0CJPkkNFz4vY0_~_@caN#a?UcQg9L+5bd@;$_QCltK2HXkI>fee&3N#aj; zia^?qmdS(INwv4{@=NI3eh$s!`!GSf8SNZ{&J~5$5mL~U?`dwcWM&)|)({0ZLv#-E{k;g}k>;%pkBy*_~RcVEG!dmlm{;TLXYrQ>ge)tf+B ztw|+Gl_`6Y9$(Vro9#a2P-*An6{%@!c@HxrXXoUq)KM~Myqbz!Le{hh1Vd<{DCr*< 
z#PG-{23TH)hK3aM+NtuSGebH!qzz7{tX5JXmJ}ADn4g=Ok;%`^qq;9a2De32fOAXR z@$pZ*g8`Ovo6N>w*^p}{c2h^_7|0F-c_AUw)s)SZ#|cGAKr2vt6tcLKgpt-%ranrT z`Fi;*NmMGsq|`$4+*&1npR_RxK=~fEFC8b6_<&*N_e!jCIgAMU?4)V~)s+pSeQofV z>foWB@tJFI|LSSnzI*~raW_4^mp(rVoxYZZU9#f}!TCW9k*+#^iI!iiv61cso{hDK zohQtZOuyXGFaBvC+H7Xn?~Fb8F5LI7UrNp=%xw847n(@*Mn;@J(8F55L-`t}UH9@vbaE@B$UVF%JYL=$w8fGU+Ai&a z@g{g1Tj6W&fRE7iwfDl`IiPX{278AP9vDJ=bPThnPht7;WrU_@;GSTuJii<6sa^0c zABJ7#24A{@=&gHjUc8Q`CtgA$t&)}Mp?Ycuswd}AJ3;7;QrQg&@CH!RKZuIKG1T@G zdVlrb6sQzq9tmhVsg$3LKV*Ag0M zC(*Tc7yL84nRai7pSCtRKaIAzNkj)&X^m3ljdvr`*MeY2iYh3Ki#KoJr~G*NUiJT9 z{sc>N!w9&wXm9plu#dK1w4Yy&+(N26f=?RMH*d|r`VE_u(vyku0zDbTEb2tkD20+8Xp!U*_8=tXdv$}J zsze8^WN~2`lWXgh<_^D1lMAaQNZo{=)f=QD3(+~WAi+w^#|kdhJBE(21=Shx+B;vt zKm8bZ^Gm;jvAw6UNbp^H@nd-JkN*y@e)$h^`q?*e^!{r&^2E#7f9)wOp1cfwpz#Cf znQ3q9sM^fFI1Ij&%7olJz88bLE}(7p1iI%>A>6%)E}01w@1fI)z(_zEJP9?Rxxx}a z^Vkl)W)8L_;l=P%@rla&2rcflSKh@p{@cG}`)rBgf zB)kZsN@qob*{M=+$x^L4lS~t{s62PFNWTmvk(BVV`dnuAN)Ek(qHbbC{M>W`{&s+a-nv&^|MYOV2(*dqWE`HG(89Vsvr_iFrbA zViC^Cc{t~Gz_t4@baQ**+;IvH*3PEUU9c03`mq`4#>Szi71Rw4Ktt%&^0UiF2tC%O zU;Vqk{g;dD-Q|mB;;X5xc67O9Ujfr{mCzXp}_h5fB$E6Y@b8Z;xvzM1|BNC z_`yAJEKMV@n^tOZ0^x}PB&G@WiFUXKQ}FjU!$;`(nv*;iNj!ez27dGt(tvgszxoN5 zXZw)|8PU_`$6$Ai@C##+K<(-5Qj5OG)FJ_%S_FnxLvpMY)7oayPRY$B1B_+7od7R~ z%1*X2pLSbzu^N&hsg6|Mw}(nk8o?#kn`BHjIs6DF+R@lKte_{^R%&@8t5wxl#_TKu zc2;`+ptRw!B9k$C@kZDQL1`lnHg&8lKhRC3*F>A!Gm0eD+Kp#l#}~fyJ^c6o_!XXh z^Rqa7+|@`@Bbyf`M3WIci;XJ4%~i8DZSlSpTg-UUqkEEPE^}NsI}5TxWlwa zPBna{!A|hgIe1%#R7RA>@%^kYx1(|N033-?s^~db{p|#B6P-^KCSMa8TnUsKgDhur zi1seR5+8yw(hqBD1Vg*ey`ZQY)&e7 zihv%fvE}W%uxIZ9?AW;nQ`55q-Y}9WDrZ045>=E`G}75oT`kLOmeAVL&eAd$$}*nm z8LZ$ku{DeED=1JMm&Z<@#k~j5Ak5D<=}ok&gcHSyC^2akk zk9>J(Q`HJID{H38lAL%}sw`7WJAA$U ztkq=hDxv515_E33+;+8EzRPW)&5~5^2KedcBYGX0v<>Lf)u6Ag1mnhPjPbbIG!^Kg zLY*QYC%iVa@;%K|h)q<8QBjFu+U5jrl2lyHGzg{=2sWk=ZfQeIW{Pz7p^3-QJk*C4 zs-c+!Gr0fyJ&Y_(plfm%U811o7gWO2*1271UZ6#poJDA9H|@XYLmcWMvZqub#c zU4(UX8s@P{n5h_z!y_;ZjY2y-j=J$_=qIM|jlcgpHFQM%Bf%aDIW=_d5%>glva5FF 
zais`X+VW&qRNx=~^WUL!*F1dV14!Ki9~hS4%KgP!SKxc}NG@wGqxYkc<~{}V^9J&QBVi#h;z%P^WocEQ(1 z%i287@7sxbYXapOFDvm@)R}092|Zc6Hm{cFF5F4WyZ~!_5QY$cW3(4>+T`E<%YP^I zK8$yN@2^y*=D|JZ5aR0=j^Dt-kt+&(_GmZnEvoMEdDQWDODEvy>;e+3yz2=VSxo)W z)G}GPzp$8UkHE_>Ch%&_uu^qKn|hFJ=|)%YFcy|~PDu&8kti6IeWXzQrx>rh!;soEY(O65BxjlC>hB#R(P zZa`TOek0Pewjet%gHX@nvE}m^O9|(4Mj5o^L0w%b>~mDBaI? z%2X#6fY$}1%K^L3N4pV*pC?dOH}f~Q(c-pI>G6gm0QKEe41Kh=U7c`IbvbB%_@pis% zl!{>3??Z3MhlHOtIp~Ei5mTTGQrQF(2_)Lu)zr8+)n*?HKwnQEx(C~E@XQkKy?P!O z?jFL_@)Y_fCosG;kHLjS4Dsgap?z&%T0(4o3eovl#1gr53A@25=y}sPMrQHNAN=jF+tLU!+5aK`5|}BfDP_m_4FG;XJ3c0_|K7j; z3$5!2f^)QSgtKRH3Vy0R=k_IN_Z@_mHE#8Zqo_W52$ehcqJGD2)XYvmGd)a;-HnEx zE?$QYG&CmA5b?tl_TcEbBmCqaVehU=IgYE@?A zT8#x|)mkby+Ed<`W-1bs-NSO{hSS4JkqXajW%=?1VfRKbFh0j}mO@0DOX6*;yr{hB zA{uFpn|jC5JGmXrgR|Iq@+z)9_ZD9H%(rm)#ZTej&DZduul*T*_n+R!m;d75@u}ba zYux!+R@rS z4re%pvU&rGs~S*TS%(~2S;@YVS6Yh(i&q`DjNMChPpF>pNxGIpC(h%<*-O~7{}}oQ z$JI)MF1Mcw%L*-lRae(QODn)DM{zOLQ(+PEXk{gZl}tU80p@ErZp6ln43yQ@BFTH| z+@;HCYwv=S*Uaz`deVz4&=YqkLP>0?bS%{ABt4ikeMrjtO>_fkgthcAm(!-#(S}zO zh7G*_e6LLlvqJ-mQ-j9#DB?{4*j;*-!+J!*4kTmJoTcS=RUzOLuvYMQlp`E5qq*6Q zRLTLLPlHvW&RS?Kubw}QJKI^#dfr| zrC8JJU~|!@JL#)El%4@MtUfmZMvEE&YL?f(iZP0YIz|h$W zXLFO9vEgyif;w232~I0N)5y=1w9_^!yMSJgl+J)wjSiiZ+TD5$n(HdiW2nXmp*PS_ zh9ONYb~+t6PTRZRYQnfqk0I77Jv=6vQWuVf5K6F?OQzI1vyJVoXyr{GZ)-)*;0W4V zdoe;awzy{+Z@l*$e*bU3fUD1+!Q_qwj4$uR5TPf-KgMVGVshyadKQ<^wlIqpN!dO( zizYtTG{+j3wQhj$^AdW_iCI_~X8vuY^3x4X5_(f`FuwCQ|A_x2Uc{bCo19D!`*+ez z9uuIcp?3^XbMlxx|GybBa9HAx{_&p?o0&v_3f(z704FW1gTS*>k=l0?Zih~w`ut^7 zT)K$L#~wrFsmD-#;3TSe+%>f7^^=pR?C(cyZ!fA_notuDp-eI&hn?7eWCyk{jndkN zG1Qf$(u-j)&qFL4Ru)q$Lk3jiu~?6l#_gP(Y^+}|%Q>!Ai~mRxf%WS*s5DYCEJY;< zWhgr<;?bLaIMcwkyMggTSJ1!XEc%vCVw3<(_RkZ1 zjR>|5Bh)bgf7>`LVd)+2f;&nG*~4@yji}bU3B3rkjtIrVI1Zn`g&LC=vV5b}8%J}` z1e*E?>qI}yRDzZ;VLdQ|#S@R?D}VYAs#E3luYDhHe)F$!?1?wgwRi^Mo+V80e+-R0 z23M>T7JrKHlpf?}s?G$Dp$)#qF4USmsJD1fD(nAN)+4{79#wjWdc8Z*LqPYy5~RCn z9mLG?0h~C04JXcB!Tj<*I=F5sXdhjijW$`Y7EiCPu0@5+buOzQ^h#A5;iFCwY0Hyk 
zD7P}!Nrn~Pn}ZKCF-f8V8CD|e(m1FZ9}QiRJ^-uBuExz36Kvb4(z1)PQ9*TPbsKqq z2GQQtjJ|fJ^Zw^N!r^c%HU?!4!ybt z+FBikl+C5$)6`&@igPbd=&VkI8J^rBhaH`Sa8rj(yPME9 z(27p}mhRCx3@_}&sk;~Ph3|a90C;>VbqTgLq9fxqfbAJ zJ8!*>+i$;x+n;(1H{X2=S3mg?+bpo#C?WmhyM){$WC_eibDz08Z)wQdr zyL1C}N6$h>wWpn#Ma9?zN(QL-hDK1+*@@bwBx-p5%1w2&lv?z3%X#t>coB>ZbYS}; z6{>?3oRzhd zvx~>C;LG3p2YmGF-^ZOdzlOKI`B(V-cmECd-})`pSrYG^g)h|$SG0|)tpnaf2bELSPcqjosszCiM0mJ&)zR$Iv%2 zgFvJSI-?zR4F)wPvZ|Ukv$9%^$tx`_|Dc^v)SX%&TrEGj73()|p}kF~DtDrroRGoc zYJC*RtYW77Q~M^>A)@0wTvSnlEjineQ(j0&Ygl1;k?KgIy{`qG{cY&zX+^3z27kzn zL}LW~!(B);2Vu0;!W}Rm+3KNa@xbM;hs#?>73ccDn>x?vwy!hIJ5FX2cgvQoLW#Y1 z5(Ehl9Rx^#=#{7>*n96)5-EzRR`1Pfwk3Bc4FzEKI#H=rNt=+J zUI+YqzPwt-3!rAilB0;0R7P@@jF?LaeH3rAg-e%4+1Y)85|ZQl$MjdC~-BR$(^s#bOCXuZSMMf~c#e5NgZvpf8}6A!x1_x^An-TRi%v*#cN_8vj&+zi@gCeb=Mhxp(W8v7;?>l{I-wIAWuK15o3 z;g7e%ODh{_>3}5?LrpXaEd@?Vp{s~D(B=kE7!UF`gi@bK9uP)ZgAb(ssk+ypeQdU-wxqTNh`M7Lg1f@b0V+o`*wjeDM$3_zTW}^Y=S|u_>@oq&i zY!o|>!wA3Agx-!uI7#+m#Xg7REpntud-K@D@uinudWqupcX;fv)jGC9@Kzgb4@FLh z-OohQP&Tm!|e!ljN;U_kFoR2+gLvRF1DX} zA1fC=!t{|hFtBnOom&oKnu0j6{TO<;9hMdocD{N8gDYn+`N|b+KlL73CifySwt`U4 zJT%TGLO~1iNNK{BD(ZaF>TIq|%RX(C=I;s=7K%_Ek_$F@s+h4A0`xpUCiBa*o0$p{Zs zqQF!gf0sCe>p&sxbGg2rLaXE$(_$09%d`R+@+(AwTQiE)78I(iD6bK9ryF4lw!#x> zhqEDp8k-+F5_6Th1_i|xQvHzB^lWJb_>tgk5G5jCS|--tP1Y4L%ACz{BlomY^Dp93_-)V{eag)8(PIt`vIyEGtVwWkoWK`h2<) zal(YKBnZz~1f!`K24ewCW|E%CWkmnXAYAbnDn%VuzYAKQ9ohyrGy*ZBVMduK?2Qem zGLzCOC{h$Gg|-^O#vru(qsw)qJ259jqLPceacT{%u8zdVrqD^t{Dm@+-IiikPV|>4 zkk5`?SW-!4W+ccX;pNe@i|h-ev}~~=!I4Vdf4h}V7Js!?agTKiQw9e)42W1 z*Rk)?J~S`w#?0k+aPHgB@X^2h9*6FJh`!?|u=4(=*ms?l=j<8uuPmWuVJkX!>_N-o zE;P>XfS=^&XzPWqtp~xDPWYlt@X;m*D0;p~1Wr+)n*vrF4#LQ$2?wAeF>Aw7R0c@S zK`)9UKIDbm$fD@wMg7QNFiDAru(2_Wb)>nCiAJQbWyGRfJhc!#TBb@0-s-V42C;f^zBT}h(=XXvoyEh!6M!$?i53Q>I&WOcjC}Cb zcX0ZxJ6Jk?6$?kMV*1eQSfcQ?FC0SO(osz8Ig81?XVJ5;AH7=+U~v0!3f((6@!prX zdjC&&`{AE4b>IrdcArPzwxdXlufP)TM{T4V#d?Rd_b!$8HH-E(wMd1W3KKGAW>S?O zvYIdqo-P>Nop3je(}KpKwj^M5^pf^E;BT76!0ZuB?l=u=bP%zj?U*2aUZw56_W7R> 
z9bUoxAO9UkuY8C)KLcMrNvv2dZJ&{8Y9Xi4>Fhy{O*fMAC|I<&n@G;jtWU$bw0z{0 zt0CiSiZ%68wwE%a4+aW$4Q;o&&MPHP$mV;*zDO}gnp;@L>yp8i&cKu+iJhp7ww8pq zhIaN@(w(6CCnWr*@baKupjwxa#bB7@bC5KRq-iLJFUA5WZYK8!Qw=g{l}DeJSiNHR+F{D%%VL zh0N(v(y|sorAUQNoe7gMpQM+CD*mjf&Y)Fh0CiQuYA=T`poFjf7WJrDe2HOBX4J9a@x@%w@|il1uYAxw$3C&nsplMdaqu+T!H4P2W@mAXqMy216@}Lhg+Uv3qiBC{pLjd_~>7>D=SOeCxCGmbLg;VVJ zE8uMr#V&)dEoe?4FVTvU)($AT+K>|oB8&I)2F8#zG>$YjL3b$>yOe=G6b^8p_4cAP zoLTdYVk*I+YAIZqVs_RwZHC_HQ(%@G7>Hwp`~Q6^(>B3+9!lp;r( z*rp;jyNazRk3A+t?{N+SksMSM?_L!=QRPg`;fbQq83<(w<}+Y#E}&+zK)-{9o+uhBMt1R)Aga9{~dlY200zYC4K2`WcKnwQJ1GNIhy zMy0s{-qs1y{vh0~w8Vja=-l0KHcr4E8-vl^1G8rcU1NtavwRV~QwI@fn}#{q1HG>k zo{kyJAASSJ-@1>XDI9nuP`NQC0ps8&iTFE)X{w4UVsvaqu2 zq!Sm5y5aK^9v!zsPLjb~Z6Plfsw5PMyY!mPXM_Eg)Ht^fDM-GKxz0b0IH6 z5i%JaviWmfQ7LIcAac31tGkSZQAWCxl@&orAyg8XwF+K*5{RiJ1Ge&P)RX%98vJOo z*1(~wBd=R~ImTa0FYgUBRhuzr)S{`7gNm z`>!#7^&0lvxQnYl{5KrB`6Z4}=+1un3C@1;00-{e#n8D+*z(r<7(4wc!qfBciN%hN zF2q_|kZ5k9=tU*rYasQx8zQh$WX#b74Dl9dB26T{7}ONR>M#QhY0wZ1!Q>0U=n8W1 zg-}83D+DelDq4{CmyJQFFzQ2bJb;RpxU~PTfS*y$&&y(qZ^PXR4$A;ifWkp^6dURru4?am8Qy$d0Dt7+hWA+6L($pfBxmd~$N(O<2nnoO}< zN8(#UpzF|I!U1eK{w8)^yoLQ&KjZt}M%%(M#K!ls?U5EbTG{Fw zhfqQx$*!!0%0_yxZGgXhQc4>i9oUZO;0huGJJ2}13nR;C__v%TWE{iJW7o0g_=jj7 z*on#Qr_nU9NYU$pxq*Th?nAJ53ug9TM#t1X$@<27rcq_EA&robnq7!Yj+>iMJkhY)^~%jxW!e35{^%bUY^p5d<|`^B}B{ZM4`z3OhU~SG+13D zIkM!8T)en032S(r#I<|+ndf*Nh0uvTl_DdKSd{vORQB4Da-^8JGI3cq{h(FDqS_GcC)#}eqJ>c)@l+f1<+Uti-&O1 z;DDjF8ftwx++HIhQ9@Qg2dh<%fWH=DYLe4N3v4Vw&{K`}goVz;0Iy4qU{HnDb_=@u zoM`Rj?+TQmsZ~XHkcP+Fhufg_TTtwEqu3v$*twzbvjqdtg}ex~xG}RdfOu~dGKyS& zC2cG%tJp+bR$WOy&OfBN2?Z*ZR46A;Yea@phh(`5n=4dEDpw+j(J84^iA~}RVwqBE zsb$qj;bSSJzEqiLmAo%QT6qO8AW5*K0tI|uA>UI#kt{4G(w51kXd(ujLN1oW3u%9N zp-FU_$~@@hIWVirF+1818wF0QsesX1cFNw5lM8955L35|KqPX_cev><}Dn%bssl=_cy%p@DF(VoA2@7Z-0xo|L`MDe({i$ z_W`!N_9lAv9YlP34lNT??7)MFHa8$b zrmB`^$ocvT3YR7jgUQS045Nm(RsImEFNorhUrHWYLGqRbTqt2jCA<0T>+TrZuWjKd;uHH29odDh`NI^;5xASj5a^ha1kcel0(;{qtg#X30(~$=hDmR8 
z=$bo(nSGbgIlUjlTTh~)ZJuog4ee8G6L5*5R^cv^VF$F{c7*%3Vr1z!5(8V&GB8Jx zGtdQpU>hrtDsm4k;|?m5mKC` zvXokzw8EUh_vFjesGv}a5~rKevL(?IN7%%5d*)fO_wwog5IbRM#X^-JJyF_ISl23o zhscmD`h-~B7xQ^yBeAfz;@F@#iBa#c!7KK_GB}%QS#{JLmCsDkGeYGykot5`JGIc* zwb0qLFxqtRg~jwm4ZH#NMfTlz+=4(rgILteG2DQzwg$8{xfqyTm>6onV7Cj2m>!9^ z0X^LgjEn@)-Rr>U1cPI<8cod#(qX0$y-pMu709;Np)}HfauQyJM>ID~q>A3LIHtF> zqp>?8t1o1-@)sZvdP&|(b*Yds^#%Q6+# zllESs&~21yD29B#oT5l!+{li;v4R6nPBD~MBb{G~3twO1~3e}o2sH+R$tk+^_u!93e3w<5GOrrAm)-go-7|oruZEg$ZrhD-9 zH`npyckg5Cp)oYiHeu=FUfloNAMp9#{(?8Z{TlCm`)hpi!=G^P`@iDWH~)a^-~Ar9 z|LHII=+A$}weNn1i=REju1i-jxML4GX6Df`Hi6du0VFy)(A3tBP@;vz(E?jbH}q}& zQ1p(&w0r=T!{<@8vLEuLB@|6hK)JLHRZ}yNwf8|5ZiSlGteUrFp*YI;y_O35>vM7N zcu9axC_PpZqL_YgL(QP1%YF_`W)6nxeeU1x39BwIL(?ZS|8BE$vfsavI4% z?7T0Rjzv`}G?L(nxjL2J#3)Z2YqCOXFiSSqY^jIM<%OM;Cl)*EJz-cV>?*5Y61tw5 zo%}yX(LKErbNkLBKC~5c2QQ&_@i@+X^lQ9w^>duR{XKs3Z~s>ky}D4BlnUP6NU>`f zKz+C!(cUSznulNt^}tEn87JAz9eW!)FMf<|r$0dV(rNaYqqMra(7ojZVnaI->{{S^ z#wmEC6uc4C(b}5A1Je7$gWGAX&th=t7#c^mqkVE0Jc%*bNOY~k+n}|F**7Bywhxo^ zT4C_FKNHI45itFLQ&sEQHOkG9STT!Srki2^f(rUT}~E@O0>2j4V5@qBk~=J z(-vY=un;@3SS7?yx+Uoq<`+m=mc>a6adJW=oDiZXHZ@mN5MokNrQ>UM23TJx02hO? 
zo&DD6wLl-VQuIi99zE1-8jlb?4YX`VmyW?-2b;GBZoirEU`Kn02hnH^A|WjXdcD{- z--D5X7-lCDSl-fsm94E9=(3`t*@*V$8f=+QU}7SI$;lXYtnhp6x1ps)gTcXCJmv@y z%{BD&K0ArRiz1Sn+${!@pBK)JiTPHHPl-u+8%ju2#fmCa>h%hTJ#csQp@xE|9hgS#%3;)>egpcmZ=&kNB~%_gfvO`%U_5&s zrOUgJ-aCrSwgJfcCLr%0N9FJssz-)V*+vqi;At3y^ltuNw4M3@EsBeF%2&^yt^D5k zI3H7aNk)DXU#Fo}U?8xPc5CZlFq;vM*T550K%o;y1eM67;3Zd+tSd=a6?w=lNP$LK z1W%nBVW%0bAs;$PkXl|_vAI;N#EDKVCaFb47pa7vdG-a89<2|>P0-t;_DJkjZSGUj zoZo*Hy=SF4qBYV4d0LvNw=9xZh-4AsG>(>*wS!j3g=(FNl*fTV_aJJq3bAwff^d;s z{p?eAUzCK`08e9=ls5hFrT4LT_!1($vxxQ2V_@q+xZB3y@1Di(S8w3d^{;U0)8Avy zl~3^SFaI0we(`(h?5sW74R@>?_D~xgK{uKQXZamALQml-uZ_UfK1(~hAKTAfm+bDG zu)6an(KL0Gw6_l}lLtvo%rg{pJkm z-uI>KSnjS66CR_g?;;M0%p0v6?PRBp%Jp6Lan2 zNTQgIDv?AlpMoedo`_@x!B7Ld4Sv{sPSgjSF!@REEOBV3*{b1isc3C$5RVzqnyA6VNC2aQehl||u(Z&GgL}HMb1{ytRtlIk!$KM`qSR|e9y@z6BfP@npm^CCIU-ox z(TCxY2Ix)n=c2N`NG#gYNU_t9xKv1|XP1kDI`s1X_6~Lu3(_e>n@NQ0lq%#oJjkqf zBGcwUK5dA;eGr?}7HlZhVpEwAJ~b&%2%d(dr$tJo9w~AIQWa(-D{A@LdZgC4u|aD^ z645=)XhVwHg!~!@(p2m$q=4euT2#}{_Vza8^6A5f`9084Y;{Hp+FRooogC!<b(9Od080#ikKt=e0u>u#x1f9GHxnP6{BY*5WoJ7PG<|)WYObLs46a z8gmV5YcyzPXR%oFA+IV#nX-~LrxaO=5{dBgOEOSWkPNF{hLBCqX2ftuGlEV#WYU3g zk;X^V!B`bO5zU1yeeU_S6tY!;6J;L7;)SrgkD6G-kTR)I>;yH4ChU?Ry%%0sD{Xoe z89&7Ce_>t4O07a;K%%_|+M3l2+B#FcboxR|t7~=A_C}i!inq}EBuD~L+NDO)(j3Wd z2Tr~9F;2aGN2-6(KCuG}`_H3kWD)MxF)SQ@4fBUz$KvVtFn#DX9C+hn{PX_-2DTkQ zplui~il!}?K+pIVs4b*xg99ZxCyI5nf+E9lY)E>4+srFiJoz3Lj$g;{?l%w`+>Q2` zBWN7n2`{P6-!a4c%NSTZipjm_vE|5Xe0~pu?l5g@E0m^`M*(kWo&j zlUIVIf^sP_u07rjgSQpdNIz=49fEr3Dwmi1CkQ?1qJ_ECxs#>npm;TOwWWy ztwt~yM659`k)AZ4#{g{()I&v@Q+X(O6g!o(noY^RtU;~ci29%jW(r_K+=e#V+Kx^S z5(zPfr$juWL~myet*wqCr^5K42@BKpn4hwsx1$Op{YLEE-iX<$5H4R>!NN=%mbdlc z!Ws4t3gDqVgLtgo?0?aX`yHVYlC(S(FRV=H1h-NAg|1k5)G6oiV$t5 z?Naj}mXi>Px7ozo%u-qt(v`on9U8wEQ%4V@ZFxJgZ4{|mD^lz1DDZ}nVs_z0nI0QU zb=Xuwn_EV~t;gA3Qm-z0-yZHD%3GkD+f;v ztKZ;*JNNP0yVtS*)EUg~Si!*bG&)Cz(K67FSWho}?cH#7(;{_`qJCjF8ZW&E_vH^z za^O62_MAlewtXmnZjo0fMXXV)$iY@J78e-A}3OpEKG 
z6>#(aaH866LlrH8oFXT#m5%i12)HmaGl196@51>rizE+PUb_TWag(LCp!s>(hP*e2@KlbC`ELnyA5WY4#kBf5Ebf+rSrBj6j1Q8 zvU8->SV48d){4aov4|m7ZiUzhkrS29M0?`Nrzm!U@K!goihW*>QXGlYMnd#N>T$6x zrAn=Xg+VfsXot>hgUV1RH7$j&&f$gC<)_#+vNfTjZxo}m+mYxU$Bx72vF+eF>^Sin zu6_C~h87P>MS%94zJ}R7r=;p<;htG(Zf|VYY3zFS11z3;8&~f98n=J_7usP4CVv7k z+CzQ4Pii__kVIrfp{gE*8YhY@yxmu{Cv%NJWByf z5%~x67@Sfvu;%5Bct)IgUYoR{Y&geUgli7APv zdF>dy#1S(#A$&i1{BgXn_C-nH)LJ^MqE*3LE&W-<{y`@|+RMqwMLEgOUTkkWi>p`8@Mj@|*hN7qU=@9J#5J@)So;lzT5wc)%t{cOnO>j7>QBj@_C;!lJlY@Vh3&rY6 zUMM*-%Sw=5T!0Lb`@Xu8Rw01sU>AA z&U-g;{LHJ^y0TNM9@aB9N{ic%M$%rSyAPhuKA1cDVH%l%ZTBJAPP_`;$qTf%uOfBZ zP897u3ONUB`j&kt+)Af}y$?5mya-41E1o?#lb&>?u5Xa$?t}m?9K3)@3e*1AKgK7&`4=o5xrk8P5RCR9 zG-iIsHFmlf6J&H3rD`k6bo|cig0!C$-f*|HL0PQe&L4jl2VTF6?Wf+w{J~4udHO91 z?^TSioIu;;3Wl~HWxt$5ZMX+zCO^{34Je?HR@M5@HMxT{-Yg{=6&s7wN)(WroM;`{ zg7fd*#pKRY@U~8(b!;d5(j0X5800z|((`C(le3VLo`bb7C*kF!RIH=b-IP@*wWron z;9lA&)dv=}N?v?iF*~ln;=BPnmu%R(62y`HUAT66JKns!8z&A-;f?dl zIJ~n5vx5#iR;FZUR#l=d;DE1{irg8cRp>&`P!vPMyx$XrQjrgZJOf>wUM%k(g_%g2 zQBi=bDjCtd2x&z*C=%IF^a@lMsxY-{0jJ)74Z)FaWYAg_a8P>2CZSGr;6=HPf@Z=7 zM!Zde^2)0zdMcz)_|jxrqzloL*Wks98az{7hnL+^lnl+Fl&xZL7U}FLo6P}a)iuD- zK8Di9KIHqu*mi0ku6=S1{adCw?8BOeRdC@+<$;i zKKl}%+P9@?f(*xR~LL-Es%Ou)2#1+|Bc zLQhLvbMh=qr(T8O&{<^Dn&mGag!0s7RFd@acI-yc)GYE_+ffpr;Q4$cIuf{(wt$o; zL{COro6p-!vEm+YBu%vA=!q44`1Vm8JUWZ2Mhi-Odi1vVFx3@?(Z%4SFGGs9niO4u z)bes{&dEZ0PCD&K7HqXD_-rISPc7!A24NEWE=lkCw6(dkw}oXD{JD%&n1z?tzQBR- z^Z#766?^lfGq%5wwl+N4-XPk46+KbzQB=cEN=lM8HD^-rq_poMHI%Uy2D*Xv-eH(* z3|hQzbvM8hN=Qis=C|)5*=@z#(k?VHkO+GlZRwFVB!}Aukr z$rVPq+6;|I7;CUeTRUo8QAB#CFn8p2%pJXo*FXIZ_kQ>*ntCUt*O}-XDm7;4YMm&r zHc6YE#e0evgv+#csI3%F3SVMs4`P#hFw5U_ zw|N}#o*C&ZZ$8J$MiQP#w=Yt0tzEwfn=^7(tt`cF&H7|KPcht>oQ7n6$D$_2DqU0b zD1MJ}9g4*YkyTluJBr;Sx)aw-%-ubjzZ0S-9V`6#-y=0W4f3i=iS$J3Cn=9zIr{_! 
zuTKY+%l*sB^wAmBqFo2Vz@S^0Q9 zf(aX<6u=1Yw{?3kIuXP0h#y@YW{eEg)3!P>+;70-h>2pSCBT}#W1_2m9Nhvs^=h`LYG>Whvb5EzBj|E~#!kMQ($vnoWtMifW3UiUU!NbXs2_dYjq7lB_|b#5$1H z){hKEbj{=vvi+?jz#x)sAt?K&k;#r;Gd+iMq_yK$kHg*30+GJT;0a=Q>k>lU-4r`^ zvTzKBmKF>yO=Ea*3N6z!h>wk7{*@&>{KG?h`G;@t-rZZc_4%h1wfngH#doy2-{RK8 zhxp`|_i^vRJ$&)yeNrCFq{yX9pVjS<#kO z1e#mnXla9~y#t2+0ciP|%83b7&&)zKvj9CUS_#FxbbbZpJNKht*B%rslJffcq>~qN zA8Ch{R^h0l=+)8UT2M**S;n7p-7ciYT2VPPimn}VIC3az~+y@6>5amY=^$q0Z|LL z+DNh_fhA2!%e02UMl?a-q5zZFe~E_ z>{tq6eAtYs5ep^;Yp^(L!GT>KY?-QNQ%VB3LYsT~Xd5n{8^xy|y@FfstzgSIU*Ds^ cXh#YDAG?u@-rPidr2qf`07*qoM6N<$g2t{?CjbBd diff --git a/docs/source/tutorial_hello_world.rst b/docs/tutorial_hello_world.rst similarity index 100% rename from docs/source/tutorial_hello_world.rst rename to docs/tutorial_hello_world.rst diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp index 6de1daa3d45..498eda2442f 100644 --- a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#include "common.hpp" diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index b9ecf2ac20a..f57fe033574 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -74,8 +74,8 @@ using DeviceConvFwdInstance = 8, // BBlockTransferSrcScalarPerVector 8, // BBlockTransferDstScalarPerVector_BK1 true, // BBlockLdsExtraN - 1, - 1, + 4, + 2, S<1, 32, 1, 8>, 8>; diff --git a/example/40_conv2d_fwd_quantization/CMakeLists.txt b/example/40_conv2d_fwd_quantization/CMakeLists.txt index c3540d6ee6c..0a314cd74c2 100644 --- a/example/40_conv2d_fwd_quantization/CMakeLists.txt +++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt @@ -14,3 +14,8 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_in add_example_executable(example_conv2d_fwd_dl_bias_relu_perchannel_quantization_int8 conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp) add_example_executable(example_conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp) +# Conv + bias + tanh perlayer quantization +add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp) + +# Conv + bias + tanh perchannel quantization +add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp) diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp index df10e803963..5c445d9c50b 100644 --- 
a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perchannel_quantization_int8.cpp @@ -76,6 +76,10 @@ using DeviceGroupedConvNDFwdInstance = 5, // CThreadTransferSrcDstVectorDim 4>; // CThreadTransferDstScalarPerVector -#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc" +#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc" -int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); }; +int main() +{ + const auto out_element_op = OutElementOp{ActivationOp{}}; + run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op); +}; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp index 18f9197b9cb..0ff85f008fa 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_relu_perlayer_quantization_int8.cpp @@ -74,6 +74,11 @@ using DeviceGroupedConvNDFwdInstance = 5, // CThreadTransferSrcDstVectorDim 4>; // CThreadTransferDstScalarPerVector -#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc" +#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc" -int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); } +int main() +{ + float requant_scale = 0.5f; + const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; + run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp new file mode 100644 index 00000000000..f8f996d17e8 --- /dev/null +++ 
b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using RequantScaleDataType = float; +using AccDataType = int32_t; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::TanH; +using OutElementOp = + ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< + NDimSpatial, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + AccDataType, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 16, // K0PerBlock + 4, // K1 + 4, // M1PerThread + 4, // N1PerThread + 1, // KPerThread + S<8, 2>, // M1N1ThreadClusterM1Xs + S<8, 2>, // M1N1ThreadClusterN1Xs + S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1 + S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1 + S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder + S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder + S<4, 1, 1, 4>, // 
ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 + S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder + S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 + S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1 + S<2, 1, 128, 1>, // BBlockTransferThreadClusterLengths_K0_N0_N1_K1 + S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder + S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder + S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 + S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder + S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 + S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder + 5, // CThreadTransferSrcDstVectorDim + 4>; // CThreadTransferDstScalarPerVector + +#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc" + +int main() +{ + float scale_z_inv = 0.5f; + const auto out_element_op = OutElementOp{scale_z_inv, ActivationOp{}}; + run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op); +}; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp new file mode 100644 index 00000000000..3b25fec0c4a --- /dev/null +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using AccDataType = int32_t; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::TanH; +using OutElementOp = ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< + NDimSpatial, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + AccDataType, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 16, // K0PerBlock + 4, // K1 + 4, // M1PerThread + 4, // N1PerThread + 1, // KPerThread + S<8, 2>, // M1N1ThreadClusterM1Xs + S<8, 2>, // M1N1ThreadClusterN1Xs + S<8, 1, 1, 4>, // ABlockTransferThreadSliceLengths_K0_M0_M1_K1 + S<2, 1, 128, 1>, // ABlockTransferThreadClusterLengths_K0_M0_M1_K1 + S<1, 2, 0, 3>, // ABlockTransferThreadClusterArrangeOrder + S<1, 2, 0, 3>, // ABlockTransferSrcAccessOrder + S<4, 1, 1, 4>, // ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 + S<1, 2, 0, 3>, // ABlockTransferSrcVectorTensorContiguousDimOrder + S<1, 1, 1, 4>, // ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 + S<8, 1, 1, 4>, // BBlockTransferThreadSliceLengths_K0_N0_N1_K1 + S<2, 1, 128, 1>, // 
BBlockTransferThreadClusterLengths_K0_N0_N1_K1 + S<1, 2, 0, 3>, // BBlockTransferThreadClusterArrangeOrder + S<1, 2, 0, 3>, // BBlockTransferSrcAccessOrder + S<4, 1, 1, 4>, // BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 + S<1, 2, 0, 3>, // BBlockTransferSrcVectorTensorContiguousDimOrder + S<1, 1, 1, 4>, // BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 + S<0, 1, 2, 3, 4, 5>, // CThreadTransferSrcDstAccessOrder + 5, // CThreadTransferSrcDstVectorDim + 4>; // CThreadTransferDstScalarPerVector + +#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc" + +int main() +{ + float scale_acc = 0.5f; + float scale_z_inv = 0.5f; + const auto out_element_op = OutElementOp{scale_z_inv, scale_acc, ActivationOp{}}; + run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp index afff7f8b69a..a98a1e240bc 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perchannel_quantization_int8.cpp @@ -76,4 +76,8 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perchannel_quantization_example.inc" -int main() { run_conv2d_fwd_perchannel_quantization_example(); } +int main() +{ + const auto out_element_op = OutElementOp{ActivationOp{}}; + run_conv2d_fwd_perchannel_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp index a38fe2a6c30..262594d58b3 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_dl_perlayer_quantization_int8.cpp @@ -71,4 +71,9 @@ using DeviceGroupedConvNDFwdInstance = #include 
"run_conv2d_fwd_perlayer_quantization_example.inc" -int main() { run_conv2d_fwd_perlayer_quantization_example(); } +int main() +{ + float requant_scale = 0.5f; + const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; + run_conv2d_fwd_perlayer_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp index ba6990d9383..6b22055053d 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp @@ -80,6 +80,10 @@ using DeviceGroupedConvNDFwdInstance = S<1, 64, 1, 4>, 8>; -#include "run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc" +#include "run_conv2d_fwd_bias_perchannel_quantization_example.inc" -int main() { run_conv2d_fwd_bias_relu_perchannel_quantization_example(); }; +int main() +{ + const auto out_element_op = OutElementOp{ActivationOp{}}; + run_conv2d_fwd_bias_perchannel_quantization_example(out_element_op); +}; diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp index 690d70e1127..1ac86797437 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp @@ -78,6 +78,11 @@ using DeviceGroupedConvNDFwdInstance = S<1, 64, 1, 4>, 8>; -#include "run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc" +#include "run_conv2d_fwd_bias_perlayer_quantization_example.inc" -int main() { run_conv2d_fwd_bias_relu_perlayer_quantization_example(); } +int main() +{ + float requant_scale = 0.5f; + const auto out_element_op = 
OutElementOp{requant_scale, ActivationOp{}}; + run_conv2d_fwd_bias_perlayer_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp index dd755ff0650..f28abe5ebc9 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perchannel_quantization_int8.cpp @@ -80,4 +80,8 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perchannel_quantization_example.inc" -int main() { run_conv2d_fwd_perchannel_quantization_example(); } +int main() +{ + const auto out_element_op = OutElementOp{ActivationOp{}}; + run_conv2d_fwd_perchannel_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp index 48617e4775e..f468e8adcde 100644 --- a/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp +++ b/example/40_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp @@ -75,4 +75,9 @@ using DeviceGroupedConvNDFwdInstance = #include "run_conv2d_fwd_perlayer_quantization_example.inc" -int main() { run_conv2d_fwd_perlayer_quantization_example(); } +int main() +{ + float requant_scale = 0.5f; + const auto out_element_op = OutElementOp{requant_scale, ActivationOp{}}; + run_conv2d_fwd_perlayer_quantization_example(out_element_op); +} diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc similarity index 98% rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc rename to 
example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc index 822a1ed8b5f..1587c614da8 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perchannel_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perchannel_quantization_example.inc @@ -167,7 +167,7 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 0 : 1); } -int run_conv2d_fwd_bias_relu_perchannel_quantization_example() +int run_conv2d_fwd_bias_perchannel_quantization_example(const OutElementOp& out_element_op) { bool do_verification = true; bool time_kernel = true; @@ -189,7 +189,6 @@ int run_conv2d_fwd_bias_relu_perchannel_quantization_example() const auto in_element_op = InElementOp{}; const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{ActivationOp{}}; using InLayout = ck::tensor_layout::convolution::GNHWC; using WeiLayout = ck::tensor_layout::convolution::GKYXC; diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc similarity index 98% rename from example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc rename to example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc index 00cbaa09ee8..455e0804d4b 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_relu_perlayer_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_bias_perlayer_quantization_example.inc @@ -155,7 +155,7 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 
0 : 1); } -int run_conv2d_fwd_bias_relu_perlayer_quantization_example() +int run_conv2d_fwd_bias_perlayer_quantization_example(const OutElementOp& out_element_op) { bool do_verification = true; bool time_kernel = true; @@ -177,7 +177,6 @@ int run_conv2d_fwd_bias_relu_perlayer_quantization_example() const auto in_element_op = InElementOp{}; const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{0.5f, ActivationOp{}}; using InLayout = ck::tensor_layout::convolution::GNHWC; using WeiLayout = ck::tensor_layout::convolution::GKYXC; diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc index 2e0623028d1..8e75c27746a 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perchannel_quantization_example.inc @@ -157,7 +157,7 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 
0 : 1); } -int run_conv2d_fwd_perchannel_quantization_example() +int run_conv2d_fwd_perchannel_quantization_example(const OutElementOp& out_element_op) { bool do_verification = true; bool time_kernel = true; @@ -179,7 +179,6 @@ int run_conv2d_fwd_perchannel_quantization_example() const auto in_element_op = InElementOp{}; const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{ActivationOp{}}; using InLayout = ck::tensor_layout::convolution::GNHWC; using WeiLayout = ck::tensor_layout::convolution::GKYXC; diff --git a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc index aeccb30cf22..926c033c58d 100644 --- a/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc +++ b/example/40_conv2d_fwd_quantization/run_conv2d_fwd_perlayer_quantization_example.inc @@ -139,7 +139,7 @@ bool run_grouped_conv_fwd(bool do_verification, return (pass ? 
0 : 1); } -int run_conv2d_fwd_perlayer_quantization_example() +int run_conv2d_fwd_perlayer_quantization_example(const OutElementOp& out_element_op) { bool do_verification = true; bool time_kernel = false; @@ -161,7 +161,6 @@ int run_conv2d_fwd_perlayer_quantization_example() const auto in_element_op = InElementOp{}; const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{0.5f, ActivationOp{}}; using InLayout = ck::tensor_layout::convolution::GNHWC; using WeiLayout = ck::tensor_layout::convolution::GKYXC; diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt index c3b7b825920..a9990c5d890 100644 --- a/example/42_groupnorm/CMakeLists.txt +++ b/example/42_groupnorm/CMakeLists.txt @@ -1 +1,2 @@ -add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp) +add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) +add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp) diff --git a/example/42_groupnorm/common.hpp b/example/42_groupnorm/common.hpp new file mode 100644 index 00000000000..e159abf3e94 --- /dev/null +++ b/example/42_groupnorm/common.hpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" diff --git a/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp new file mode 100644 index 00000000000..b07a26c4c93 --- /dev/null +++ b/example/42_groupnorm/groupnorm_sigmoid_mul_fp16.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; + +struct YElementOp +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(ck::is_same::value || ck::is_same::value || + ck::is_same::value, + "Data type is not supported by this operation!"); + + T a; + + ck::tensor_operation::element_wise::Sigmoid{}(a, x); + + y = x * a; + }; +}; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +#include "run_groupnorm_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_swish_fp16.cpp b/example/42_groupnorm/groupnorm_swish_fp16.cpp new file mode 100644 index 00000000000..c52243bfb0c --- /dev/null +++ b/example/42_groupnorm/groupnorm_swish_fp16.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using ComputeDataType = float; +using YElementOp = ck::tensor_operation::element_wise::Swish; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +#include "run_groupnorm_example.inc" + +int main(int argc, char* argv[]) { run_groupnorm_example(argc, argv); } diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/run_groupnorm_example.inc similarity index 54% rename from example/42_groupnorm/groupnorm_sigmoid_fp16.cpp rename to example/42_groupnorm/run_groupnorm_example.inc index 35c7c054e05..bd7eb98ca0f 100644 --- a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp +++ b/example/42_groupnorm/run_groupnorm_example.inc @@ -1,80 +1,15 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
-#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/utility/reduction_enums.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" - -#include "ck/library/utility/fill.hpp" -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" - -constexpr int Rank = 5; -constexpr int NumReduceDim = 3; - -using XDataType = ck::half_t; -using GammaDataType = ck::half_t; -using BetaDataType = ck::half_t; -using YDataType = ck::half_t; -using ComputeDataType = float; - -struct YElementOp -{ - template - __host__ __device__ void operator()(T& y, const T& x) const - { - static_assert(ck::is_same::value || ck::is_same::value || - ck::is_same::value, - "Data type is not supported by this operation!"); - - T a; +#pragma once - ck::tensor_operation::element_wise::Sigmoid{}(a, x); - - y = x * a; - }; -}; - -using DeviceInstance = - ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector - -int main(int argc, char* argv[]) +int run_groupnorm_example(int argc, char* argv[]) { - ck::index_t N = 2; - ck::index_t H = 32; - ck::index_t W = 32; - ck::index_t G = 32; - ck::index_t C = 30; + ck::index_t N = 32; + ck::index_t H = 16; + ck::index_t W = 16; + ck::index_t G = 64; + ck::index_t C = 128; if(argc == 1) { diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 1d5c2a818f8..67ed45fc51a 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -169,6 +169,11 @@ // flag to enable (1) or disable (0) the debugging output in some kernels #define DEBUG_LOG 0 +// denorm test fix, required to work around dissue +#ifndef CK_WORKAROUND_DENORM_FIX +#define 
CK_WORKAROUND_DENORM_FIX 0 +#endif + namespace ck { enum struct InMemoryDataOperationEnum diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp index 1e2f81915d9..1cc30fd9e6a 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 2ded6d2eeea..45e05060115 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -588,7 +588,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static bool IsSupportedArgument(const Argument& arg) { - if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102") + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == "gfx1102") { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index deb6aec29fd..29c585c8076 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -335,7 +335,10 
@@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD && ABlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && ABlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector load of B + if constexpr(is_same_v && BBlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && BBlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector load of Ds + // only support RowMajor for now + bool all_valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + + if constexpr(!is_same_v) + { + all_valid = false; + } + }); + + if(!all_valid) + { + return false; + } + + // check vector store of E + // only support RowMajor for now + if constexpr(is_same_v) + { + if(arg.NRaw_ % CDEShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } + } + else + { + return false; + } + } return GridwiseOp::CheckValidity(arg.a_grid_desc, arg.b_grid_desc, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index d083a709338..3814a559944 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -303,7 +303,10 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm && ABlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && ABlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous +
if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector load of B + if constexpr(is_same_v && BBlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && BBlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector store of C + // only support RowMajor for now + if constexpr(is_same_v) + { + if(arg.NRaw_ % CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } + } + else + { + return false; + } + } + return GridwiseGemm::CheckValidity(arg.a_grid_desc_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp index 8de81285d9a..7bab2d0408f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once diff --git a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp index 7ea09a22204..fefa6c793f7 100644 --- a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp @@ -7,10 +7,30 @@ namespace ck { namespace tensor_operation { namespace element_wise { +// Y = Sy * Qy +// W = Sw * Qw +// X = Sx * Qx +// B = Sb * Qb = Sw * Sx * Qb +// Where X, W, Y are float32, Qx, Qw, Qy are int8 +// Sx, Sw, Sy are scale of x, w, y (float32), which is calculated from quantization range +// Qb is int32, scale of B is Sw * Sx for convenience + +// Y = W @ X, where @ is convolution or matrix multiplication +// Sy * Qy = Sw * Qw @ Sx * Qx +// Qy = [(Sw*Sx)/Sy] * Qw @ Qx + // For Activation function which is piecewise linear function, such as relu, leaky relu ...etc +// Activation(Sy * Qy) = Sy * Activation(Qy) template struct Activation_Mul_Clamp { + // Convolution + Activation (piecewise linear function) + // If an activation is piecewise linear function, then Activation(Sy * Qy) = Sy * Activation(Qy) + // Z = Activation(Y) = Activation(W @ X) + // Sz * Qz = Activation(Sy * Qy) + // Qz = Sy / Sz * Activation(Qy) = (Sw * Sx / Sz) * Activation(Qw @ Qx) + + // requantScale_ = Sw * Sx / Sz Activation_Mul_Clamp(float requantScale, Activation activationOp) : requantScale_(requantScale), activationOp_(activationOp) { @@ -45,8 +65,39 @@ struct Activation_Mul_Clamp Activation activationOp_; }; +// For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc +// If an activation is not piecewise linear function +// then Activation(Sy * Qy) != Sy * Activation(Qy) +template +struct Mul_Activation_Mul_Clamp +{ + // Convolution + Activation (non piecewise linear function) + // Z = Activation(Y) = Activation(W @ X) + // Sz * Qz = Activation(Sy * Qy) + // Qz = S1 * Activation[Sacc * (Qw @ Qx)]
+ // Where S1 = 1 / Sz, Sacc = Sw * Sx + Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp) + : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp) + { + } + + __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const + { + float y_fp32 = ck::type_convert(x); + y_fp32 = scaleAcc_ * y_fp32; + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + float scale_z_inv_; + float scaleAcc_; + Activation activationOp_; +}; + // Conv Perchannel quantization + Activation function which is piecewise linear function, such as // relu, leaky relu ...etc +// Activation(Sy * Qy) = Sy * Activation(Qy) template struct Activation_Mul2_Clamp { @@ -76,9 +127,20 @@ struct Activation_Mul2_Clamp }; // For Activation function which is piecewise linear function, such as relu, leaky relu ...etc +// Activation(Sy * Qy) = Sy * Activation(Qy) template struct Add_Activation_Mul_Clamp { + // Convolution + bias + // Let Bias = B = Sw * Sx * Qb + // Where Qb is int32 + // Y = W @ X + B + // Sy * Qy = Sw * Qw @ Sx * Qx + Sw * Sx * Qb + // Qy = [(Sw*Sx)/Sy] * (Qw @ Qx + Qb) + + // For activation, Z = Activation(Y) + // Sz * Qz = Activation(Sy * Qy) + // Qz = Sy / Sz * Activation(Qy) = [(Sw*Sx)/Sz] * Activation(Qw @ Qx + Qb) Add_Activation_Mul_Clamp(float requantScale, Activation activationOp) : requantScale_(requantScale), activationOp_(activationOp) { @@ -139,11 +201,18 @@ struct Add_Activation_Mul2_Clamp }; // For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc +// If an activation is not piecewise linear function +// then Activation(Sy * Qy) != Sy * Activation(Qy) template struct Add_Mul_Activation_Mul_Clamp { - Add_Mul_Activation_Mul_Clamp(float requantScale1, float requantScale2, Activation activationOp) - : requantScale1_(requantScale1), requantScale2_(requantScale2), activationOp_(activationOp) +
// Convolution + Activation (non piecewise linear function) + // Z = Activation(Y) = Activation(W @ X + B) + // Sz * Qz = Activation(Sy * Qy) + // Qz = S1 * Activation[Sacc * (Qw @ Qx + Qb)] + // Where S1 = 1 / Sz, Sacc = Sw * Sx + Add_Mul_Activation_Mul_Clamp(float scale_z_inv, float scaleAcc, Activation activationOp) + : scale_z_inv_(scale_z_inv), scaleAcc_(scaleAcc), activationOp_(activationOp) { } @@ -151,14 +220,64 @@ struct Add_Mul_Activation_Mul_Clamp operator()(int8_t& y, const int32_t& x, const int32_t& bias) const { float y_fp32 = ck::type_convert(x + bias); - y_fp32 = requantScale1_ * y_fp32; + y_fp32 = scaleAcc_ * y_fp32; + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + __host__ __device__ constexpr void + operator()(int32_t& y, const int32_t& x, const int32_t& bias) const + { + // CAUTION - We might type_convert to int8 in threadwise copy + // eg. GridwiseGemmDlMultipleD_km_kn_mn + float y_fp32 = ck::type_convert(x + bias); + y_fp32 = scaleAcc_ * y_fp32; activationOp_(y_fp32, y_fp32); - y_fp32 = math::clamp(requantScale2_ * y_fp32, -128.f, 127.f); + y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + float scale_z_inv_; + float scaleAcc_; + Activation activationOp_; +}; + +// Conv Perchannel quantization + Activation function which is non piecewise linear function, +// such as TanH, Sigmoid ...etc +// If an activation is not piecewise linear function +// then Activation(Sy *Qy) != Sy * Activation(Qy) +template +struct Add_Mul2_Activation_Mul_Clamp +{ + Add_Mul2_Activation_Mul_Clamp(float scale_z_inv, Activation activationOp) + : scale_z_inv_(scale_z_inv), activationOp_(activationOp) + { + } + + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const + { + float y_fp32 = ck::type_convert(x + bias); + y_fp32 = scaleAcc * y_fp32; + activationOp_(y_fp32,
y_fp32); + y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f); y = ck::type_convert(y_fp32); } - float requantScale1_; - float requantScale2_; + __host__ __device__ constexpr void + operator()(int32_t& y, const int32_t& x, const int32_t& bias, const float& scaleAcc) const + { + // CAUTION - We might type_convert to int8 in threadwise copy + // eg. GridwiseGemmDlMultipleD_km_kn_mn + float y_fp32 = ck::type_convert(x + bias); + y_fp32 = scaleAcc * y_fp32; + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(scale_z_inv_ * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + float scale_z_inv_; Activation activationOp_; }; diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 6b4df3b60e3..2987def02a6 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -316,8 +316,36 @@ struct Sigmoid y = 1 / (ck::type_convert(1) + exp(-x)); }; +}; - int32_t divider_ = 1; +struct TanH +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = ck::math::tanh(x); + }; +}; + +struct Swish +{ + Swish(float beta = 1.0f) : beta_(beta) {} + + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = x / (ck::type_convert(1) + ck::math::exp(-beta_ * x)); + }; + + float beta_ = 1.0f; }; } // namespace element_wise diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 2ebd3b308c7..c332625e802 100644 ---
a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -60,7 +60,8 @@ __global__ void const C0MatrixMask c0_matrix_mask, const Block2CTileMap block_2_ctile_map) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; const index_t num_blocks_per_batch = diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 632c6539110..5e0f6134095 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -685,6 +685,15 @@ struct GridwiseGemmMultipleD_Wmma } // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + if(!(a_grid_desc.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB && + b_grid_desc.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB && + e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB)) + { + return false; + } + return true; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp index da0b0cea241..98a71a7c247 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -92,6 +92,17 @@ struct GridwiseGemmMultipleD_xdl_cshuffle using GridwiseGemmPipe = remove_cvref_t())>; + // denorm test fix, required to work around fp16 mfma issue + // we convert fp16->fp32->bf16 and execute bf16 mfma instruction + // when mfma is fixed, remove this section and update + // ABDataTypeAdjusted -> ABDataType throughout this file +#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__) + using ABDataTypeAdjusted = + conditional_t, ck::bhalf_t, ABDataType>; +#else + using ABDataTypeAdjusted = ABDataType; +#endif + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() { // A matrix in LDS memory, dst of blockwise copy @@ -397,7 +408,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, ABDataType, - ABDataType, + ABDataTypeAdjusted, decltype(a_grid_desc_ak0_m_ak1), decltype(a_block_desc_ak0_m_ak1), ABlockTransferSrcAccessOrder, @@ -428,7 +439,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, ABDataType, - ABDataType, + ABDataTypeAdjusted, decltype(b_grid_desc_bk0_n_bk1), decltype(b_block_desc_bk0_n_bk1), BBlockTransferSrcAccessOrder, @@ -458,11 +469,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle // sanity check constexpr index_t KPack = math::max(math::lcm(AK1, BK1), - MfmaSelector::selected_mfma.k_per_blk); + MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< BlockSize, - ABDataType, + ABDataTypeAdjusted, AccDataType, decltype(a_block_desc_ak0_m_ak1), decltype(b_block_desc_bk0_n_bk1), @@ -480,10 +491,11 @@ struct GridwiseGemmMultipleD_xdl_cshuffle a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); auto a_block_buf = make_dynamic_buffer( - static_cast(p_shared),
a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + static_cast(p_shared), + a_block_desc_ak0_m_ak1.GetElementSpaceSize()); auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + a_block_space_size_aligned, + static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_bk0_n_bk1.GetElementSpaceSize()); constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index d77cb969ead..f0acecda0c1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index d929dbebc20..a181b8016c9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -475,6 +475,13 @@ struct GridwiseGemm_Wmma } // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + if(!(a_grid_desc.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB && + b_grid_desc.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB)) + { + return false; + } return true; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp index 126887cbacd..2da92466b51 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp @@ -1,5 +1,5 
@@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -166,15 +166,12 @@ __global__ void const CBlockClusterAdaptor c_block_cluster_adaptor) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) - constexpr index_t shared_block_size = - GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); - - __shared__ FloatAB p_shared_block[shared_block_size]; + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; GridwiseGemm::template Run(p_a_grid, p_b_grid, p_c_grid, - p_shared_block, + p_shared, a_b_k0_m_k1_grid_desc, b_b_k0_n_k1_grid_desc, c_grid_desc_mblock_mperblock_nblock_nperblock, @@ -183,16 +180,16 @@ __global__ void c_element_op, c_block_cluster_adaptor); #else - ignore = p_a_grid; - ignore = p_b_grid; - ignore = p_c_grid; - ignore = a_b_k0_m_k1_grid_desc; - ignore = b_b_k0_n_k1_grid_desc; - ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; - ignore = a_element_op; - ignore = b_element_op; - ignore = c_element_op; - ignore = c_block_cluster_adaptor; + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_b_k0_m_k1_grid_desc; + ignore = b_b_k0_n_k1_grid_desc; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = c_block_cluster_adaptor; #endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } @@ -264,6 +261,16 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight using GridwiseGemmPipe = remove_cvref_t())>; + // denorm test fix, required to work around fp16 mfma issue + // we convert fp16->fp32->bf16 and execute bf16 mfma instruction + // when mfma is fixed, remove this section and update + // FloatABAdjusted -> FloatAB throughout this file +#if CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__) + using FloatABAdjusted =
conditional_t, ck::bhalf_t, FloatAB>; +#else + using FloatABAdjusted = FloatAB; +#endif + // M0/M1/M1Padding static constexpr auto M1PerBlock = Number{}; static constexpr auto M0PerBlock = Number{}; @@ -605,7 +612,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight __device__ static void Run(const FloatAB* __restrict__ p_a_grid, const FloatAB* __restrict__ p_b_grid, FloatC* __restrict__ p_c_grid, - FloatAB* __restrict__ p_shared_block, + void* __restrict__ p_shared, const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc, const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc, const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& @@ -666,7 +673,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, FloatAB, - FloatAB, + FloatABAdjusted, decltype(a_b_k0_m_k1_grid_desc), decltype(a_b_k0_m_k1_block_desc), ABlockTransferSrcAccessOrder, @@ -696,7 +703,7 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, FloatAB, - FloatAB, + FloatABAdjusted, decltype(b_b_k0_n_k1_grid_desc), decltype(b_b_k0_n_k1_block_desc), BBlockTransferSrcAccessOrder, @@ -725,11 +732,11 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight // sanity check constexpr index_t KPack = - math::max(K1, MfmaSelector::selected_mfma.k_per_blk); + math::max(K1, MfmaSelector::selected_mfma.k_per_blk); auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1( - p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); + static_cast(p_shared), a_k0_m_k1_block_desc.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer( - p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); + static_cast(p_shared) + a_block_space_size, + b_k0_n_k1_block_desc.GetElementSpaceSize()); // gridwise GEMM pipeline const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); @@ -798,8 +804,6 @@ struct 
GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock = GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); - void* p_shared = static_cast(p_shared_block); - auto c_block_buf = make_dynamic_buffer( static_cast(p_shared), c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp index d1149c0c2e3..51c578385f4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -58,16 +58,16 @@ __global__ void c_element_op, block_2_ctile_map); #else - ignore = p_a_grid; - ignore = p_b_grid; - ignore = p_c_grid; - ignore = a_grid_desc_k0_m_k1; - ignore = b_grid_desc_k0_n_k1; - ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; - ignore = a_element_op; - ignore = b_element_op; - ignore = c_element_op; - ignore = block_2_ctile_map; + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; #endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) } @@ -131,6 +131,16 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 using GridwiseGemmPipe = remove_cvref_t())>; + // denorm test fix, required to work around fp16 mfma issue + // we convert fp16->fp32->bf16 and execute bf16 mfma instruction + // when mfma is fixed, remove this section and update + // FloatABAdjusted -> FloatAB throughout this file +#if CK_WORKAROUND_DENORM_FIX &&
defined(__gfx90a__) + using FloatABAdjusted = conditional_t, ck::bhalf_t, FloatAB>; +#else + using FloatABAdjusted = FloatAB; +#endif + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() { constexpr auto max_lds_align = K1; @@ -281,7 +291,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 using BlockwiseGemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1( - static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + a_block_space_size_aligned, + static_cast(p_shared) + a_block_space_size_aligned, b_block_desc_k0_n_k1.GetElementSpaceSize()); constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index bb28c194f4b..cba06f8e875 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 4bf87cf39f4..7a02b8b8a50 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
#pragma once @@ -1056,6 +1056,60 @@ inline __host__ __device__ constexpr bhalf_t type_convert(float return uint16_t(u.int32 >> 16); } +// convert bfp16 to fp16 via fp32 +template <> +inline __host__ __device__ constexpr half_t type_convert(bhalf_t x) +{ + float x_fp32 = type_convert(x); + + return static_cast(x_fp32); +} + +// convert fp16 to bfp16 via fp32 +template <> +inline __host__ __device__ constexpr bhalf_t type_convert(half_t x) +{ + float x_fp32 = static_cast(x); + + return type_convert(x_fp32); +} + +// convert bfp16 to int32 via fp32 +template <> +inline __host__ __device__ constexpr int32_t type_convert(bhalf_t x) +{ + float x_fp32 = type_convert(x); + + return static_cast(x_fp32); +} + +// convert int32 to bfp16 via fp32 +template <> +inline __host__ __device__ constexpr bhalf_t type_convert(int32_t x) +{ + float x_fp32 = static_cast(x); + + return type_convert(x_fp32); +} + +// convert bfp16 to int8 via fp32 +template <> +inline __host__ __device__ constexpr int8_t type_convert(bhalf_t x) +{ + float x_fp32 = type_convert(x); + + return static_cast(x_fp32); +} + +// convert int8 to bfp16 via fp32 +template <> +inline __host__ __device__ constexpr bhalf_t type_convert(int8_t x) +{ + float x_fp32 = static_cast(x); + + return type_convert(x_fp32); +} + template struct NumericLimits { diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp index 12203bd7f31..72071992f65 100644 --- a/include/ck/utility/math.hpp +++ b/include/ck/utility/math.hpp @@ -168,6 +168,10 @@ __device__ double exp(double x) return exp(x); } +static inline __host__ float exp(float x) { return std::expf(x); } + +static inline __host__ double exp(double x) { return std::exp(x); } + // greatest common divisor, aka highest common factor __host__ __device__ constexpr index_t gcd(index_t x, index_t y) { diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp index 4febace0b84..a3732b2fe0f 100644 --- a/include/ck/utility/math_v2.hpp +++ 
b/include/ck/utility/math_v2.hpp @@ -92,6 +92,15 @@ static inline __host__ float sqrt(float x) { return std::sqrt(x); }; static inline __host__ double sqrt(double x) { return std::sqrt(x); }; +static inline __host__ half_t tanh(half_t x) +{ + return static_cast(std::tanh(static_cast(x))); +}; + +static inline __host__ float tanh(float x) { return std::tanh(x); }; + +static inline __host__ double tanh(double x) { return std::tanh(x); }; + // math functions for the HIP kernel, some are implemented by calling hip builtin functions static inline __device__ float abs(float x) { return ::abs(x); }; @@ -172,5 +181,14 @@ static inline __device__ float sqrt(float x) { return __builtin_amdgcn_sqrtf(x); static inline __device__ double sqrt(double x) { return __builtin_amdgcn_sqrt(x); }; +static inline __device__ half_t tanh(half_t x) +{ + return static_cast(::tanhf(static_cast(x))); +}; + +static inline __device__ float tanh(float x) { return ::tanhf(x); }; + +static inline __device__ double tanh(double x) { return ::tanh(x); }; + } // namespace math } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 104b21a3ec4..18864395280 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -26,6 +26,7 @@ using Empty_Tuple = ck::Tuple<>; using F16_Tuple = ck::Tuple; using F16_F16_Tuple = ck::Tuple; +using F64_Tuple = ck::Tuple; using F32_Tuple = ck::Tuple; using I32_Tuple = ck::Tuple; using I32_F32_Tuple = ck::Tuple; @@ -85,6 +86,7 @@ using GK_GK_Tuple = ck::Tuple; // pointwise functor using PassThrough = ck::tensor_operation::element_wise::PassThrough; using Relu = ck::tensor_operation::element_wise::Relu; +using TanH = ck::tensor_operation::element_wise::TanH; using Scale 
= ck::tensor_operation::element_wise::Scale; using Bilinear = ck::tensor_operation::element_wise::Bilinear; using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; @@ -94,6 +96,7 @@ using FastGelu = ck::tensor_operation::element_wise::FastGelu; using AddMultiply = ck::tensor_operation::element_wise::AddMultiply; using ScaleAdd = ck::tensor_operation::element_wise::ScaleAdd; using Gelu = ck::tensor_operation::element_wise::Gelu; +using Swish = ck::tensor_operation::element_wise::Swish; template using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; @@ -102,6 +105,10 @@ template using Add_Activation_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; +template +using Add_Mul_Activation_Mul_Clamp = + ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp; + template using Activation_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; @@ -109,6 +116,10 @@ template using Add_Activation_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; +template +using Add_Mul2_Activation_Mul_Clamp = + ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp; + template struct DeviceOperationInstanceFactory; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp index a0cea7e390a..c116d999da7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp @@ -19,6 +19,7 @@ namespace tensor_operation { namespace device { namespace instance { +// float void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( std::vector>>& instances); +// double +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + std::vector>>& instances); + +void 
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + std::vector>>& instances); + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + std::vector>>& instances); + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + std::vector>>& instances); + // Contraction + Bilinear template && is_same_v && + is_same_v && is_same_v) + { + if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) + { + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + op_ptrs); + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + op_ptrs); + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + op_ptrs); + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + op_ptrs); + } + } + return op_ptrs; } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp index e921ecd47aa..e3f07606c25 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp @@ -19,6 +19,7 @@ namespace tensor_operation { namespace device { namespace instance { +// float void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( std::vector>>& instances); +// double +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + std::vector>>& instances); + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + std::vector>>& instances); + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + std::vector>>& instances); + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + std::vector>>& instances); + 
// Contraction + Scale template && is_same_v && + is_same_v) + { + if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) + { + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + op_ptrs); + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + op_ptrs); + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + op_ptrs); + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + op_ptrs); + } + } + return op_ptrs; } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp new file mode 100644 index 00000000000..367180dea49 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization_swish.hpp @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>&); + +// FP32 +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>&); + +// [x, gamma, beta, y] = [f16, f32, f32, f16] +void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances( + std::vector>>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceNormalization> +{ + using DeviceOp = DeviceNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + 
is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f32_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp index 57c971e52e8..793dc8d04aa 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp @@ -49,6 +49,22 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances( Add_Activation_Mul2_Clamp>>>& instances); +void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances( + std::vector< + std::unique_ptr>>>& + instances); + void add_device_conv2d_xdl_bias_perchannel_quantization_int8_instances( std::vector< std::unique_ptr>>>& instances); +void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances( + std::vector< + std::unique_ptr>>>& + instances); + +// piecewise activation function template +struct DeviceOperationInstanceFactory>> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD>; + + static auto GetInstances() + { + std::vector> op_ptrs; + 
+ if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v) + { + add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances(op_ptrs); + add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp index 9f8ac9b7b11..c570f76750a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp @@ -49,6 +49,21 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances( Add_Activation_Mul_Clamp>>>& instances); +void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances( + std::vector>>>& + instances); + void add_device_conv2d_xdl_bias_perlayer_quantization_int8_instances( std::vector< std::unique_ptr>>>& instances); +void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances( + std::vector>>>& + instances); + +// piecewise activation function template +struct DeviceOperationInstanceFactory>> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v) + { + 
add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances(op_ptrs); + add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt index ffd6a6a7be2..d2a0a3d0fbe 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt @@ -1,7 +1,13 @@ add_instance_library(device_contraction_bilinear_instance + #float device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp + #double + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp new file mode 100644 index 00000000000..093b2f0e98b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp @@ -0,0 +1,76 @@ +// 
SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using F64_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// k/k/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| 
CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 
16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> + // clang-format on + >; + +void 
add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp new file mode 100644 index 00000000000..0f683e5c280 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using F64_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// k/n/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, 
GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance{}); +} + 
+} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp new file mode 100644 index 00000000000..e384993aed7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using F64_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, 
n1] +// m/k/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, 
GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 
16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp new file mode 100644 index 00000000000..92e39c173f5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using F64_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// m/n/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, 
GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, F64_Tuple, F64, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance{}); +} + 
+} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt index 7ad6605486c..31f6a0fcdc9 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt @@ -1,7 +1,13 @@ add_instance_library(device_contraction_scale_instance + #float device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp + #double + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp new file mode 100644 index 00000000000..0aa927155a4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]) = E[m0, m1, n0, n1], no D (Ds is Empty_Tuple) +// k/k/n are the fast changing dimension for A/B/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder|
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 2, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 2, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 8>, 1> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance{}); +} + +} // namespace instance +} // 
namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp new file mode 100644 index 00000000000..b84ea274c50 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting. Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]) = E[m0, m1, n0, n1], no D (Ds is Empty_Tuple) +// k/n/n are the fast changing dimension for A/B/E +using
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 
2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 1, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 1, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, 
F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 1, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp new file mode 100644 index 00000000000..578469997ac --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]) = E[m0, m1, n0, n1], no D (Ds is Empty_Tuple) +// m/k/n are the fast changing dimension for A/B/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder|
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, 
GemmMNKPadding, 1, 128, 64, 128, 16, 1, 2, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 2, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 2, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance{}); +} + +} // namespace 
instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp new file mode 100644 index 00000000000..8e5a19313ee --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting. Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Scale(A[m0, m1, k0, k1] * B[n0, n1, k0, k1]) = E[m0, m1, n0, n1], no D (Ds is Empty_Tuple) +// m/n/n/n are the fast changing dimension for
A/B/D/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 
64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 16, 16, 4, 4, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 2, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 8, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 16, 16, 4, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 2, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, 
F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 16, 16, 2, 4, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 0, 1, 1, S<1, 16, 1, 16>, 1>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F64, F64, F64, F64, Empty_Tuple, F64, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 2, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, 1, 1, 1, S<1, 16, 1, 16>, 1> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt index aa0cc114805..176fb2fbee7 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -1,4 +1,11 @@ add_instance_library(device_normalization_instance - device_normalization_f16_instance.cpp - device_normalization_f32_instance.cpp + device_layernorm2d_f16_instance.cpp + device_layernorm2d_f32_instance.cpp + device_layernorm4d_f16_instance.cpp + device_layernorm4d_f32_instance.cpp + device_groupnorm_f16_instance.cpp + device_groupnorm_f32_instance.cpp + device_groupnorm_swish_f16_instance.cpp + device_groupnorm_swish_f32_instance.cpp + device_groupnorm_swish_f16_f32_f32_f16_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp new file mode 
100644 index 00000000000..e9c2112e16e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp new file mode 100644 index 00000000000..79dde38fc90 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp new file mode 100644 index 00000000000..9f6bf128fae --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_f32_f32_f16_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_normalization_f16_f32_f32_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp new file mode 100644 index 00000000000..6241e033856 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp new file mode 100644 index 00000000000..b64328d5d07 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_groupnorm_swish_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Swish = ck::tensor_operation::element_wise::Swish; + +void add_device_normalization_rank_5_3_swish_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp new file mode 100644 index 00000000000..d6a2f6f2c1c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp new file mode 100644 index 00000000000..73097828e3b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm2d_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp new file mode 100644 index 00000000000..507a683ee7a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f16_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp new file mode 100644 index 00000000000..ca1aa0c25ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_layernorm4d_f32_instance.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "normalization_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp deleted file mode 100644 index beeaa3aa22d..00000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F16 = ck::half_t; -using F32 = float; - -using Pass = ck::tensor_operation::element_wise::PassThrough; - -template -// clang-format off -using device_normalization_f16_instances = - std::tuple < - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - 
DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - >; -// clang-format on - -void add_device_normalization_rank_2_1_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_4_3_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -void add_device_normalization_rank_5_3_f16_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_normalization_f16_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp deleted file mode 100644 index 4d236fb6332..00000000000 --- a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -using F32 = float; - -using Pass = ck::tensor_operation::element_wise::PassThrough; - -template -using device_layernorm_f32_instances = std::tuple< - // clang-format off - // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, // irregular size - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl, - DeviceNormalizationImpl - // clang-format on - >; - -void add_device_normalization_rank_2_1_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_4_3_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -void add_device_normalization_rank_5_3_f32_instances( - std::vector>>& - instances) -{ - add_device_operation_instances(instances, device_layernorm_f32_instances{}); -} - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck 
diff --git a/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp new file mode 100644 index 00000000000..9dea41e89d3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/normalization_instance_common.hpp @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using device_normalization_f16_instances = + // clang-format off + std::tuple < + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; + +template +using device_normalization_f32_instances = std::tuple< + // clang-format off + // XDataType, 
GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; + +template +using device_normalization_f16_f32_f32_f16_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, ComputeDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, // irregular size + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp index 7729e426380..b231f8c956f 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp +++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/conv2d_quantization_common.hpp @@ -25,6 +25,7 @@ using GNHWK = ck::tensor_layout::convolution::GNHWK; using GK = ck::tensor_layout::convolution::G_K; using PassThrough = ck::tensor_operation::element_wise::PassThrough; using Relu = ck::tensor_operation::element_wise::Relu; +using TanH = ck::tensor_operation::element_wise::TanH; using GK_Tuple = ck::Tuple; using GK_GK_Tuple = ck::Tuple; @@ -32,17 +33,25 @@ using I32_Tuple = ck::Tuple; using F32_Tuple = ck::Tuple; using I32_F32_Tuple = ck::Tuple; +// perlayer using Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; +// bias + perlayer using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; +using Add_Mul_TanH_Mul_Clamp = + ck::tensor_operation::element_wise::Add_Mul_Activation_Mul_Clamp; +// perchannel using Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; +// bias + perchannel using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; +using Add_Mul2_TanH_Mul_Clamp = + ck::tensor_operation::element_wise::Add_Mul2_Activation_Mul_Clamp; static constexpr ck::index_t NDimSpatial = 2; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; diff 
--git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp index ba2451101ef..ae5c1d7c325 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp @@ -76,6 +76,42 @@ void add_device_conv2d_dl_bias_relu_perchannel_quantization_int8_instances( ConvFwd1x1S1P0, 4>{}); } + +void add_device_conv2d_dl_bias_tanh_perchannel_quantization_int8_instances( + std::vector>>& instances) +{ + // dl + add_device_operation_instances(instances, + device_grouped_conv2d_dl_int8_instances{}); + add_device_operation_instances(instances, + device_grouped_conv2d_dl_int8_instances{}); + add_device_operation_instances(instances, + device_grouped_conv2d_dl_int8_instances{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp index ea1c953bb2b..192d5c9a555 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp @@ -76,6 +76,43 @@ void add_device_conv2d_dl_bias_relu_perlayer_quantization_int8_instances( ConvFwd1x1S1P0, 4>{}); } + +void add_device_conv2d_dl_bias_tanh_perlayer_quantization_int8_instances( + std::vector>>& instances) +{ + 
add_device_operation_instances(instances, + device_grouped_conv2d_dl_int8_instances{}); + + add_device_operation_instances(instances, + device_grouped_conv2d_dl_int8_instances{}); + + add_device_operation_instances(instances, + device_grouped_conv2d_dl_int8_instances{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp index 25e2cda9cb8..b6e8ee15907 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp @@ -74,6 +74,41 @@ void add_device_conv2d_xdl_bias_relu_perchannel_quantization_int8_instances( ConvFwd1x1S1P0, 8>{}); } + +void add_device_conv2d_xdl_bias_tanh_perchannel_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_xdl_int8_instances{}); + add_device_operation_instances(instances, + device_grouped_conv2d_xdl_int8_instances{}); + add_device_operation_instances(instances, + device_grouped_conv2d_xdl_int8_instances{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp index d598d3d38e7..70f92cec3a4 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/quantization/conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp @@ -76,6 +76,43 @@ void add_device_conv2d_xdl_bias_relu_perlayer_quantization_int8_instances( ConvFwd1x1S1P0, 8>{}); } + +void add_device_conv2d_xdl_bias_tanh_perlayer_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_xdl_int8_instances{}); + + add_device_operation_instances(instances, + device_grouped_conv2d_xdl_int8_instances{}); + + add_device_operation_instances(instances, + device_grouped_conv2d_xdl_int8_instances{}); +} + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp index 81fec5590a8..73343f6bec2 100644 --- a/profiler/include/profiler/profile_groupnorm_impl.hpp +++ b/profiler/include/profiler/profile_groupnorm_impl.hpp @@ -190,9 +190,9 @@ bool profile_groupnorm_impl(int do_verification, if(time_kernel) { - LogRange(std::cout << "length = ", length, ",") << ", "; - std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " - << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + LogRange(std::cout << "length = ", length, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; } if(num_kernel == 0) diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 375ff493104..8f462237f5e 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -8,8 +8,8 @@ MY_PROJECT_SOURCE=$1 cmake \ -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -mcumode \ --mno-wavefrontsize64 -Wno-gnu-line-marker -save-temps=$PWD" \ +-D CMAKE_CXX_FLAGS="-std=c++17 -O3 
-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker \ +-save-temps=$PWD" \ -D CMAKE_BUILD_TYPE=Release \ -D BUILD_DEV=ON \ -D GPU_TARGETS="gfx908;gfx90a" \ From a0058be6cf36d18e8fbfe88d9fcc62ae56ba219d Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Thu, 20 Apr 2023 18:29:56 +0800 Subject: [PATCH 068/118] Disable SkipLDS & Align AIT api (#3) --- example/01_gemm/gemm_wmma_fp16.cpp | 19 ++++++------ .../gemm_bilinear_wmma_fp16.cpp | 31 ++++++++++--------- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 5 +-- .../common_wmma.hpp | 2 +- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 17 +++++----- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 18 ++++++++--- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 14 +++++++-- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 27 ++++++++++------ .../gpu/device/impl/device_gemm_wmma.hpp | 5 +-- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 8 ++--- 10 files changed, 88 insertions(+), 58 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index f637568fc0b..3f489d5f2db 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -34,24 +34,25 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle AElementOp, BElementOp, CElementOp, - GemmDefault, - 256, // BlockSize + GemmDefault, + 1, // Prefetch stage + 128, // BlockSize 128, // MPerBlock 128, // NPerBlock - 32, // KPerBlock + 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 1, // M-Repeat // M-PerWmma / M-Repeat = M-Wave - 8, // N-Repeat // N-PerWmma / N-Repeat = N-Wave - S<4, 64, 1>, + 8, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<4, 64, 1>, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, @@ -59,8 +60,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle 8, true, 1, // C shuffle (M Repeat) Per store - 4, // C shuffle (N 
Repeat) Per store - S<1, 32, 1, 8>, + 1, // C shuffle (N Repeat) Per store + S<1, 16, 1, 8>, 8>; // clang-format on diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 005e7a6eecd..77d0fbefcf0 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -72,41 +72,42 @@ using DeviceOpInstance = ELayout, ADataType, BDataType, - ck::Tuple, - EDataType, AccDataType, CShuffleDataType, + ck::Tuple, + EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, - 256, - 128, - 128, + 1, + 64, 32, + 64, + 64, 8, 16, 16, - 4, 2, - S<4, 64, 1>, + 2, + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 8, - 8, + 4, + 4, true, - S<4, 64, 1>, + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 8, - 8, + 4, + 4, true, 1, 1, - S<1, 32, 1, 8>, - 8>; + S<1, 2, 1, 32>, + 1>; int main(int argc, char* argv[]) { @@ -264,7 +265,7 @@ int main(int argc, char* argv[]) float gb_per_sec = num_btype / 1.E6 / ave_time; std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" - << std::endl; + << device_op.GetTypeString() << std::endl; e_device_buf.FromDevice(e_m_n_device_result.mData.data()); diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index 25ab210739d..ae74e4833b0 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -56,10 +56,10 @@ using DeviceOpInstanceKKNN = NumDimK, ADataType, BDataType, - DsDataType, - EDataType, AccDataType, CShuffleDataType, + DsDataType, + EDataType, AElementOp, BElementOp, CDEElementOp, @@ -67,6 +67,7 @@ using DeviceOpInstanceKKNN = ASpec, BSpec, DESpec, + 1, 256, 128, 128, diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp 
b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp index eb6975a6d81..47a59762aae 100644 --- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp @@ -39,7 +39,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index f57fe033574..3ec83873d68 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -42,15 +42,16 @@ using DeviceConvFwdInstance = OutputLayout, InKernelDataType, WeiKernelDataType, - ck::Tuple, - OutKernelDataType, AccDataType, CShuffleDataType, + ck::Tuple, + OutKernelDataType, InElementOp, WeiElementOp, OutElementOp, ConvSpec, // ConvForwardSpecialization GemmSpec, // GemmSpecialization + 1, // Prefetch stage 256, // BlockSize 128, // MPerBlock 128, // NPerBlock @@ -60,19 +61,19 @@ using DeviceConvFwdInstance = 16, // NPerWMMA 4, // MRepeat 2, // NRepeat - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<4, 8, 8>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockTransferSrcScalarPerVector + 1, // ABlockTransferDstScalarPerVector_AK1 true, // ABlockLdsExtraM - S<4, 64, 1>, // 
BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<4, 8, 8>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockTransferSrcScalarPerVector + 1, // BBlockTransferDstScalarPerVector_BK1 true, // BBlockLdsExtraN 4, 2, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 78bdd349ffc..eef0761197f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -62,10 +62,10 @@ template struct DeviceBatchedContractionMultipleD_Wmma_CShuffle @@ -132,8 +132,16 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); static constexpr auto WmmaK = 16; - static constexpr auto AEnableLds = NWaves == 1 ? false : true; - static constexpr auto BEnableLds = MWaves == 1 ? false : true; + static constexpr auto AEnableLds_auto = NWaves == 1 ? false : true; + static constexpr auto BEnableLds_auto = MWaves == 1 ? 
false : true; + + // If true, LDS is used unconditionally + static constexpr auto AEnableLds_manu = false; + // Bug: blocksize 128, Tile 128x128x64, Repeat 8x2 Failure + static constexpr auto BEnableLds_manu = true; + + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu; + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu; static constexpr auto matrix_padder = MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 45e05060115..d36fa0378b9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -143,9 +143,17 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); - static constexpr auto AEnableLds = LWaves == 1 ? false : true; - static constexpr auto B0EnableLds = MWaves == 1 ? false : true; - static constexpr auto B1EnableLds = MWaves == 1 ? false : true; + static constexpr auto AEnableLds_auto = LWaves == 1 ? false : true; + static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true; + static constexpr auto B1EnableLds_auto = MWaves == 1 ? 
false : true; + + static constexpr auto AEnableLds_manu = true; + static constexpr auto B0EnableLds_manu = true; + static constexpr auto B1EnableLds_manu = true; + + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu; + static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu; + static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu; using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< Sequence, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 29c585c8076..da08f196822 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -28,14 +28,15 @@ template struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{MPerBlock, NPerBlock, KPerBlock}; @@ -744,7 +749,11 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD" - << " NumPrefetch: " + << " AEnableLds: " + << AEnableLds << ", " + << "BEnableLds: " + << BEnableLds << ", " + << "NumPrefetch: " << NumPrefetch << ", " << "LoopScheduler: " << LoopSchedToString[LoopSched] << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 3814a559944..1c8fafa522b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -34,6 +34,7 @@ template struct DeviceGemmWmma_CShuffle : public DeviceGemm struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle From 394dbf8310ef7f2d7f7471fa8be5cd8bfff6f552 Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Fri, 21 Apr 2023 19:53:18 +0800 Subject: [PATCH 069/118] fix layernorm, reduction Ops (#4) * [Navi3x] Fix Gridwise_multiple_d 
operation (#649) * Add CMake Option "USE_OPT_NAVI3X" * fix bug * standardize docs (#655) * Separate bibtex requirement from rocm-docs-core (#656) * separate bibtex requirement from rocm-docs-core * point requirements to source rocm-docs-core repo * Add CMake Option "USE_OPT_NAVI3X" (#647) * Add CMake Option "USE_OPT_NAVI3X" * remove navi3x opt compile option from cmake script * Conv + quantization + tanh (#645) * Rename file. Prepare to support another activation * Add comment for quantization * Extract out_elementop * Add tanh example * Add conv + bias + tanh quantization instance * Add missing parameter * Refine cmake * Add external api and client example * Extract variable in example * Fix the comment --------- Co-authored-by: zjing14 * Add a denorm test fix (#603) * Add type_convert implementations for bf16 * Add the fix for conv_fwd * Add the fix for conv_bwd_data * Add the fix for conv_bwd_weight * Format * Format * Another format * Add a macro to use workaround on MI200 only * Format --------- Co-authored-by: Rosty Geyyer Co-authored-by: zjing14 * simplify karg in device/grid of split-k op (#644) * simplify karg in device/grid split-k op * fix mk_kn_mn instances * add more instances * use name from tensor layout * fix 3rd dword of buffer source descriptor (#659) * add fp64 instances (#658) Co-authored-by: root * Issue #666: Revert "simplify karg in device/grid of split-k op (#644)" (#665) This reverts commit bb5530af91352dca062b791313d9b77700335ae9. 
* Groupnorm + swish external api (#668) * Rename to proper naming * Add example of groupnorm + swish * Extract duplicate code in example * Add groupnorm + swish instances * Refactor instance generation, split into multiple cpp file * Add external api and client example * Refine profiler message * Use ck math version of exp * Refine problem size in example * Add host version of exp * add a macro to turn on/off denorm fix (off by default) (#673) * add a macro to turn off denorm fix by default * expose the macro --------- Co-authored-by: root * fixed quant example (#672) Co-authored-by: root * Add dependabot config and pin rocm-docs-core (#663) * [gtest] suppress unsafe buffer warn (#670) ref: https://github.com/ROCmSoftwarePlatform/MIOpen/pull/1912 * Add memory index guard in wmma device ops (#667) * Add more macros to turn on/off denorm fix (#678) Co-authored-by: Rosty Geyyer * Fix a typo (#676) * Add (#677) * Allow using ROCm release candidate compilers. (#679) * enable use of rocm5.5 release candidate 4 * upgrade to ROCM5.5 RC5 * try fix the PUB_KEY error, remove the cmake-data package * upgrade to latest cmake version * use private dockerhub repo for rocm5.5 rc5 * add missing bracket * Disable SkipLDS & Align AIT api * Update dependabot config (#682) Co-authored-by: samjwu * update attn api * solve type_convert bug + enable --------- Co-authored-by: Sam Wu Co-authored-by: Sam Wu Co-authored-by: rocking5566 Co-authored-by: zjing14 Co-authored-by: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Co-authored-by: Rosty Geyyer Co-authored-by: carlushuang Co-authored-by: root Co-authored-by: Jun Liu Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> Co-authored-by: samjwu Co-authored-by: haocwang --- .github/dependabot.yml | 2 +- example/13_pool2d_fwd/pool2d_fwd_fp16.cpp | 2 +- ...emm_scale_softmax_gemm_permute_wmma_fp16.cpp | 3 ++- include/ck/ck.hpp | 4 ++-- ..._gemm_softmax_gemm_permute_wmma_cshuffle.hpp | 6 +++--- 
.../gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 5 ----- .../thread/threadwise_tensor_slice_transfer.hpp | 16 ++++++++-------- include/ck/utility/data_type.hpp | 17 +++++++++++++---- 8 files changed, 30 insertions(+), 25 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ada22f1b56d..9cdf2d670c3 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -6,7 +6,7 @@ version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values - directory: "/" # Location of package manifests + directory: "/docs/.sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp index 659f3251dcf..bae5069d276 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp @@ -19,7 +19,7 @@ using IndexDataType = int32_t; using InLayout = ck::tensor_layout::convolution::NHWC; using OutLayout = ck::tensor_layout::convolution::NHWC; -#if 1 +#if 0 static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; #else static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index 552fd9dd1cb..aa122b4c9ff 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -77,12 +77,12 @@ using DeviceGemmInstance = ADataType, B0DataType, B1DataType, + CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, - CDataType, AElementOp, B0ElementOp, Acc0ElementOp, @@ -93,6 +93,7 @@ using DeviceGemmInstance = TensorSpecB0, TensorSpecB1, TensorSpecC, + 1, 256, // Gemm 0 128, // 
MPerBlock diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index 67ed45fc51a..8dd3cf71d47 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -70,10 +70,10 @@ // TODO: enable buffer load when found correct 3rd dword // buffer load -#define CK_USE_AMD_BUFFER_LOAD 0 +#define CK_USE_AMD_BUFFER_LOAD 1 // buffer store -#define CK_USE_AMD_BUFFER_STORE 0 +#define CK_USE_AMD_BUFFER_STORE 1 // buffer atomic add: integer #define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1 diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index d36fa0378b9..2554cb189b8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -34,12 +34,12 @@ template struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle @@ -147,7 +147,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true; static constexpr auto B1EnableLds_auto = MWaves == 1 ? 
false : true; - static constexpr auto AEnableLds_manu = true; + static constexpr auto AEnableLds_manu = false; static constexpr auto B0EnableLds_manu = true; static constexpr auto B1EnableLds_manu = true; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 5e0f6134095..cf4e1cdfe6e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -714,13 +714,8 @@ struct GridwiseGemmMultipleD_Wmma const auto MBlock = M / MPerBlock; const auto NBlock = N / NPerBlock; - const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( e_grid_desc_m_n, - make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), - make_unmerge_transform(make_tuple(NBlock, Number{}))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); return e_grid_desc_mblock_mperblock_nblock_nperblock; } diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index e12ba154c63..82ff101dd77 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1407,32 +1407,32 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow if constexpr(IntraRowSwizzlePerm) { temp = __builtin_amdgcn_permlane16( - temp, type_convert(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0); - v_this_row = type_convert(temp); + temp, type_convert_sp(v_this_row), 0xb3a29180, 0xf7e6d5c4, 1, 0); + v_this_row = type_convert_sp(temp); } // apply inter-row permute. 
temp = __builtin_amdgcn_permlanex16(temp, - type_convert(v_this_row), + type_convert_sp(v_this_row), LowEightRowlaneIdx, HighEightRowLaneIdx, 1, 0); - v_theother_row = type_convert(temp); + v_theother_row = type_convert_sp(temp); if(get_thread_local_1d_id() % 32 < 16) { // apply type convert - dst_buf(Number{}) = type_convert(v_this_row); + dst_buf(Number{}) = type_convert_sp(v_this_row); dst_buf(Number{}) = - type_convert(v_theother_row); + type_convert_sp(v_theother_row); } else { // apply type convert dst_buf(Number{}) = - type_convert(v_this_row); - dst_buf(Number{}) = type_convert(v_theother_row); + type_convert_sp(v_this_row); + dst_buf(Number{}) = type_convert_sp(v_theother_row); } }); }); diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 7a02b8b8a50..86e0361c4fb 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -964,8 +964,17 @@ inline __host__ __device__ constexpr float type_convert(bhalf_t return u.fp32; } +// Convert X to Y +template +__host__ __device__ constexpr Y type_convert_sp(X x) +{ + static_assert(!std::is_reference_v && !std::is_reference_v); + + return static_cast(x); +} + template <> -inline __host__ __device__ constexpr int type_convert(float x) +inline __host__ __device__ constexpr int type_convert_sp(float x) { union { @@ -977,7 +986,7 @@ inline __host__ __device__ constexpr int type_convert(float x) } template <> -inline __host__ __device__ constexpr float type_convert(int x) +inline __host__ __device__ constexpr float type_convert_sp(int x) { union { @@ -989,7 +998,7 @@ inline __host__ __device__ constexpr float type_convert(int x) } template <> -inline __host__ __device__ constexpr int type_convert(half_t x) +inline __host__ __device__ constexpr int type_convert_sp(half_t x) { union { @@ -1001,7 +1010,7 @@ inline __host__ __device__ constexpr int type_convert(half_t x) } template <> -inline __host__ __device__ constexpr half_t type_convert(int x) +inline __host__ 
__device__ constexpr half_t type_convert_sp(int x) { union { From bddc3afa97518a2b3b5e5bf5667033c610e15fc7 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 21 Apr 2023 12:44:58 +0000 Subject: [PATCH 070/118] fix typo --- .../gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index cf4e1cdfe6e..acdd76ad371 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -716,6 +716,10 @@ struct GridwiseGemmMultipleD_Wmma const auto NBlock = N / NPerBlock; const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( e_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); return e_grid_desc_mblock_mperblock_nblock_nperblock; } From f677f7028ba0d48d64995bd4c2367fdc3c1b5e90 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Sat, 22 Apr 2023 05:42:24 +0000 Subject: [PATCH 071/118] Fix attention with causal mask --- .../CMakeLists.txt | 2 + ...e_scale_softmax_gemm_permute_wmma_fp16.cpp | 165 ++++++++++++++++++ .../gpu/block/blockwise_gemm_wmma.hpp | 2 +- 3 files changed, 168 insertions(+), 1 deletion(-) create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index b22a376a8cd..5e83cc27576 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -6,6 +6,7 @@ 
add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) if(GPU_TARGETS MATCHES "gfx1100") + add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) endif() @@ -20,5 +21,6 @@ add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_tria if(GPU_TARGETS MATCHES "gfx1100") add_custom_target(example_gemm_scale_softmax_gemm_wmma) + add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16) add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16) endif() \ No newline at end of file diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp new file mode 100644 index 00000000000..cbd23d00677 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. 
Computes C_g_m_n = Softmax(A_g_m_k * B0_g_k_l) * B1_g_l_n + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using Acc0DataType = F32; +using Acc1DataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle; + +static constexpr auto 
TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + Acc0BiasDataType, + Acc0DataType, + Acc1BiasDataType, + Acc1DataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + // Gemm 0 + 128, // MPerBlock + 64, // LPerBlock + 64, // KPerBlock + 8, // K1 + // Gemm 1 + 64, // NPerBlock + 64, // LTilePerBlock + 8, // L1 + 16, // MPerWMMA + 16, // LPerWMMA + 16, // NPerWMMA + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, // MRepeat + 4, // LRepeat + 4, // NRepeat + S<4, 64, 1>, // ABlockTransfer MK -> K0 M K1 + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // B0BlockTransfer LK -> K0 L K1 + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 8, 8>, // B1BlockTransfer NL -> L0 N L1 + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 8, + 1, + false, + 1, // CShuffleMWmmaPerWavePerShuffle + 2, // CShuffleNWmmaPerWavePerShuffle + S<1, 64, 1, 4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using 
ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index b91e1de4c9b..432288167cd 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -170,7 +170,7 @@ struct BlockwiseGemmWMMA const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk3D(); return make_tuple( - Number{}, blk_idx[I0], waveId_m, Number{}, waveId_n, blk_idx[I1], blk_idx[I2]); + Number{}, waveId_m, blk_idx[I0], Number{}, waveId_n, blk_idx[I1], blk_idx[I2]); } using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); From 9e1091cd67d9d3cc42c3065faa61ed1f3bef34a6 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Sun, 23 Apr 2023 08:30:10 +0000 Subject: [PATCH 072/118] multiple fix, try ait compile --- .../gpu/block/blockwise_gemm_wmma.hpp | 11 ++++++----- ...atched_contraction_multiple_d_wmma_cshuffle.hpp | 9 +++++++-- .../impl/device_gemm_multiple_d_wmma_cshuffle.hpp | 6 ++---- .../gpu/device/impl/device_gemm_wmma.hpp | 3 +-- ...e_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 14 +++++++++----- 5 files changed, 25 insertions(+), 18 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 432288167cd..073061445b6 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -418,13 +418,14 @@ struct BlockwiseGemmWMMA } protected: - // A[K0, M0, M1, M2, K1] - static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( + 
make_tuple(Number{}, Number{}, I1, I1, Number{}), + make_tuple(Number{}, Number{}, Number{}, Number{}, Number<1>{})); // B[K0, N0, N1, N2, K1] - static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, I1, I1, Number{}), + make_tuple(Number{}, Number{}, Number{}, Number{}, Number<1>{})); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index eef0761197f..508d12549b7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -136,8 +136,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto BEnableLds_auto = MWaves == 1 ? 
false : true; // If true, LDS is used unconditionally - static constexpr auto AEnableLds_manu = false; - // Bug: blocksize 128, Tile 128x128x64, Repeat 8x2 Failure + // Bug, MNK vector load check not implemented correctly + static constexpr auto AEnableLds_manu = true; static constexpr auto BEnableLds_manu = true; static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu; @@ -725,6 +725,11 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle // Batch Offset ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // for checking vector load/store + // index_t MRaw_; + // index_t NRaw_; + // index_t KRaw_; }; // Invoker diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index da08f196822..b4ef5237461 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -98,9 +98,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{MPerBlock, NPerBlock, KPerBlock}; - // Describe how data read from Global memory + // Describe how data read from Global memory static auto MakeAGridDescriptor(index_t MRaw, index_t KRaw, index_t StrideA) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 1c8fafa522b..5f754b765b2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -92,8 +92,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{}; From 6e2c6159ac8b0567902334a28ceae7b0e6829305 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 27 Apr 2023 02:18:27 +0000 Subject: [PATCH 073/118] Add A/B not use LDS pipeline --- example/01_gemm/gemm_wmma_fp16.cpp | 14 +-- 
.../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 94 +++++++++++++++++++ 2 files changed, 101 insertions(+), 7 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 3f489d5f2db..c34d63bce3c 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -36,23 +36,23 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle CElementOp, GemmDefault, 1, // Prefetch stage - 128, // BlockSize + 256, // BlockSize 128, // MPerBlock - 128, // NPerBlock + 256, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 8, // M-Repeat // M-PerWmma / M-Repeat = M-Wave - 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave - S<4, 32, 1>, + 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<4, 32, 1>, + S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, @@ -61,7 +61,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle true, 1, // C shuffle (M Repeat) Per store 1, // C shuffle (N Repeat) Per store - S<1, 16, 1, 8>, + S<1, 32, 1, 8>, 8>; // clang-format on diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index f0acecda0c1..c6d82923260 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -462,6 +462,100 @@ struct GridwiseGemmPipeline_v1<1, true, false> template <> struct GridwiseGemmPipeline_v1<1, false, false> { + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void 
Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + auto b_block_buf_switch = b_block_buf; + auto a_block_buf_switch = a_block_buf; + + // preload data into LDS + a_blockwise_copy.Run( + a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf); + b_blockwise_copy.Run( + b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.Run( + a_grid_desc, a_grid_buf, a_block_desc, a_block_origin_idx, a_block_buf_switch); + b_blockwise_copy.Run( + b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch); + + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_block_buf = a_block_buf_switch; + b_block_buf = b_block_buf_switch; + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + } + } }; template From 
d676da85ef56f789b2cebfdbf7cee69eb88d7c61 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 27 Apr 2023 09:48:53 +0000 Subject: [PATCH 074/118] Clang format, Add gfx1101, gfx1102 support of FMHA example --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 48 +++++++++---------- .../CMakeLists.txt | 4 +- .../gpu/block/blockwise_gemm_wmma.hpp | 2 +- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 3 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 4 +- script/clang-format-overwrite.sh | 4 +- 6 files changed, 32 insertions(+), 33 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 3ec83873d68..84add520b61 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -49,32 +49,32 @@ using DeviceConvFwdInstance = InElementOp, WeiElementOp, OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // Prefetch stage - 256, // BlockSize - 128, // MPerBlock - 128, // NPerBlock - 32, // KPerBlock - 8, // K1 - 16, // MPerWMMA - 16, // NPerWMMA - 4, // MRepeat - 2, // NRepeat + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // Prefetch stage + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 8, // K1 + 16, // MPerWMMA + 16, // NPerWMMA + 4, // MRepeat + 2, // NRepeat S<4, 8, 8>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 1, // ABlockTransferSrcScalarPerVector - 1, // ABlockTransferDstScalarPerVector_AK1 - true, // ABlockLdsExtraM + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, 
// ABlockTransferSrcVectorDim + 1, // ABlockTransferSrcScalarPerVector + 1, // ABlockTransferDstScalarPerVector_AK1 + true, // ABlockLdsExtraM S<4, 8, 8>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 1, // BBlockTransferSrcScalarPerVector - 1, // BBlockTransferDstScalarPerVector_BK1 - true, // BBlockLdsExtraN + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 1, // BBlockTransferSrcScalarPerVector + 1, // BBlockTransferDstScalarPerVector_BK1 + true, // BBlockLdsExtraN 4, 2, S<1, 32, 1, 8>, diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 5e83cc27576..5aa5692aacf 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -5,7 +5,7 @@ add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) -if(GPU_TARGETS MATCHES "gfx1100") +if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 
batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) endif() @@ -19,7 +19,7 @@ add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_soft add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) -if(GPU_TARGETS MATCHES "gfx1100") +if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") add_custom_target(example_gemm_scale_softmax_gemm_wmma) add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16) add_dependencies(example_gemm_scale_softmax_gemm_wmma example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 073061445b6..95918252a81 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -418,7 +418,7 @@ struct BlockwiseGemmWMMA } protected: - static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( make_tuple(Number{}, Number{}, I1, I1, Number{}), make_tuple(Number{}, Number{}, Number{}, Number{}, Number<1>{})); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 3328031261d..5d582ab52ef 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -99,7 +99,7 @@ template {}; diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index acdd76ad371..b22f703611e 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -712,8 +712,8 @@ struct GridwiseGemmMultipleD_Wmma const auto M = e_grid_desc_m_n.GetLength(I0); const auto N = e_grid_desc_m_n.GetLength(I1); - const auto MBlock = M / MPerBlock; - const auto NBlock = N / NPerBlock; + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( e_grid_desc_m_n, make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index 2ddbb6440d8..3a09d6038a4 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,2 @@ -# find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From 716860e37dc6c3286610772c8b942b86dc809d6c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 27 Apr 2023 09:49:18 +0000 Subject: [PATCH 075/118] cancel change of format script --- script/clang-format-overwrite.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index 3a09d6038a4..2ddbb6440d8 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,2 @@ -find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +# find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From 0bb08f4b61f3a24926b592a9a368697b96bb4719 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 10 May 2023 09:11:24 +0000 Subject: [PATCH 076/118] 1. Enable 2-stage global Prefetch ( May cause VGPR spilling) 2. Enable FP16 accumulator blockwise_gemm --- example/01_gemm/gemm_wmma_fp16.cpp | 16 ++++---- example/01_gemm/run_gemm_example.inc | 4 ++ .../gemm_bilinear_wmma_fp16.cpp | 24 +++++------ .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 14 +++---- .../gpu/block/blockwise_gemm_wmma.hpp | 24 +++++++++-- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 4 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 6 +-- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 4 +- .../gpu/device/impl/device_gemm_wmma.hpp | 6 +-- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 4 +- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 6 ++- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 6 ++- .../gpu/grid/gridwise_gemm_wmma.hpp | 9 ++-- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 41 +++++++++---------- include/ck/utility/amd_wmma.hpp | 6 +-- 15 files changed, 101 insertions(+), 73 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index c34d63bce3c..a5ce2215f25 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -35,24 +35,24 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle BElementOp, CElementOp, GemmDefault, - 1, // Prefetch stage - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock + 2, // Prefetch stage + 128, // 
BlockSize + 64, // MPerBlock + 128, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave - S<4, 64, 1>, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<4, 64, 1>, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, @@ -61,7 +61,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle true, 1, // C shuffle (M Repeat) Per store 1, // C shuffle (N Repeat) Per store - S<1, 32, 1, 8>, + S<1, 32, 1, 4>, 8>; // clang-format on diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index 30f11d9089f..b9806a72a68 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -47,6 +47,10 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); break; + case 5: + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(b_k_n); + break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 77d0fbefcf0..1f3fdcb2ebc 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -80,34 +80,34 @@ using DeviceOpInstance = BElementOp, CDEElementOp, GemmSpec, - 1, - 64, - 32, + 2, + 128, 64, + 128, 64, 8, 16, 16, 2, - 2, - S<4, 16, 1>, + 4, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 4, - 4, + 8, + 8, true, - S<4, 16, 1>, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 4, - 4, + 8, + 8, true, 1, 1, - S<1, 2, 1, 32>, - 1>; + S<1, 32, 1, 4>, + 8>; int main(int 
argc, char* argv[]) { diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index ae74e4833b0..a9dceed3175 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -67,24 +67,24 @@ using DeviceOpInstanceKKNN = ASpec, BSpec, DESpec, - 1, - 256, + 2, 128, + 64, 128, 32, 8, 16, 16, - 8, - 1, - S<4, 64, 1>, + 2, + 4, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, - S<4, 64, 1>, + S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, @@ -93,7 +93,7 @@ using DeviceOpInstanceKKNN = true, 1, 1, - S<1, 16, 1, 16>, + S<1, 32, 1, 4>, 8>; using DeviceOpInstance = DeviceOpInstanceKKNN; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 95918252a81..f0784cb21fa 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -211,10 +211,27 @@ struct BlockwiseGemmWMMA constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - + constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3]; + return make_naive_tensor_descriptor( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, + I1, + I1, + Number{}, + I1, + I1, + MAccVgprs), + make_tuple(Number{} * MAccVgprs * AccStride, + Number{} * MAccVgprs * AccStride, + Number{} * MAccVgprs * AccStride, + MAccVgprs * 
AccStride, + MAccVgprs * AccStride, + MAccVgprs * AccStride, + AccStride) + ); + #if 0 return make_naive_tensor_descriptor_packed( // |MRepeat |MWave |MSubGroup |NRepeat |NWave // |NThreadPerSubGroup |MAccVgprs @@ -225,6 +242,7 @@ struct BlockwiseGemmWMMA I1, NThreadPerSubGroup, MAccVgprs)); + #endif } template diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 508d12549b7..387b7aa0684 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -140,8 +140,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto AEnableLds_manu = true; static constexpr auto BEnableLds_manu = true; - static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu; - static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu; + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); static constexpr auto matrix_padder = MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 2554cb189b8..9397938b4f4 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -151,9 +151,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto B0EnableLds_manu = true; static constexpr auto 
B1EnableLds_manu = true; - static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu; - static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu; - static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu; + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch >1); + static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch >1); + static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch >1); using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< Sequence, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index b4ef5237461..301c7d83b40 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -101,8 +101,8 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD 1); + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); static constexpr auto matrix_padder = MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 5f754b765b2..0f55f8c681c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -94,8 +94,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm1); + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch>1); static constexpr auto matrix_padder = MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; @@ -467,7 +467,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm || is_same_v)) + if constexpr(!(is_same_v || is_same_v 
|| is_same_v)) { printf("DeviceOp err: AccDataType"); return false; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 5d582ab52ef..cdbb34a205d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -177,8 +177,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle static constexpr auto AEnableLds_manu = false; static constexpr auto BEnableLds_manu = false; - static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu; - static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu; + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumGemmKPrefetchStage > 1); + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumGemmKPrefetchStage > 1); static constexpr auto conv_to_gemm_transformer = TransformConvFwdToGemm{}; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index c332625e802..51009bb36ca 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -868,7 +868,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma /* index_t SrcScalarStrideInVector, */ 1, /* index_t DstScalarStrideInVector, */ 1, /* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, -/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, + NumGemmKPrefetchStage>( a_grid_desc, make_multi_index(0, m_block_data_idx_on_grid, 0), 
a_element_op, @@ -943,7 +944,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma 1, 1, B0ThreadTransferSrcResetCoordinateAfterRun, - true>( + true, + NumGemmKPrefetchStage>( b0_grid_desc, make_multi_index(0, 0, 0), b0_element_op, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index b22f703611e..af70d4e9916 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -874,7 +874,8 @@ struct GridwiseGemmMultipleD_Wmma /* index_t SrcScalarStrideInVector, */ 1, /* index_t DstScalarStrideInVector, */ 1, /* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, -/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, + NumGemmKPrefetchStage>( a_grid_desc, make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, @@ -950,7 +951,8 @@ struct GridwiseGemmMultipleD_Wmma 1, 1, BThreadTransferSrcResetCoordinateAfterRun, - true>( + true, + NumGemmKPrefetchStage>( b_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index a181b8016c9..1fb2e153c71 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -636,7 +636,8 @@ struct GridwiseGemm_Wmma /* index_t SrcScalarStrideInVector, */ 1, /* index_t DstScalarStrideInVector, */ 1, /* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, -/* bool ThreadTransferDstResetCoordinateAfterRun, */ true>( +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, + NumGemmKPrefetchStage>( a_grid_desc, 
make_multi_index(0, m_block_data_idx_on_grid, 0), a_element_op, @@ -712,7 +713,8 @@ struct GridwiseGemm_Wmma 1, 1, BThreadTransferSrcResetCoordinateAfterRun, - true>( + true, + NumGemmKPrefetchStage>( b_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, @@ -814,10 +816,11 @@ struct GridwiseGemm_Wmma /*******************************************************************************/ // write out to C, implement shuffle { + // C mapping in single thread. constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); - // This API Provide All dimension (size) you need + // C mapping in single block constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 83ecc61b39f..775353bea83 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -89,6 +89,7 @@ struct wmma_type @@ -113,7 +113,7 @@ struct wmma_type::Run(a, b, reg_c); + intrin_wmma_f32_16x16x16_f16_w32::Run(a, b, reg_c); } else if constexpr(wave_size == 64) { @@ -134,6 +134,7 @@ struct wmma_type @@ -158,7 +159,6 @@ struct wmma_type struct wmma_type @@ -191,15 +191,14 @@ struct wmma_type::Run(a, b, reg_c); + intrin_wmma_f16_16x16x16_f16_w32::Run(a, b, reg_c); } else if constexpr(wave_size == 64) { - intrin_wmma_f16_16x16x16_f16_w64::Run(a, b, reg_c); + intrin_wmma_f16_16x16x16_f16_w64::Run(a, b, reg_c); } } }; - template struct wmma_type::Run(a, b, reg_c); + intrin_wmma_bf16_16x16x16_bf16_w32::Run(a, b, reg_c); } else if constexpr(wave_size == 64) { - intrin_wmma_bf16_16x16x16_bf16_w64::Run(a, b, reg_c); + 
intrin_wmma_bf16_16x16x16_bf16_w64::Run(a, b, reg_c); } } }; -#endif - template struct wmma_type( + wmma_instr.template run( p_a_wave, p_b_wave, p_c_thread); } else { - wmma_instr.template run( + wmma_instr.template run( p_b_wave, p_a_wave, p_c_thread); } } @@ -556,7 +555,7 @@ struct WmmaGemm __host__ __device__ static constexpr auto GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths() { - return make_tuple(I1, I1, Number{}); + return make_tuple(I1, I1, Number{}, Number{}); } }; diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index 3fa86ca0790..bf091425485 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -12,11 +12,11 @@ namespace ck { /********************************WAVE32 MODE***********************************************/ // src: fp16, dst: fp32 -template +template struct intrin_wmma_f32_16x16x16_f16_w32; -template -struct intrin_wmma_f32_16x16x16_f16_w32<16, 16, AssemblyBackend> +template <> +struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> { template __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) From 5bf77d8baea8b073acefee2943eca999d8248348 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 10 May 2023 09:26:39 +0000 Subject: [PATCH 077/118] clang-format --- .../gpu/block/blockwise_gemm_wmma.hpp | 19 +++++-------- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 6 ++--- .../gpu/device/impl/device_gemm_wmma.hpp | 7 ++--- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 6 +++-- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 27 +++++++------------ 5 files changed, 27 insertions(+), 38 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index f0784cb21fa..8c1a16c4570 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -211,27 +211,20 @@ struct BlockwiseGemmWMMA 
constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3]; + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3]; return make_naive_tensor_descriptor( // |MRepeat |MWave |MSubGroup |NRepeat |NWave // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, - I1, - I1, - Number{}, - I1, - I1, - MAccVgprs), + make_tuple(Number{}, I1, I1, Number{}, I1, I1, MAccVgprs), make_tuple(Number{} * MAccVgprs * AccStride, Number{} * MAccVgprs * AccStride, Number{} * MAccVgprs * AccStride, MAccVgprs * AccStride, MAccVgprs * AccStride, MAccVgprs * AccStride, - AccStride) - ); - #if 0 + AccStride)); +#if 0 return make_naive_tensor_descriptor_packed( // |MRepeat |MWave |MSubGroup |NRepeat |NWave // |NThreadPerSubGroup |MAccVgprs @@ -242,7 +235,7 @@ struct BlockwiseGemmWMMA I1, NThreadPerSubGroup, MAccVgprs)); - #endif +#endif } template diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 9397938b4f4..082e7185d86 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -151,9 +151,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto B0EnableLds_manu = true; static constexpr auto B1EnableLds_manu = true; - static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch >1); - static constexpr auto B0EnableLds = B0EnableLds_auto || 
B0EnableLds_manu || (NumPrefetch >1); - static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch >1); + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); + static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1); + static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1); using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< Sequence, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 0f55f8c681c..98d3d168f4d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -94,8 +94,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm1); - static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch>1); + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); static constexpr auto matrix_padder = MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; @@ -467,7 +467,8 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm || is_same_v || is_same_v)) + if constexpr(!(is_same_v || is_same_v || + is_same_v)) { printf("DeviceOp err: AccDataType"); return false; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index cdbb34a205d..435d58dd0ad 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -177,8 +177,10 @@ struct 
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle static constexpr auto AEnableLds_manu = false; static constexpr auto BEnableLds_manu = false; - static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumGemmKPrefetchStage > 1); - static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumGemmKPrefetchStage > 1); + static constexpr auto AEnableLds = + AEnableLds_auto || AEnableLds_manu || (NumGemmKPrefetchStage > 1); + static constexpr auto BEnableLds = + BEnableLds_auto || BEnableLds_manu || (NumGemmKPrefetchStage > 1); static constexpr auto conv_to_gemm_transformer = TransformConvFwdToGemm{}; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 775353bea83..545292943f1 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -104,11 +104,7 @@ struct wmma_type + template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const { if constexpr(wave_size == 32) @@ -142,7 +138,7 @@ struct wmma_type @@ -182,11 +178,7 @@ struct wmma_type + template __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const { if constexpr(wave_size == 32) @@ -261,7 +253,7 @@ struct wmma_type( - p_a_wave, p_b_wave, p_c_thread); + wmma_instr.template run(p_a_wave, p_b_wave, p_c_thread); } else { - wmma_instr.template run( - p_b_wave, p_a_wave, p_c_thread); + wmma_instr.template run(p_b_wave, p_a_wave, p_c_thread); } } @@ -555,7 +545,10 @@ struct WmmaGemm __host__ __device__ static constexpr auto GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths() { - return make_tuple(I1, I1, Number{}, Number{}); + return make_tuple(I1, + I1, + Number{}, + Number{}); } }; From 2ec3f4c3f675c3a67229b0fc88ab5e0d4455f195 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 18 May 2023 08:48:18 +0000 Subject: [PATCH 078/118] 1. change blockwise gemm loopover direction from kmn to mnk ( ~1% improvement) 2. 
change kernel timing mode to 50 warmup + 50 timed repeat --- example/01_gemm/gemm_wmma_fp16.cpp | 8 +++---- include/ck/host_utility/kernel_launch.hpp | 9 ++++--- .../gpu/block/blockwise_gemm_wmma.hpp | 24 +++++++++++-------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index a5ce2215f25..6d19648dc91 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -37,14 +37,14 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle GemmDefault, 2, // Prefetch stage 128, // BlockSize - 64, // MPerBlock - 128, // NPerBlock + 128, // MPerBlock + 64, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave - 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index c12147bb14f..48a920ef050 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -32,10 +32,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config, printf("Warm up 1 time\n"); #endif - // warm up - // kernel<<>>(args...); + const int nrepeat = 50; + + for(int i = 0; i < nrepeat; ++i) + { + kernel<<>>(args...); + } - const int nrepeat = 100; #if DEBUG_LOG printf("Start running %d times...\n", nrepeat); #endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 8c1a16c4570..519a027e547 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -312,10 +312,14 @@ struct BlockwiseGemmWMMA // basic intrinsic to determine loopover direction 
if constexpr(MRepeat < NRepeat) { - static_for<0, KPerBlock / WmmaK, 1>{}( - [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A + + + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, KPerBlock / WmmaK, 1>{}( + [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, make_tuple(Number{}, @@ -327,8 +331,6 @@ struct BlockwiseGemmWMMA a_thread_desc_, make_tuple(I0, m0, I0, I0, I0), a_thread_buf); - - static_for<0, NRepeat, 1>{}([&](auto n0) { // read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, @@ -370,10 +372,14 @@ struct BlockwiseGemmWMMA } else { - static_for<0, KPerBlock / WmmaK, 1>{}( - [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B + + + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KPerBlock / WmmaK, 1>{}( + [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + // read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, make_tuple(Number{}, @@ -385,8 +391,6 @@ struct BlockwiseGemmWMMA b_thread_desc_, make_tuple(I0, n0, I0, I0, I0), b_thread_buf); - - static_for<0, MRepeat, 1>{}([&](auto m0) { // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, From c713d22405660b52c490e8cb95058bbb714e98a5 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 19 May 2023 06:00:33 +0000 Subject: [PATCH 079/118] Update low level abstration of blockwise gemm wmma --- example/01_gemm/gemm_wmma_fp16.cpp | 4 +- include/ck/host_utility/kernel_launch.hpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 303 +++++++++++------- .../gpu/device/impl/device_gemm_wmma.hpp | 29 +- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 12 +- .../gpu/grid/gridwise_gemm_wmma.hpp | 155 ++++----- .../threadwise_tensor_slice_transfer.hpp | 9 +- 7 files changed, 307 insertions(+), 207 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index 
6d19648dc91..e09c32588db 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -37,8 +37,8 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle GemmDefault, 2, // Prefetch stage 128, // BlockSize - 128, // MPerBlock - 64, // NPerBlock + 128, // MPerBlock + 64, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index 48a920ef050..7593056b620 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -34,7 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, #endif const int nrepeat = 50; - for(int i = 0; i < nrepeat; ++i) + for(int i = 0; i < nrepeat; ++i) { kernel<<>>(args...); } diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 519a027e547..b0afa851142 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -55,6 +55,7 @@ struct BlockwiseGemmWMMA static constexpr auto I2 = Number<2>{}; static constexpr auto I3 = Number<3>{}; static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; static constexpr auto WmmaK = Number<16>{}; using ThisThreadBlock = ThisThreadBlock; @@ -62,8 +63,13 @@ struct BlockwiseGemmWMMA // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. static constexpr index_t WaveSize = 32; - static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I4); - static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I4); + // When use LDS, each Row(16 consecutive lanes) read whole data from source buffer + // When not use LDS, each Row read half of whole data from source buffer, exchange the data via + // permutation + static constexpr index_t A_KRow = AEnableLds ? 
1 : 2; + static constexpr index_t B_KRow = BEnableLds ? 1 : 2; + static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5); + static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5); static constexpr auto wmma_gemm = WmmaGemm{}; @@ -71,10 +77,6 @@ struct BlockwiseGemmWMMA static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); - // Read from Lds, duplicate Twice, Read from VGPR, no duplication. - static constexpr index_t A_Data_Duplicated_Rate = AEnableLds ? 2 : 1; - static constexpr index_t B_Data_Duplicated_Rate = BEnableLds ? 2 : 1; - StaticBufferTupleOfVector{}, waveId_m, blk_idx[I0], Number{}, waveId_n, blk_idx[I1], blk_idx[I2]); } - using Tuple5 = decltype(CalculateAThreadOriginDataIndex()); - __host__ __device__ BlockwiseGemmWMMA(Tuple5 a_origin = CalculateAThreadOriginDataIndex(), - Tuple5 b_origin = CalculateBThreadOriginDataIndex()) + using Tuple6 = decltype(CalculateAThreadOriginDataIndex()); + __host__ __device__ BlockwiseGemmWMMA(Tuple6 a_origin = CalculateAThreadOriginDataIndex(), + Tuple6 b_origin = CalculateBThreadOriginDataIndex()) : a_thread_copy_(a_origin), b_thread_copy_(b_origin) { static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(), @@ -224,18 +226,6 @@ struct BlockwiseGemmWMMA MAccVgprs * AccStride, MAccVgprs * AccStride, AccStride)); -#if 0 - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave - // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, - I1, - MSubGroup, - Number{}, - I1, - NThreadPerSubGroup, - MAccVgprs)); -#endif } template @@ -312,36 +302,26 @@ struct BlockwiseGemmWMMA // basic intrinsic to determine loopover direction if constexpr(MRepeat < NRepeat) { - - static_for<0, MRepeat, 1>{}([&](auto m0) { - - - static_for<0, NRepeat, 1>{}([&](auto n0) { - static_for<0, KPerBlock / WmmaK, 1>{}( + static_for<0, KPerBlock / WmmaK, 1>{}( [&](auto k) { // k=0,1,2 instead of 
k=0,kpack*1, ... - // read A + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, - m0, - I0, - I0, - I0), + make_tuple(Number{}, m0, I0, I0, I0, I0), a_block_buf, a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0), + make_tuple(I0, m0, I0, I0, I0, I0), a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { // read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, - n0, - I0, - I0, - I0), + make_tuple(Number{}, n0, I0, I0, I0, I0), b_block_buf, b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0), + make_tuple(I0, n0, I0, I0, I0, I0), b_thread_buf); vector_type a_thread_vec; @@ -350,12 +330,100 @@ struct BlockwiseGemmWMMA static_for<0, WmmaK, 1>{}([&](auto i) { a_thread_vec.template AsType()(i) = a_thread_buf[Number{}]; + make_tuple(i / A_K1 / A_KRow, + m0, + 0, + (i / A_K1) % A_KRow, + 0, + i % A_K1))>{}]; b_thread_vec.template AsType()(i) = b_thread_buf[Number{}]; + make_tuple(i / B_K1 / B_KRow, + n0, + 0, + (i / B_K1) % B_KRow, + 0, + i % B_K1))>{}]; }); +#if 0 + if (get_thread_local_1d_id() == 0){ + printf("repeat: m,n,k:(%02d, %02d, %02d) a_thread_buf: %04x %04x %04x %04x %04x %04x %04x %04x | %04x %04x %04x %04x %04x %04x %04x %04x\n", + m0.value, n0.value, k.value, + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))), + 
*(reinterpret_cast(&(a_thread_buf[Number{}]))), + *(reinterpret_cast(&(a_thread_buf[Number{}]))) + ); + } + + // if (get_thread_local_1d_id() == 0){ + // printf("repeat: m,n,k:(%02d, %02d, %02d) b_thread_buf: %04x %04x %04x %04x %04x %04x %04x %04x | %04x %04x %04x %04x %04x %04x %04x %04x\n", + // m0.value, n0.value, k.value, + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))), + // *(reinterpret_cast(&(b_thread_buf[Number{}]))) + // ); + // } +#endif + using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; @@ -372,36 +440,26 @@ struct BlockwiseGemmWMMA } else { - - static_for<0, NRepeat, 1>{}([&](auto n0) { - - static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, KPerBlock / WmmaK, 1>{}( - [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... - // read B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, - n0, - I0, - I0, - I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0), - b_thread_buf); + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KPerBlock / WmmaK, 1>{}( + [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
+ // read B + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + b_thread_buf); // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, - m0, - I0, - I0, - I0), + make_tuple(Number{}, m0, I0, I0, I0, I0), a_block_buf, a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0), + make_tuple(I0, m0, I0, I0, I0, I0), a_thread_buf); vector_type a_thread_vec; @@ -410,10 +468,20 @@ struct BlockwiseGemmWMMA static_for<0, WmmaK, 1>{}([&](auto i) { b_thread_vec.template AsType()(i) = b_thread_buf[Number{}]; + make_tuple(i / B_K1 / B_KRow, + n0, + 0, + (i / B_K1) % B_KRow, + 0, + i % B_K1))>{}]; a_thread_vec.template AsType()(i) = a_thread_buf[Number{}]; + make_tuple(i / A_K1 / A_KRow, + m0, + 0, + (i / A_K1) % A_KRow, + 0, + i % A_K1))>{}]; }); using wmma_input_type_a = typename vector_type::type; @@ -427,20 +495,39 @@ struct BlockwiseGemmWMMA b_thread_vec.template AsType()(Number<0>{}), c_thread_buf.GetVectorTypeReference(Number{})); }); - }); }); + }); } } protected: - static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, Number{}), - make_tuple(Number{}, Number{}, Number{}, Number{}, Number<1>{})); - - // B[K0, N0, N1, N2, K1] - static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, Number{}), - make_tuple(Number{}, Number{}, Number{}, Number{}, Number<1>{})); + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number<1>{})); + + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number<1>{})); // C[M, N, 
NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( @@ -452,15 +539,16 @@ struct BlockwiseGemmWMMA template <> struct AThreadCopySelector { - using type = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - A_K1, - A_K1>; + using type = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + A_K1, + A_K1>; }; template <> @@ -472,9 +560,9 @@ struct BlockwiseGemmWMMA decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), tensor_operation::element_wise::PassThrough, - Sequence<1, 1, 1, 1, A_K1>, - Sequence<0, 1, 2, 3, 4>, - 4, + Sequence, + Sequence<0, 1, 2, 3, 4, 5>, + 5, A_K1, 0x76543210, 0xfedcba98, @@ -487,15 +575,16 @@ struct BlockwiseGemmWMMA template <> struct BThreadCopySelector { - using type = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - B_K1, - B_K1>; + using type = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B_K1, + B_K1>; }; template <> @@ -507,9 +596,9 @@ struct BlockwiseGemmWMMA decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), tensor_operation::element_wise::PassThrough, - Sequence<1, 1, 1, 1, B_K1>, - Sequence<0, 1, 2, 3, 4>, - 4, + Sequence, + Sequence<0, 1, 2, 3, 4, 5>, + 5, B_K1, 0x76543210, 0xfedcba98, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 98d3d168f4d..4c28c72a5f0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -80,6 +80,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{}; static constexpr auto I4 = Number<4>{}; static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; @@ -136,18 +137,21 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma 
- A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + A_KWmma, Number{}, Number{}, K1Number)), make_unmerge_transform( make_tuple(M0 * MRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -187,18 +191,21 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + B_KWmma, Number{}, Number{}, K1Number)), make_unmerge_transform( make_tuple(N0 * NRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -372,7 +379,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm CThreadBuffer& c_thread_buf, index_t num_loop) { -#if 0 - constexpr auto a_block_origin_idx = generate_sequence_v2( - []() constexpr { - return Number<0>{}; - }, - Number{}); -#endif - - constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0); auto a_block_buf_switch = a_block_buf; // preload data into LDS @@ -404,7 +396,7 @@ struct GridwiseGemmPipeline_v1<1, true, false> CThreadBuffer& c_thread_buf, index_t num_loop) { - constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0); auto b_block_buf_switch = b_block_buf; // preload data into LDS diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 1fb2e153c71..0e5d77d32f8 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -172,10 +172,23 @@ struct GridwiseGemm_Wmma else { constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, K1), - make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + Number{} * K1, + K1, + K1, + K1, + I1)); } }(); @@ -206,10 +219,23 @@ struct GridwiseGemm_Wmma else { constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - // KWmma->NRepeat->NWave->NRow->NPerWmma->K1 Per Thread + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, K1), - make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + Number{} * K1, + K1, + K1, + K1, + I1)); } }(); @@ -229,7 +255,7 @@ struct GridwiseGemm_Wmma { constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -249,7 +275,7 @@ struct GridwiseGemm_Wmma { constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -264,23 +290,26 @@ struct GridwiseGemm_Wmma constexpr auto a_wave_desc = [&]() { if constexpr(AEnableLds) { - // AK0_M_AK1 -> 
AK0_MRepeat_Mwaves_MPerWmma_AK1 - constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); - constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1 + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + constexpr auto A_KRow = I1; return transform_tensor_descriptor( ABlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // KWmma_MRepeat_MWave_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 - constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); - constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = ABlockDesc_{}.GetLength(I3); + constexpr auto A_KRow = ABlockDesc_{}.GetLength(I4); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I6); // Err: merge transform cause non-constexpr issue @@ -301,26 +330,12 @@ struct GridwiseGemm_Wmma // Sequence<4>{})); // Workaround, Freeze transform - return transform_tensor_descriptor( - ABlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + return 
make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -334,44 +349,33 @@ struct GridwiseGemm_Wmma if constexpr(BEnableLds) { // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 - constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + constexpr auto B_KRow = I1; return transform_tensor_descriptor( BBlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // KWmma_NRepeat_NWave_KRow_NPerWmma_K1 -> K0_NRepeat_Nwaves_NPerWmma_K1 - constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = BBlockDesc_{}.GetLength(I5); + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = BBlockDesc_{}.GetLength(I3); + constexpr auto B_KRow = BBlockDesc_{}.GetLength(I4); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I6); // Workaround, Freeze transform - return transform_tensor_descriptor( - BBlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + 
return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -415,9 +419,9 @@ struct GridwiseGemm_Wmma else { return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * - a_grid_desc.GetLength(I4), + a_grid_desc.GetLength(I5), a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * - a_grid_desc.GetLength(I5)); + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6)); } }; @@ -430,9 +434,9 @@ struct GridwiseGemm_Wmma else { return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * - b_grid_desc.GetLength(I4), + b_grid_desc.GetLength(I5), b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * - b_grid_desc.GetLength(I5)); + b_grid_desc.GetLength(I4) * b_grid_desc.GetLength(I6)); } }; @@ -599,7 +603,8 @@ struct GridwiseGemm_Wmma return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); } else{ - return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) + * a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6); } }(); @@ -652,6 +657,7 @@ struct GridwiseGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); @@ -664,11 +670,12 @@ struct GridwiseGemm_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, ABlockTransferSrcScalarPerVector, AThreadTransferSrcResetCoordinateAfterRun, true>( @@ -676,6 +683,7 @@ struct GridwiseGemm_Wmma make_multi_index(0, m_block_data_idx_on_grid/(MWaves * MPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); @@ -729,6 +737,7 @@ struct GridwiseGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> 
NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; auto b_block_buf = make_static_buffer( b_block_desc.GetElementSpaceSize()); @@ -741,11 +750,12 @@ struct GridwiseGemm_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, BBlockTransferSrcScalarPerVector, BThreadTransferSrcResetCoordinateAfterRun, true>( @@ -753,6 +763,7 @@ struct GridwiseGemm_Wmma make_multi_index(0, n_block_data_idx_on_grid/(NWaves * NPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 82ff101dd77..a04a162bfa7 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1387,7 +1387,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow // copy data from src_buf into dst_vector static_for<0, DstScalarPerVector, 1>{}([&](auto i) { - // src_desc error, non constexpr? 
+ // src_desc error, non constexpr, caused by merge transform constexpr index_t src_offset = src_desc.CalculateOffset( src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); @@ -1396,8 +1396,6 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow SrcData v_this_row, v_theother_row; // int type temp value due to intrinsic requirement - // TODO: This temp value will generate the scratch memory if - // IntraRowSwizzlePerm is flase int temp = 0; // apply element-wise operation @@ -1419,7 +1417,10 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow 1, 0); v_theother_row = type_convert_sp(temp); - + // if (get_thread_local_1d_id() == 0){ + // printf("src_offset:%d, dst_offset for this row: %d, dst_offset + // for the other row: %d \n", + // src_offset, dst_offset, dst_offset+DstScalarPerVector);} if(get_thread_local_1d_id() % 32 < 16) { // apply type convert From 3ccfb0aecab7d7801e768bb3f04e3063becf2224 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 19 May 2023 06:45:08 +0000 Subject: [PATCH 080/118] (2/5) bilinear gemm pass, perf bug: skip a lds has lower performance than skip b lds --- .../gemm_bilinear_wmma_fp16.cpp | 18 +- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 86 ++++------ .../gpu/device/impl/device_gemm_wmma.hpp | 48 +----- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 162 ++++++++++-------- .../gpu/grid/gridwise_gemm_wmma.hpp | 2 +- 5 files changed, 134 insertions(+), 182 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 1f3fdcb2ebc..66c72ee20f7 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -80,33 +80,33 @@ using DeviceOpInstance = BElementOp, CDEElementOp, GemmSpec, - 2, + 1, 128, 64, - 128, 64, - 8, + 64, + 4, 16, 16, - 2, + 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 8, - 8, + 4, + 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 8, - 8, + 
4, + 4, true, 1, 1, - S<1, 32, 1, 4>, + S<1, 64, 1, 2>, 8>; int main(int argc, char* argv[]) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 301c7d83b40..33e880078c8 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -87,6 +87,7 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{}; static constexpr auto I4 = Number<4>{}; static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; @@ -98,8 +99,8 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD 1); static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); @@ -144,18 +145,21 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + A_KWmma, Number{}, Number{}, K1Number)), make_unmerge_transform( make_tuple(M0 * MRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -195,18 +199,21 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + B_KWmma, Number{}, Number{}, K1Number)), 
make_unmerge_transform( make_tuple(N0 * NRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -438,14 +445,11 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD, - true>; // Last Option is W/O + has_main_k_block_loop>; // Last Option is W/O - ave_time = + return launch_and_time_kernel(stream_config, kernel, dim3(grid_size), @@ -482,48 +486,16 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD{}); } else { - const auto kernel = kernel_gemm_mupltipe_d_wmma_cshuffle< - GridwiseOp, - ADataType, - BDataType, - typename GridwiseOp::DsGridPointer, - EDataType, - remove_reference_t, - remove_reference_t, - remove_reference_t< - typename GridwiseOp::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, - remove_reference_t< - typename GridwiseOp::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, - AElementwiseOperation, - BElementwiseOperation, - CDEElementwiseOperation, - remove_reference_t, - false>; - - ave_time = - launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_ds_grid_, - arg.p_e_grid_, - arg.a_grid_desc, - arg.b_grid_desc, - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, - arg.e_grid_desc_mblock_mperblock_nblock_nperblock, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.block_2_ctile_map_); + return launch_kernel(integral_constant{}); } - - return ave_time; } // polymorphic diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 4c28c72a5f0..e5713877eb9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -382,11 +382,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, - 
true>; // Last Option is W/O + has_main_k_block_loop>; - ave_time = launch_and_time_kernel(stream_config, + return launch_and_time_kernel(stream_config, kernel, dim3(grid_size), dim3(BlockSize), @@ -417,42 +413,16 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm{}); } else { - const auto kernel = kernel_gemm_wmma< - GridwiseGemm, - ADataType, - BDataType, - CDataType, - remove_reference_t, - remove_reference_t, - remove_reference_t< - typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, - AElementwiseOperation, - BElementwiseOperation, - CElementwiseOperation, - remove_reference_t, - false>; - - ave_time = launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_, - arg.b_grid_desc_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.block_2_ctile_map_); + return launch_kernel(integral_constant{}); } - - return ave_time; } // polymorphic diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index af70d4e9916..625dadc71bb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -379,10 +379,23 @@ struct GridwiseGemmMultipleD_Wmma else { constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, K1), - make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + 
Number{} * K1, + K1, + K1, + K1, + I1)); } }(); @@ -413,10 +426,23 @@ struct GridwiseGemmMultipleD_Wmma else { constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - // KWmma->NRepeat->NWave->NRow->NPerWmma->K1 Per Thread + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, K1), - make_tuple(Number{} * K1, K1, K1, K1, K1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + Number{} * K1, + K1, + K1, + K1, + I1)); } }(); @@ -436,7 +462,7 @@ struct GridwiseGemmMultipleD_Wmma { constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -456,7 +482,7 @@ struct GridwiseGemmMultipleD_Wmma { constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -471,45 +497,33 @@ struct GridwiseGemmMultipleD_Wmma constexpr auto a_wave_desc = [&]() { if constexpr(AEnableLds) { - // AK0_M_AK1 -> AK0_MRepeat_Mwaves_MPerWmma_AK1 - constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); - constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1 + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + constexpr auto A_KRow = I1; return transform_tensor_descriptor( ABlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + 
make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // KWmma_MRepeat_MWave_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 - constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); - constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); - - // Workaround, Freeze transform - return transform_tensor_descriptor( - ABlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = ABlockDesc_{}.GetLength(I3); + constexpr auto A_KRow = ABlockDesc_{}.GetLength(I4); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I6); + + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -523,44 +537,33 @@ struct GridwiseGemmMultipleD_Wmma if constexpr(BEnableLds) { // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 - constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + constexpr auto B_KRow = I1; return transform_tensor_descriptor( BBlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, 
Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // KWmma_NRepeat_NWave_KRow_NPerWmma_K1 -> K0_NRepeat_Nwaves_NPerWmma_K1 - constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = BBlockDesc_{}.GetLength(I5); + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = BBlockDesc_{}.GetLength(I3); + constexpr auto B_KRow = BBlockDesc_{}.GetLength(I4); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I6); // Workaround, Freeze transform - return transform_tensor_descriptor( - BBlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -620,9 +623,9 @@ struct GridwiseGemmMultipleD_Wmma else { return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * - a_grid_desc.GetLength(I4), + a_grid_desc.GetLength(I5), a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * - a_grid_desc.GetLength(I5)); + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6)); } }; @@ -635,9 +638,9 @@ struct GridwiseGemmMultipleD_Wmma else { return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * - b_grid_desc.GetLength(I4), + b_grid_desc.GetLength(I5), b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * - b_grid_desc.GetLength(I5)); + b_grid_desc.GetLength(I4) * b_grid_desc.GetLength(I6)); } }; @@ -837,7 +840,8 @@ struct GridwiseGemmMultipleD_Wmma return 
a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); } else{ - return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6); } }(); @@ -888,8 +892,9 @@ struct GridwiseGemmMultipleD_Wmma else { // Thread-wise copy - // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 + // KPerBlock/WmmaK -> MRepeat -> MWaves -> K0PerWmma -> KRow -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); @@ -902,11 +907,12 @@ struct GridwiseGemmMultipleD_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, ABlockTransferSrcScalarPerVector, AThreadTransferSrcResetCoordinateAfterRun, true>( @@ -914,6 +920,7 @@ struct GridwiseGemmMultipleD_Wmma make_multi_index(0, m_block_data_idx_on_grid/(MWaves * MPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); @@ -967,7 +974,8 @@ struct GridwiseGemmMultipleD_Wmma // Thread-wise copy // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - auto b_block_buf = make_static_buffer( + constexpr auto K0PerWmma = WmmaK/2/K1Value; + auto b_block_buf = make_static_buffer( b_block_desc.GetElementSpaceSize()); // Limitation: NumDim of Src and Dst descriptor should be identical @@ -979,11 +987,12 @@ struct GridwiseGemmMultipleD_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, BBlockTransferSrcScalarPerVector, BThreadTransferSrcResetCoordinateAfterRun, true>( @@ -991,6 +1000,7 @@ struct GridwiseGemmMultipleD_Wmma make_multi_index(0, n_block_data_idx_on_grid/(NWaves * 
NPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 0e5d77d32f8..8972718f39c 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -655,7 +655,7 @@ struct GridwiseGemm_Wmma else { // Thread-wise copy - // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 + // KPerBlock/WmmaK -> MRepeat -> MWaves -> K0PerWmma -> KRow -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; constexpr auto K0PerWmma = WmmaK/2/K1Value; auto a_block_buf = make_static_buffer( From 12a4ea69d501146b2d775173040e0a7ca70964eb Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 19 May 2023 07:26:53 +0000 Subject: [PATCH 081/118] (3/5) batched gemm pass, perf bug: skip a lds has lower performance than skip b lds --- .../batched_gemm_bias_e_permute_wmma_fp16.cpp | 20 ++--- .../gpu/block/blockwise_gemm_wmma.hpp | 80 +------------------ ...d_contraction_multiple_d_wmma_cshuffle.hpp | 40 ++++++---- 3 files changed, 36 insertions(+), 104 deletions(-) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp index a9dceed3175..1ad26255f6d 100644 --- a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_wmma_fp16.cpp @@ -67,33 +67,33 @@ using DeviceOpInstanceKKNN = ASpec, BSpec, DESpec, - 2, + 1, 128, 64, - 128, - 32, - 8, + 64, + 64, + 4, 16, 16, - 2, + 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 8, - 8, + 4, + 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, - 8, - 8, + 4, + 4, true, 1, 1, - S<1, 32, 1, 4>, + S<1, 64, 1, 2>, 8>; using DeviceOpInstance = 
DeviceOpInstanceKKNN; diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index b0afa851142..d0462929f42 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -305,7 +305,7 @@ struct BlockwiseGemmWMMA static_for<0, KPerBlock / WmmaK, 1>{}( [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A + // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, make_tuple(Number{}, m0, I0, I0, I0, I0), @@ -346,84 +346,6 @@ struct BlockwiseGemmWMMA i % B_K1))>{}]; }); -#if 0 - if (get_thread_local_1d_id() == 0){ - printf("repeat: m,n,k:(%02d, %02d, %02d) a_thread_buf: %04x %04x %04x %04x %04x %04x %04x %04x | %04x %04x %04x %04x %04x %04x %04x %04x\n", - m0.value, n0.value, k.value, - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))), - *(reinterpret_cast(&(a_thread_buf[Number{}]))) - ); - } - - // if (get_thread_local_1d_id() == 0){ - // printf("repeat: m,n,k:(%02d, %02d, %02d) b_thread_buf: %04x %04x %04x %04x %04x %04x %04x %04x | %04x %04x %04x %04x %04x %04x %04x %04x\n", - // m0.value, n0.value, k.value, - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), 
- // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))), - // *(reinterpret_cast(&(b_thread_buf[Number{}]))) - // ); - // } -#endif - using wmma_input_type_a = typename vector_type::type; using wmma_input_type_b = typename vector_type::type; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 387b7aa0684..979880ef809 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -125,6 +125,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto I3 = Number<3>{}; static constexpr auto I4 = Number<4>{}; static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; @@ -136,9 +137,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto BEnableLds_auto = MWaves == 1 ? 
false : true; // If true, LDS is used unconditionally - // Bug, MNK vector load check not implemented correctly - static constexpr auto AEnableLds_manu = true; - static constexpr auto BEnableLds_manu = true; + static constexpr auto AEnableLds_manu = false; + static constexpr auto BEnableLds_manu = false; static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); @@ -220,18 +220,21 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle } else { - constexpr auto A_KRow = WmmaK / K1; - const auto A_KWmma = K / WmmaK; + constexpr auto A_KRow = 2; + constexpr auto A_K0PerWmma = WmmaK / A_KRow / K1Number; + const auto A_KWmma = K / WmmaK; const auto M0 = M / MPerBlock; - + // 0 1 0 1 2 3 4 5 6 + // M - K <-> A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + A_KWmma, Number{}, Number{}, K1Number)), make_unmerge_transform( make_tuple(M0 * MRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -309,18 +312,21 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle } else { - constexpr auto B_KRow = WmmaK / K1; - const auto B_KWmma = K / WmmaK; + constexpr auto B_KRow = 2; + constexpr auto B_K0PerWmma = WmmaK / B_KRow / K1Number; + const auto B_KWmma = K / WmmaK; const auto N0 = N / NPerBlock; - + // 0 1 0 1 2 3 4 5 6 + // M - K <-> A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( b_grid_desc_n_k, - make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + B_KWmma, Number{}, 
Number{}, K1Number)), make_unmerge_transform( make_tuple(N0 * NRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -752,7 +758,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle else { return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I3) * - arg.a_grid_desc_.GetLength(I5); + arg.a_grid_desc_.GetLength(I4) * arg.a_grid_desc_.GetLength(I6); } }(); @@ -1036,7 +1042,11 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle << MRepeat << ", " << NRepeat << ">" - << " NumPrefetch: " + << " AEnableLds: " + << AEnableLds << ", " + << "BEnableLds: " + << BEnableLds << ", " + << "NumPrefetch: " << NumPrefetch << ", " << "LoopScheduler: " << LoopSchedToString[LoopSched] << ", " From fd4ff3a762c9871e5c480dde6a396c5461c801aa Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 19 May 2023 07:46:01 +0000 Subject: [PATCH 082/118] (4/5) grouped conv pass --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 22 +++++----- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 43 +++++++++++++------ 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 84add520b61..78852ea391b 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -52,32 +52,32 @@ using DeviceConvFwdInstance = ConvSpec, // ConvForwardSpecialization GemmSpec, // GemmSpecialization 1, // Prefetch stage - 256, // BlockSize - 128, // MPerBlock - 128, // NPerBlock - 32, // KPerBlock - 8, // K1 + 128, // BlockSize + 64, // MPerBlock + 64, // NPerBlock + 64, // KPerBlock + 4, // K1 16, // MPerWMMA 16, // NPerWMMA 4, // 
MRepeat - 2, // NRepeat - S<4, 8, 8>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + 1, // NRepeat + S<4, 8, 4>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder 2, // ABlockTransferSrcVectorDim 1, // ABlockTransferSrcScalarPerVector 1, // ABlockTransferDstScalarPerVector_AK1 true, // ABlockLdsExtraM - S<4, 8, 8>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<4, 8, 4>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder 2, // BBlockTransferSrcVectorDim 1, // BBlockTransferSrcScalarPerVector 1, // BBlockTransferDstScalarPerVector_BK1 true, // BBlockLdsExtraN - 4, - 2, - S<1, 32, 1, 8>, + 1, + 1, + S<1, 16, 1, 8>, 8>; template diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 435d58dd0ad..8a026744b03 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -163,6 +163,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle static constexpr auto I3 = Number<3>{}; static constexpr auto I4 = Number<4>{}; static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; // K1 = Max Vector Access Pixels static constexpr auto K1Number = Number{}; @@ -232,18 +233,21 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } else { - constexpr auto A_KRow = WmmaK / K1; - const auto A_KWmma = K / WmmaK; + constexpr auto A_KRow = 2; + constexpr auto A_K0PerWmma = WmmaK / A_KRow / K1Number; + const auto A_KWmma = K / WmmaK; const auto M0 = M / MPerBlock; - + // 0 1 0 1 2 3 4 5 6 + // M - K <-> A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow 
- MPerWmma - A_K1 return transform_tensor_descriptor( in_gemmm_gemmk_desc, - make_tuple(make_unmerge_transform(make_tuple(A_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + A_KWmma, Number{}, Number{}, K1Number)), make_unmerge_transform( make_tuple(M0 * MRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -275,18 +279,21 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } else { - constexpr auto B_KRow = WmmaK / K1; - const auto B_KWmma = K / WmmaK; + constexpr auto B_KRow = 2; + constexpr auto B_K0PerWmma = WmmaK / B_KRow / K1Number; + const auto B_KWmma = K / WmmaK; const auto N0 = N / NPerBlock; - + // 0 1 0 1 2 3 4 5 6 + // M - K <-> A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 return transform_tensor_descriptor( wei_gemmn_gemmk_desc, - make_tuple(make_unmerge_transform(make_tuple(B_KWmma, Number{}, K1Number)), + make_tuple(make_unmerge_transform(make_tuple( + B_KWmma, Number{}, Number{}, K1Number)), make_unmerge_transform( make_tuple(N0 * NRepeat, Number{}, Number{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } } @@ -556,7 +563,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle else { return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I3) * - arg.a_grid_desc_.GetLength(I5); + arg.a_grid_desc_.GetLength(I4) * arg.a_grid_desc_.GetLength(I6); } }(); @@ -884,9 +891,19 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle << KPerBlock << ", " << getConvForwardSpecializationString(ConvForwardSpecialization) << ", " << K1 << ", " + << MPerWmma << ", " + << NPerWmma << ", " + << MRepeat << ", " + << NRepeat + << ">" + << " AEnableLds: " + << AEnableLds << ", " + << "BEnableLds: " + << BEnableLds << ", " + << 
"ABlockTransferSrcScalarPerVector: " << ABlockTransferSrcScalarPerVector << ", " - << BBlockTransferSrcScalarPerVector - << ">"; + << "BBlockTransferSrcScalarPerVector: " + << BBlockTransferSrcScalarPerVector; // clang-format on return str.str(); From bee4e344b09762dc7fbf1ffad96d0f81b918d2eb Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Fri, 19 May 2023 08:38:15 +0000 Subject: [PATCH 083/118] (5/5) attention pass, todo: debug lds perf bug --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 4 +- .../gpu/block/blockwise_gemm_wmma.hpp | 97 ++++--- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 61 +++-- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 37 ++- .../gpu/device/impl/device_gemm_wmma.hpp | 30 +-- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 247 +++++++++--------- .../transform_contraction_to_gemm.hpp | 48 ++-- 7 files changed, 276 insertions(+), 248 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 78852ea391b..08fd3e834fb 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -53,8 +53,8 @@ using DeviceConvFwdInstance = GemmSpec, // GemmSpecialization 1, // Prefetch stage 128, // BlockSize - 64, // MPerBlock - 64, // NPerBlock + 64, // MPerBlock + 64, // NPerBlock 64, // KPerBlock 4, // K1 16, // MPerWMMA diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index d0462929f42..c9488c317e0 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -305,7 +305,7 @@ struct BlockwiseGemmWMMA static_for<0, KPerBlock / WmmaK, 1>{}( [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, 
... static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A + // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, make_tuple(Number{}, m0, I0, I0, I0, I0), @@ -365,58 +365,57 @@ struct BlockwiseGemmWMMA static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, KPerBlock / WmmaK, 1>{}( - [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... - // read B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - b_thread_buf); - // read A - a_thread_copy_.Run( - a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0, I0), - a_thread_buf); - - vector_type a_thread_vec; - vector_type b_thread_vec; + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of + // k=0,kpack*1, ... read B + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + b_thread_buf); + // read A + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0, I0), + a_thread_buf); - static_for<0, WmmaK, 1>{}([&](auto i) { - b_thread_vec.template AsType()(i) = - b_thread_buf[Number{}]; - a_thread_vec.template AsType()(i) = - a_thread_buf[Number{}]; - }); + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; - constexpr index_t c_offset = - 
c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - }); + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); }); }); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 082e7185d86..1965298a996 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -136,6 +136,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto I3 = Number<3>{}; static constexpr auto I4 = Number<4>{}; static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; static constexpr auto WmmaK = 16; @@ -175,13 +176,15 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } else { - return Transform::MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AKRow_MPerWmma_AK1( - Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), - Number{}, - Number{}, - Number{}, - Number{}, - Number{}); + return Transform:: + MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AK0PerWmma_AKRow_MPerWmma_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, + a_gs_ms_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); } } @@ -197,14 +200,15 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } else { - return 
Transform::MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BKRow_LPerWmma_BK1( - Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, - b0_gs_ls_ks_strides_vec), - Number{}, - Number{}, - Number{}, - Number{}, - Number{}); + return Transform:: + MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BK0PerWmma_BKRow_LPerWmma_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); } } @@ -220,14 +224,15 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle } else { - return Transform::MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves_BLRow_NPerWmma_BL1( - Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, - b1_gs_ns_ls_strides_vec), - Number{}, - Number{}, - Number{}, - Number{}, - Number{}); + return Transform:: + MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves__BL0PerWmma_BLRow_NPerWmma_BL1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); } } @@ -521,7 +526,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle else { return arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I3) * - arg.a_grid_desc.GetLength(I5); + arg.a_grid_desc.GetLength(I4) * arg.a_grid_desc.GetLength(I6); } }(); @@ -826,7 +831,13 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle << "CSpec" << getTensorSpecializationString(CSpec) << ", " << getMaskingSpecializationString(MaskingSpec) << ">" - << " NumPrefetch: " + << " AEnableLds: " + << AEnableLds << ", " + << "B0EnableLds: " + << B0EnableLds << ", " + << "B1EnableLds: " + << B1EnableLds << ", " + << "NumPrefetch: " << NumPrefetch << ", " << "LoopScheduler: " << LoopSchedToString[LoopSched] << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 
33e880078c8..19f35f7a561 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -468,26 +468,25 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD, has_main_k_block_loop>; // Last Option is W/O - return - launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_ds_grid_, - arg.p_e_grid_, - arg.a_grid_desc, - arg.b_grid_desc, - arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, - arg.e_grid_desc_mblock_mperblock_nblock_nperblock, - arg.a_element_op_, - arg.b_element_op_, - arg.cde_element_op_, - arg.block_2_ctile_map_); + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_grid_desc, + arg.b_grid_desc, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.block_2_ctile_map_); }; - + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) { return launch_kernel(integral_constant{}); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index e5713877eb9..f4d5f3833a3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -398,21 +398,21 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm, has_main_k_block_loop>; - return launch_and_time_kernel(stream_config, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - arg.p_a_grid_, - arg.p_b_grid_, - arg.p_c_grid_, - arg.a_grid_desc_, - arg.b_grid_desc_k0_n_k1_, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock, - arg.a_element_op_, - arg.b_element_op_, - 
arg.c_element_op_, - arg.block_2_ctile_map_); + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); }; if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index 51009bb36ca..c0f8ca45105 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -243,10 +243,23 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma else { constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - // KWmma->MRepeat->MWave->KRow->MPerWmma->K1 Per Thread + constexpr auto K0PerWmma = WmmaK / 2 / AK1; + // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, AK1), - make_tuple(Number{} * AK1, AK1, AK1, AK1, AK1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + AK1), + make_tuple(Number{} * Number{} * AK1, + Number{} * AK1, + Number{} * AK1, + AK1, + AK1, + AK1, + I1)); } }(); @@ -277,10 +290,23 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma else { constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - // KWmma->NRepeat->NWave->NRow->NPerWmma->BK1 Per Thread + constexpr auto K0PerWmma = WmmaK / 2 / BK1; + // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, BK1), - make_tuple(Number{} * BK1, BK1, BK1, BK1, BK1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + BK1), 
+ make_tuple(Number{} * Number{} * BK1, + Number{} * BK1, + Number{} * BK1, + BK1, + BK1, + BK1, + I1)); } }(); @@ -310,10 +336,23 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma else { constexpr auto LWmmaPerblock = LPerBlock / WmmaL; - // LWmma->NRepeat->NWave->NRow->LPerWmma->BL1 Per Thread + constexpr auto L0PerWmma = WmmaL / 2 / BL1; + // LWmma->NRepeat->MWave->L0PerWmma->LRow->MPerWmma->L1 Per Thread return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1, I1, I1, BL1), - make_tuple(Number{} * BL1, BL1, BL1, BL1, BL1, I1)); + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + BL1), + make_tuple(Number{} * Number{} * BL1, + Number{} * BL1, + Number{} * BL1, + BL1, + BL1, + BL1, + I1)); } }(); @@ -333,7 +372,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma { constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -353,7 +392,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma { constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -371,7 +410,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma { constexpr auto LWmmaPerBlock = LTilePerBlock / WmmaL; - return make_multi_index(LWmmaPerBlock, 0, 0, 0, 0, 0); + return make_multi_index(LWmmaPerBlock, 0, 0, 0, 0, 0, 0); } }(); @@ -387,44 +426,32 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma if constexpr(AEnableLds) { // AK0_M_AK1 -> AK0_MRepeat_Mwaves_MPerWmma_AK1 - constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); - constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + constexpr auto A_KRow = I1; return transform_tensor_descriptor( ABlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, 
A_KRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // KWmma_MRepeat_MWave_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 - constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); - constexpr auto A_K1 = ABlockDesc_{}.GetLength(I5); - - // Workaround, Freeze transform - return transform_tensor_descriptor( - ABlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = ABlockDesc_{}.GetLength(I3); + constexpr auto A_KRow = ABlockDesc_{}.GetLength(I4); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I6); + + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -439,44 +466,33 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma if constexpr(B0EnableLds) { // BK0_L_BK1 -> BK0_LRepeat_Lwaves_LPerWmma_BK1 - constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2); + constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2); + constexpr auto B_KRow = I1; return transform_tensor_descriptor( B0BlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + 
make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // KWmma_LRepeat_LWave_KRow_LPerWmma_K1 -> K0_LRepeat_Lwaves_LPerWmma_K1 - constexpr auto KWmma = B0BlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I5); + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = B0BlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = B0BlockDesc_{}.GetLength(I3); + constexpr auto B_KRow = B0BlockDesc_{}.GetLength(I4); + constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I6); // Workaround, Freeze transform - return transform_tensor_descriptor( - B0BlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -489,14 +505,14 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma { constexpr index_t A_L0 = A1BlockDesc_AL0_M_AL1{}.GetLength(I0); constexpr index_t A_L1 = A1BlockDesc_AL0_M_AL1{}.GetLength(I2); - + constexpr auto A_LRow = I1; return transform_tensor_descriptor( A1BlockDesc_AL0_M_AL1{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, A_LRow)), make_unmerge_transform(make_tuple(Number{}, I1, I1)), 
make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } template @@ -507,44 +523,31 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma if constexpr(B1EnableLds) { // BL0_N_BL1 -> BL0_NRepeat_Nwaves_NPerWmma_BL1 - constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0); - constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I2); + constexpr auto B_L0 = B1BlockDesc_{}.GetLength(I0); + constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I2); + constexpr auto B_LRow = I1; return transform_tensor_descriptor( B1BlockDesc_{}, - make_tuple(make_pass_through_transform(Number{}), + make_tuple(make_unmerge_transform(make_tuple(Number{}, B_LRow)), make_unmerge_transform(make_tuple( Number{}, Number{}, Number{})), make_pass_through_transform(Number{})), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); } else { - // LWmma_NRepeat_NWave_LRow_NPerWmma_L1 -> L0_NRepeat_Nwaves_NPerWmma_L1 - constexpr auto LWmma = B1BlockDesc_{}.GetLength(I0); - constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I5); - - // Workaround, Freeze transform - return transform_tensor_descriptor( - B1BlockDesc_{}, - make_tuple(make_freeze_transform(I0), - make_pass_through_transform(Number{}), - make_pass_through_transform(Number{}), - make_pass_through_transform(I1), - make_pass_through_transform(I1), - make_pass_through_transform(Number{})), - make_tuple(Sequence<3>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<4>{}, - Sequence<5>{}), - make_tuple(Sequence<>{}, - Sequence<0>{}, - Sequence<1>{}, - Sequence<2>{}, - Sequence<3>{}, - Sequence<4>{})); + constexpr auto LWmma = B1BlockDesc_{}.GetLength(I0); + constexpr auto L0PerWmma = B1BlockDesc_{}.GetLength(I3); + constexpr auto B_LRow = 
B1BlockDesc_{}.GetLength(I4); + constexpr auto B_L1 = B1BlockDesc_{}.GetLength(I6); + + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); } }(); @@ -610,9 +613,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma else { return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * - a_grid_desc.GetLength(I4), + a_grid_desc.GetLength(I5), a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * - a_grid_desc.GetLength(I5)); + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6)); } }; @@ -625,9 +628,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma else { return make_tuple(b0_grid_desc.GetLength(I1) * b0_grid_desc.GetLength(I2) * - b0_grid_desc.GetLength(I4), + b0_grid_desc.GetLength(I5), b0_grid_desc.GetLength(I0) * b0_grid_desc.GetLength(I3) * - b0_grid_desc.GetLength(I5)); + b0_grid_desc.GetLength(I4) * b0_grid_desc.GetLength(I6)); } }; @@ -640,9 +643,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma else { return make_tuple(b1_grid_desc.GetLength(I1) * b1_grid_desc.GetLength(I2) * - b1_grid_desc.GetLength(I4), + b1_grid_desc.GetLength(I5), b1_grid_desc.GetLength(I0) * b1_grid_desc.GetLength(I3) * - b1_grid_desc.GetLength(I5)); + b1_grid_desc.GetLength(I4) * b1_grid_desc.GetLength(I6)); } }; @@ -884,6 +887,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); @@ -896,11 +900,12 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, ABlockTransferSrcScalarPerVector, AThreadTransferSrcResetCoordinateAfterRun, true>( @@ -908,6 +913,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma make_multi_index(0, m_block_data_idx_on_grid/(MWaves 
* MPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); @@ -960,6 +966,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> LRepeat -> LWaves -> KRow -> LPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; auto b0_block_buf = make_static_buffer( b0_block_desc.GetElementSpaceSize()); @@ -972,11 +979,12 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, B0BlockTransferSrcScalarPerVector, B0ThreadTransferSrcResetCoordinateAfterRun, true>( @@ -984,6 +992,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma make_multi_index(0, 0/(LWaves * LPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); @@ -1054,7 +1063,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma return make_multi_index(-a_grid_desc.GetLength(I0), 0, 0); } else{ - return make_multi_index(-a_grid_desc.GetLength(I0), 0, 0, 0, 0, 0); + return make_multi_index(-a_grid_desc.GetLength(I0), 0, 0, 0, 0, 0, 0); } }(); @@ -1063,7 +1072,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma return make_multi_index(-b0_grid_desc.GetLength(I0), LPerBlock, 0); } else{ - return make_multi_index(-b0_grid_desc.GetLength(I0), LRepeat, 0, 0, 0, 0); + return make_multi_index(-b0_grid_desc.GetLength(I0), LRepeat, 0, 0, 0, 0, 0); } }(); @@ -1072,7 +1081,8 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); } else{ - return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * a_grid_desc.GetLength(I5); + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6); } }(); @@ -1208,6 +1218,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> 
NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 constexpr auto LWmmaPerBlock = LTilePerBlock / WmmaL; + constexpr auto L0PerWmma = WmmaL/2/L1Value; auto b1_block_buf = make_static_buffer( b1_block_desc.GetElementSpaceSize()); @@ -1220,11 +1231,12 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma Sequence{}, Number{}, I1, + Number{}, I1, I1, Number{}>, - Sequence<0, 1, 2, 3, 4, 5>, - 5, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, B1BlockTransferSrcScalarPerVector, B1ThreadTransferSrcResetCoordinateAfterRun, true>( @@ -1232,6 +1244,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma make_multi_index(0, n_block_data_idx_on_grid/(NWaves * NPerWmma), get_thread_local_1d_id() / 32, + 0, (get_thread_local_1d_id() % 32 )/ 16, get_thread_local_1d_id() % 16, 0)); @@ -1262,7 +1275,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma KPack, false, B1EnableLds, - true>{make_tuple(0, 0, 0, 0, 0)}; + true>{make_tuple(0, 0, 0, 0, 0, 0)}; auto acc1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); @@ -1271,7 +1284,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma return b0_grid_desc.GetLength(I1); } else{ - return b0_grid_desc.GetLength(I1) * b0_grid_desc.GetLength(I2) * b0_grid_desc.GetLength(I4); + return b0_grid_desc.GetLength(I1) * b0_grid_desc.GetLength(I2) * b0_grid_desc.GetLength(I5); } }(); diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp index 3635d3bb2d1..cf9dc8f909c 100644 --- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -186,7 +186,7 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm typename MPerWmma, typename AK1> __host__ __device__ static constexpr auto - MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AKRow_MPerWmma_AK1( + MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AK0PerWmma_AKRow_MPerWmma_AK1( const AGridDesc_M_K& 
a_grid_desc_m_k, const WmmaK&, const MRepeat&, @@ -194,17 +194,19 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm const MPerWmma&, const AK1&) { - const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlock; - const auto K = a_grid_desc_m_k.GetLength(I1); - const auto AKWmma = K / WmmaK{}; - constexpr auto AKRow = WmmaK{} / AK1{}; + const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlock; + const auto K = a_grid_desc_m_k.GetLength(I1); + const auto AKWmma = K / WmmaK{}; + constexpr auto AKRow = 2; + constexpr auto AK0PerWmma = WmmaK{} / AKRow / AK1{}; return transform_tensor_descriptor( a_grid_desc_m_k, - make_tuple(make_unmerge_transform(make_tuple(AKWmma, AKRow, AK1{})), + make_tuple(make_unmerge_transform( + make_tuple(AKWmma, Number{}, Number{}, AK1{})), make_unmerge_transform(make_tuple(M0 * MRepeat{}, MWaves{}, MPerWmma{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } // @@ -254,7 +256,7 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm typename LPerWmma, typename BK1> __host__ __device__ static constexpr auto - MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BKRow_LPerWmma_BK1( + MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BK0PerWmma_BKRow_LPerWmma_BK1( const BGridDesc_L_K& b_grid_desc_l_k, const WmmaK&, const LRepeat&, @@ -262,17 +264,19 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm const LPerWmma&, const BK1&) { - const auto L0 = b_grid_desc_l_k.GetLength(I0) / NPerBlock; - const auto K = b_grid_desc_l_k.GetLength(I1); - const auto BKWmma = K / WmmaK{}; - constexpr auto BKRow = WmmaK{} / BK1{}; + const auto L0 = b_grid_desc_l_k.GetLength(I0) / NPerBlock; + const auto K = b_grid_desc_l_k.GetLength(I1); + const auto BKWmma = K / WmmaK{}; + constexpr auto BKRow = 2; + constexpr auto BK0PerWmma = WmmaK{} / BKRow / BK1{}; return transform_tensor_descriptor( b_grid_desc_l_k, - 
make_tuple(make_unmerge_transform(make_tuple(BKWmma, BKRow, BK1{})), + make_tuple(make_unmerge_transform( + make_tuple(BKWmma, Number{}, Number{}, BK1{})), make_unmerge_transform(make_tuple(L0 * LRepeat{}, LWaves{}, LPerWmma{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } // @@ -323,7 +327,7 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm typename NPerWmma, typename BL1> __host__ __device__ static constexpr auto - MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves_BLRow_NPerWmma_BL1( + MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves__BL0PerWmma_BLRow_NPerWmma_BL1( const BGridDesc_N_L& b_grid_desc_n_l, const WmmaL&, const NRepeat&, @@ -331,17 +335,19 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm const NPerWmma&, const BL1&) { - const auto N0 = b_grid_desc_n_l.GetLength(I0) / OPerBlock; - const auto L = b_grid_desc_n_l.GetLength(I1); - const auto BLWmma = L / WmmaL{}; - constexpr auto BLRow = WmmaL{} / BL1{}; + const auto N0 = b_grid_desc_n_l.GetLength(I0) / OPerBlock; + const auto L = b_grid_desc_n_l.GetLength(I1); + const auto BLWmma = L / WmmaL{}; + constexpr auto BLRow = 2; + constexpr auto BL0PerWmma = WmmaL{} / BLRow / BL1{}; return transform_tensor_descriptor( b_grid_desc_n_l, - make_tuple(make_unmerge_transform(make_tuple(BLWmma, BLRow, BL1{})), + make_tuple(make_unmerge_transform( + make_tuple(BLWmma, Number{}, Number{}, BL1{})), make_unmerge_transform(make_tuple(N0 * NRepeat{}, NWaves{}, NPerWmma{}))), make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 5>{}, Sequence<1, 2, 4>{})); + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); } // From efee4541d9d77ffb7e91f7b3dc9c0cbe2d41c508 Mon Sep 17 00:00:00 2001 From: Haocong WANG Date: Tue, 13 Jun 2023 14:52:33 +0800 Subject: [PATCH 084/118] AIT Attention API refactor (#8) * sanity pass * sanity pass 2 * confirm significant 
performance regression. * turn on all instances * turn off instance format * Fix bug & tunning & format * DML meta, self_attn+cross_attn * sanity pass * remove useless flag * update tile and problem size used in AIT attention * bug fix in grouped conv supporting check --- .../CMakeLists.txt | 2 + ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 265 ++-- .../cross_attention_forward_wmma_fp16.cpp | 332 +++++ ...atched_gemm_scale_softmax_gemm_permute.inc | 296 +++-- .../run_cross_attention.inc | 344 ++++++ .../run_self_attention.inc | 343 ++++++ .../self_attention_forward_wmma_fp16.cpp | 288 +++++ ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 1072 +++++++++++++++-- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 4 +- .../gpu/device/masking_specialization.hpp | 5 +- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 117 +- .../transform_contraction_to_gemm.hpp | 75 +- script/unet_mha.sh | 52 + 13 files changed, 2761 insertions(+), 434 deletions(-) create mode 100644 example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_cross_attention.inc create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_self_attention.inc create mode 100644 example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp create mode 100644 script/unet_mha.sh diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 5aa5692aacf..c6c6fc3209e 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -8,6 +8,8 @@ add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_pe if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 
batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) + add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp) + add_example_executable(example_cross_attention_forward_wmma_fp16 cross_attention_forward_wmma_fp16.cpp) endif() add_custom_target(example_gemm_scale_softmax_gemm) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index aa122b4c9ff..5ccdb227923 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -67,77 +67,200 @@ static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecial static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; -using DeviceGemmInstance = - ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< - NumDimG, - NumDimM, - NumDimN, - NumDimK, - NumDimO, - ADataType, - B0DataType, - B1DataType, - CDataType, - Acc0BiasDataType, - Acc0DataType, - Acc1BiasDataType, - Acc1DataType, - CShuffleDataType, - AElementOp, - B0ElementOp, - Acc0ElementOp, - B1ElementOp, - CElementOp, - GemmSpec, - TensorSpecA, - TensorSpecB0, - TensorSpecB1, - TensorSpecC, - 1, - 256, - // Gemm 0 - 128, // MPerBlock - 64, // LPerBlock - 64, // KPerBlock - 8, // K1 - // Gemm 1 - 64, // NPerBlock - 64, // LTilePerBlock - 8, // L1 - 16, // MPerWMMA - 16, // LPerWMMA - 16, // NPerWMMA - // Per repeat = wave_m = wave_num, wave_n = 1 - 1, // MRepeat - 4, // LRepeat - 4, // NRepeat - 
S<4, 64, 1>, // ABlockTransfer MK -> K0 M K1 - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - true, - S<4, 64, 1>, // B0BlockTransfer LK -> K0 L K1 - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 8, - 8, - true, - S<4, 8, 8>, // B1BlockTransfer NL -> L0 N L1 - S<0, 2, 1>, - S<0, 2, 1>, - 1, - 8, - 1, - false, - 1, // CShuffleMWmmaPerWavePerShuffle - 2, // CShuffleNWmmaPerWavePerShuffle - S<1, 64, 1, 4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock - 8, // CShuffleBlockTransferScalarPerVector_NPerBlock - MaskingSpec>; // MaskingSpecialization - +// clang-format off +// #define CK_MHA_USE_WAVE_1 +// #define CK_MHA_USE_WAVE_2 +#define CK_MHA_USE_WAVE_4 +#define CK_MHA_USE_WAVE_8 +using DeviceMHAFactory = + std::tuple< +#ifdef CK_MHA_USE_WAVE_1 + // 1 wave, mrepeat = 1, nrepeat = 2, k/o repeat = 1~5 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, 
B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_2 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 
1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_4 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 
+ S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_8 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec> +#endif + >; +// clang-format on // Ref Gemm0: fp16 in, fp32 out using 
ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using Acc0DataType = F32; +using Acc1DataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static 
constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +#define CK_MHA_USE_WAVE_1 +#define CK_MHA_USE_WAVE_2 +#define CK_MHA_USE_WAVE_4 +#define CK_MHA_USE_WAVE_8 +using DeviceMHAFactory = + std::tuple< +#ifdef CK_MHA_USE_WAVE_1 + // 1 wave, mrepeat = 1, nrepeat = 2, k/o repeat = 1~5 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 32, 160, 8, 8, + // Gemm 1 + 80, 32, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 2, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 64, 80, 8, 8, + // Gemm 1 + 80, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 64, 48, 8, 8, + // Gemm 1 + 48, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 3, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_2 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 64, 48, 8, 8, + // Gemm 1 + 48, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 3, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // 
CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 64, 80, 8, 8, + // Gemm 1 + 80, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 32, 160, 8, 8, + // Gemm 1 + 80, 32, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 2, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_4 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, 
B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 128, 80, 8, 8, + // Gemm 1 + 80, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 192, 48, 8, 8, + // Gemm 1 + 48, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 12, 3, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // 
Gemm 0 + 64, 64, 48, 8, 8, + // Gemm 1 + 48, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 3, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_8 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 192, 48, 8,4, + // Gemm 1 + 48, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 12, 3, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec> +#endif + >; +// clang-format on +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_cross_attention.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc 
b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 099ea7354b4..2f494a7ab89 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -127,16 +127,31 @@ int run(int argc, char* argv[]) b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; - case 6: // Rand: a b0 ; unit: b1 pass + case 6: // Rand: a b0 ; unit: B1 a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); break; - case 7: // Rand: a b1 ; unit: b0 pass + case 7: // Rand: a b1 ; unit: b0 a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); break; + case 8: // Rand: a ; unit: b0 b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 9: // Rand: b0 ; unit: a b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 10: // Rand: b1 ; unit: a b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); @@ -160,139 +175,166 @@ int run(int argc, char* argv[]) auto c_element_op = CElementOp{}; // do GEMM + float best_perf = .0; + float best_time = .0; + int not_pass = 0; + std::string best_kernel = ""; + 
printf("Verification: %s\n", do_verification ? "ON" : "OFF"); // TODO ANT: replace array with vector? - auto gemm = DeviceGemmInstance{}; - auto invoker = gemm.MakeInvoker(); - auto argument = gemm.MakeArgument( - static_cast(a_device_buf.GetDeviceBuffer()), - static_cast(b0_device_buf.GetDeviceBuffer()), - static_cast(b1_device_buf.GetDeviceBuffer()), - static_cast(c_device_buf.GetDeviceBuffer()), - {}, // std::array p_acc0_biases; - {}, // std::array p_acc1_biases; - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ns_ks_lengths, - b0_gs_ns_ks_strides, - b1_gs_os_ns_lengths, - b1_gs_os_ns_strides, - c_gs_ms_os_lengths, - c_gs_ms_os_strides, - {}, // std::array, 1>{acc0_biases_gs_ms_ns_lengths}, - {}, // std::array, 1>{acc0_biases_gs_ms_ns_strides}, - {}, // std::array, 1>{acc1_biases_gs_ms_os_lengths}, - {}, // std::array, 1>{acc1_biases_gs_ms_os_strides}, - a_element_op, - b0_element_op, - acc0_element_op, - b1_element_op, - c_element_op); - - if(!gemm.IsSupportedArgument(argument)) - { - std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; - - return 0; - } + ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { + const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + + using DeviceMHAInstance = ck::remove_cvref_t; + auto gemm = DeviceMHAInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + G0, + G1, + alpha, + input_permute, + output_permute); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; - ck::index_t BatchCount = G0 * G1; + // return 0; + } - float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + ck::index_t BatchCount = G0 * G1; - std::size_t flop = 
(size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + - sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * - BatchCount; + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - float tflops = static_cast(flop) / 1.E9 / ave_time; + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; - float gb_per_sec = num_btype / 1.E6 / ave_time; + float tflops = static_cast(flop) / 1.E9 / ave_time; - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " - << gemm.GetTypeString() << std::endl; + float gb_per_sec = num_btype / 1.E6 / ave_time; - if(do_verification) - { - c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); - - Tensor a_g_m_k({BatchCount, M, K}); - Tensor b0_g_k_n({BatchCount, K, N}); - Tensor b1_g_n_o({BatchCount, N, O}); - Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 - Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax - Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 - - // permute - a_gs_ms_ks.ForEach([&](auto& self, auto idx) { - a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); - }); - b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { - b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); - }); - b1_gs_os_ns.ForEach([&](auto& self, auto idx) { - b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); - }); - - // gemm 0 - auto ref_gemm0 = ReferenceGemm0Instance{}; - auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); - auto ref_gemm0_argument = ref_gemm0.MakeArgument( - a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); - - ref_gemm0_invoker.Run(ref_gemm0_argument); - - // masking - const 
auto mask = DeviceGemmInstance::C0MatrixMask(N); - acc0_g_m_n.ForEach([&](auto& self, auto idx) { - if(mask.IsMaskedElement(idx[1], idx[2])) - self(idx) = -ck::NumericLimits::Infinity(); - }); - - // softmax - auto ref_softmax = ReferenceSoftmaxInstance{}; - auto ref_softmax_invoker = ref_softmax.MakeInvoker(); - auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); - - ref_softmax_invoker.Run(ref_softmax_argument); - - // gemm1 - auto ref_gemm1 = ReferenceGemm1Instance{}; - auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); - auto ref_gemm1_argument = ref_gemm1.MakeArgument( - a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); - - ref_gemm1_invoker.Run(ref_gemm1_argument); - - // permute - c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { - const size_t& g0 = idx[0]; - const size_t& g1 = idx[1]; - - const size_t g = g0 * G1 + g1; - - self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); - }); - - // default absolute error and relative error is 0.001 - double rtol = 1e-3; - double atol = 1e-3; - - // when BF16 is taken, set absolute error and relative error to 0.01 - if(std::is_same_v && std::is_same_v && - std::is_same_v && std::is_same_v) + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + if(tflops > best_perf) { - rtol = 1e-2; - atol = 1e-2; + best_perf = tflops; + best_time = ave_time * 1000; + best_kernel = gemm.GetTypeString(); } - - return ck::utils::check_err(c_gs_ms_os_device_result.mData, - c_gs_ms_os_host_result.mData, - "Error: Incorrect results!", - rtol, - atol) - ? 
0 - : 1; - } - - return 0; + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n, + b1_g_n_o, + c_g_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 
= idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol); + printf("Verification: %s, Pass: %s\n", + do_verification ? "ON" : "OFF", + this_run_verification ? "YES" : "NO"); + + if(!this_run_verification) + { + not_pass = 1; + printf("%d th MHA instance verification Failed \n", i.value); + } + } + }); + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time + << " us" << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + return not_pass; } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention.inc new file mode 100644 index 00000000000..c5ae4a6b01d --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention.inc @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape for A/B0/B1/C + // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o + ck::index_t M = 256; + ck::index_t N = 64; + ck::index_t K = 80; + ck::index_t O = 80; + + // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape + // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) + // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t G0 = 2; + ck::index_t G1 = 8; + + float alpha = 1; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + G0 = std::stoi(argv[8]); + G1 = std::stoi(argv[9]); + + alpha = std::stof(argv[10]); + + input_permute = std::stoi(argv[11]); + output_permute = std::stoi(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 11: M, N, K, O, G0, G1\n"); + printf("arg10: scale (alpha)\n"); + printf("arg11 to 12: input / output permute\n"); + exit(0); + } + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? 
std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] + : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ? std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] + : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + case 4: // A, B0, B1 1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + 
b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 5: // Rand: b1 b0; unit: a + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 6: // Rand: a b0 ; unit: B1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 7: // Rand: a b1 ; unit: b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 8: // Rand: a ; unit: b0 b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 9: // Rand: b0 ; unit: a b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 10: // Rand: b1 ; unit: a b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + std::vector kv_gs_ns_ks_lengths{G0, G1, N, 2, K}; + std::vector kv_gs_ns_ks_strides = std::vector{ + N * G1 * 2 * K, 2 * K, G1 * 2 * K, K, 1}; // kv layout [G0, M, G1, 2, K] + Tensor kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides); + // merge kv into a packed pointer send to device + b0_gs_ns_ks.ForEach( + [&](auto& self, auto idx) { 
kv_gs_ns_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); }); + b1_gs_os_ns.ForEach( + [&](auto& self, auto idx) { kv_gs_ns_ks(idx[0], idx[1], idx[3], 1, idx[2]) = self(idx); }); + DeviceMem q_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem kv_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize() + + sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + q_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + kv_device_buf.ToDevice(kv_gs_ns_ks.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + float best_perf = .0; + float best_time = .0; + int not_pass = 0; + std::string best_kernel = ""; + printf("Verification: %s\n", do_verification ? "ON" : "OFF"); + // TODO ANT: replace array with vector? 
+ ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { + const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + + using DeviceMHAInstance = ck::remove_cvref_t; + auto gemm = DeviceMHAInstance{}; + auto invoker = gemm.MakeCrossAttnInvoker(); + auto argument = + gemm.MakeCrossAttnArgument(static_cast(q_device_buf.GetDeviceBuffer()), + static_cast(kv_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + G0, + M, + N, + G1, + K, + alpha); + + // if(!gemm.IsSupportedArgument(argument)) + // { + // std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + // return 0; + // } + + ck::index_t BatchCount = G0 * G1; + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + if(tflops > best_perf) + { + best_perf = tflops; + best_time = ave_time * 1000; + best_kernel = gemm.GetTypeString(); + } + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto 
idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n, + b1_g_n_o, + c_g_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol); + printf("Verification: %s, Pass: %s\n", 
+ do_verification ? "ON" : "OFF", + this_run_verification ? "YES" : "NO"); + + if(!this_run_verification) + { + not_pass = 1; + printf("%d th MHA instance verification Failed \n", i.value); + } + } + }); + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time + << " us" << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + return not_pass; +} diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention.inc new file mode 100644 index 00000000000..b9c474bb0c3 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention.inc @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape for A/B0/B1/C + // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o + ck::index_t M = 256; + ck::index_t N = 256; + ck::index_t K = 80; + ck::index_t O = 80; + + // Output shape C[G0, M, G1, O]. 
Batch dim, outer dim, inner dim must match GEMM shape + // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) + // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t G0 = 2; + ck::index_t G1 = 8; + + float alpha = 1; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + G0 = std::stoi(argv[8]); + G1 = std::stoi(argv[9]); + + alpha = std::stof(argv[10]); + + input_permute = std::stoi(argv[11]); + output_permute = std::stoi(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 9: M, N, K, O, G0, G1\n"); + printf("arg10: scale (alpha)\n"); + printf("arg11 to 12: input / output permute\n"); + exit(0); + } + + // Q, K and V are packed below into one [G0, G1, M, 3, K] host tensor and the device + // argument is built from M and K only, so a differing N or O would index outside that + // tensor and could never match the host reference. Reject such shapes up front. + if(N != M || O != K) + { + std::cerr << "Packed-QKV self-attention requires N == M and O == K (got M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << ")" << std::endl; + return 1; + } + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] + : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ?
std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] + : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + case 4: // A, B0, B1 1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 5: // Rand: b1 b0; unit: a + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 
2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 6: // Rand: a b0 ; unit: B1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 7: // Rand: a b1 ; unit: b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 8: // Rand: a ; unit: b0 b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 9: // Rand: b0 ; unit: a b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 10: // Rand: b1 ; unit: a b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + std::vector qkv_gs_ms_ks_lengths{G0, G1, M, 3, K}; + std::vector qkv_gs_ms_ks_strides = std::vector{ + M * G1 * 3 * K, 3 * K, G1 * 3 * K, K, 1}; // qkv layout [G0, M, G1, 3, K] + Tensor qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides); + // merge qkv into a packed pointer send to device + a_gs_ms_ks.ForEach( + [&](auto& self, auto idx) { qkv_gs_ms_ks(idx[0], idx[1], idx[2], 0, idx[3]) = self(idx); }); + b0_gs_ns_ks.ForEach( + [&](auto& self, auto idx) { qkv_gs_ms_ks(idx[0], idx[1], idx[2], 1, idx[3]) = self(idx); }); + b1_gs_os_ns.ForEach( + [&](auto& self, auto idx) { qkv_gs_ms_ks(idx[0], idx[1], idx[3], 
2, idx[2]) = self(idx); }); + DeviceMem qkv_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize() + + sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize() + + sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + qkv_device_buf.ToDevice(qkv_gs_ms_ks.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + float best_perf = .0; + float best_time = .0; + int not_pass = 0; + std::string best_kernel = ""; + printf("Verification: %s\n", do_verification ? "ON" : "OFF"); + // TODO ANT: replace array with vector? + ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { + const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + + using DeviceMHAInstance = ck::remove_cvref_t; + auto gemm = DeviceMHAInstance{}; + auto invoker = gemm.MakeSelfAttnInvoker(); + auto argument = + gemm.MakeSelfAttnArgument(static_cast(qkv_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + G0, + M, + G1, + K, + alpha); + + // if(!gemm.IsSupportedArgument(argument)) + // { + // std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + // return 0; + // } + + ck::index_t BatchCount = G0 * G1; + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << 
gemm.GetTypeString() << std::endl; + if(tflops > best_perf) + { + best_perf = tflops; + best_time = ave_time * 1000; + best_kernel = gemm.GetTypeString(); + } + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n, + b1_g_n_o, + c_g_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + 
ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol); + printf("Verification: %s, Pass: %s\n", + do_verification ? "ON" : "OFF", + this_run_verification ? "YES" : "NO"); + + if(!this_run_verification) + { + not_pass = 1; + printf("%d th MHA instance verification Failed \n", i.value); + } + } + }); + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time + << " us" << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + return not_pass; +} diff --git a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp new file mode 100644 index 00000000000..91aebf152dd --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp @@ -0,0 
+1,288 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g_k_l) * B1_g_l_n + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using Acc0DataType = F32; +using Acc1DataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = 
ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +#define CK_MHA_USE_WAVE_1 +#define CK_MHA_USE_WAVE_2 +#define CK_MHA_USE_WAVE_4 +#define CK_MHA_USE_WAVE_8 +using DeviceMHAFactory = + std::tuple< +#ifdef CK_MHA_USE_WAVE_1 + // 1 wave, mrepeat = 1, nrepeat = 2, k/o repeat = 1~5 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, 
B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_2 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 
1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_4 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 
+ S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_8 + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec> +#endif + >; +// clang-format on +// Ref Gemm0: fp16 in, fp32 out +using 
ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_self_attention.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 1965298a996..a39b71d0b1d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -5,7 +5,11 @@ #include #include +#include +#include +#include +#include "ck/ck.hpp" #include "ck/utility/common_header.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" @@ -22,6 +26,417 @@ namespace ck { namespace tensor_operation { namespace device { +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_softmax_gemm_wmma_cshuffle(const ADataType* __restrict__ p_a_grid, + const B0DataType* __restrict__ p_b0_grid, + const B1DataType* __restrict__ p_b1_grid, + CDataType* __restrict__ p_c_grid, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) + + // clang-format off +// *************************************************** +// Make Tensor Descriptors + constexpr index_t array_size = 4; + std::array a_gs_ms_ks_lengths{G0, G1, 
M, K}; + std::array a_gs_ms_ks_strides = + input_permute + ? std::array{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::array{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::array b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::array b0_gs_ns_ks_strides = + input_permute + ? std::array{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] + : std::array{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::array b1_gs_os_ns_lengths{G0, G1, O, N}; + std::array b1_gs_os_ns_strides = + input_permute + ? std::array{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] + : std::array{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::array c_gs_ms_os_lengths{G0, G1, M, O}; + std::array c_gs_ms_os_strides = + output_permute + ? std::array{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::array{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + const auto a_element_op = AElementwiseOperation{}; + const auto b0_element_op = B0ElementwiseOperation{}; + const auto acc0_element_op = AccElementwiseOperation{alpha}; + const auto b1_element_op = B1ElementwiseOperation{}; + const auto c_element_op = CElementwiseOperation{}; + // fail to reuse DeviceOp::MakeArgument() because of the __device__ function required. 
+ + const auto a_grid_desc = DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n); + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto a_grid_desc_g_m_k = + DeviceOp::Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc_g_l_k = + DeviceOp::Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc_g_n_l = + DeviceOp::Transform::MakeB1GridDescriptor_G_N_K(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto compute_base_ptr_of_batch = + typename DeviceOp::ComputeBasePtrOfStridedBatch{a_grid_desc_g_m_k, b0_grid_desc_g_l_k, b1_grid_desc_g_n_l, c_grid_desc_g_m_n}; + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + const auto c0_matrix_mask = typename DeviceOp::C0MatrixMask{b0_grid_desc_g_l_k.GetLength(Number<1>{})}; + + // clang-format on + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b0_batch_offset = 
__builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseOp::template Run(p_a_grid + a_batch_offset, + p_b0_grid + b0_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc, + b0_grid_desc, + b1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op, + c0_matrix_mask, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b0_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = M; + ignore = N; + ignore = K; + ignore = O; + ignore = G0; + ignore = G1; + ignore = input_permute; + ignore = output_permute; +#endif // end of if (defined(__gfx1100__)) +} + +// Self-Attention +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_wmma_self_attention_forward(const QKVDataType* __restrict__ p_qkv_grid, + ODataType* __restrict__ p_out_grid, + index_t batch_size, + index_t sequence_length, + index_t head_count, + index_t head_size, + float alpha) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) + + // clang-format off +// *************************************************** +// Make Tensor Descriptors +// o Self-attention(packed QKV): [batchSize, sequenceLength, headCount, 3, headSize] + constexpr index_t array_size = 4; + std::array qk_gs_ms_ks_lengths{batch_size, head_count, sequence_length, head_size}; + std::array qk_gs_ms_ks_strides{sequence_length * head_count * 3 * head_size, 3 * head_size, head_count * 3 * head_size, 1}; + + std::array v_gs_os_ns_lengths{batch_size, 
head_count, head_size, sequence_length}; + std::array v_gs_os_ns_strides{sequence_length * head_count * 3 * head_size, 3 * head_size, 1, head_count * 3 * head_size}; + + std::array c_gs_ms_os_lengths{batch_size, head_count, sequence_length, head_size}; + std::array c_gs_ms_os_strides{sequence_length * head_count * head_size, head_size, head_count * head_size, 1}; + + + const auto a_element_op = AElementwiseOperation{}; + const auto b0_element_op = B0ElementwiseOperation{}; + const auto acc0_element_op = AccElementwiseOperation{alpha}; + const auto b1_element_op = B1ElementwiseOperation{}; + const auto c_element_op = CElementwiseOperation{}; + + const auto a_grid_desc = DeviceOp::MakeAGridDescriptor(qk_gs_ms_ks_lengths, qk_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(qk_gs_ms_ks_lengths, qk_gs_ms_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(v_gs_os_ns_lengths, v_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n); + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto a_grid_desc_g_m_k = + DeviceOp::Transform::MakeAGridDescriptor_G_M_K(qk_gs_ms_ks_lengths, qk_gs_ms_ks_strides); + const auto b0_grid_desc_g_l_k = + DeviceOp::Transform::MakeB0GridDescriptor_G_N_K(qk_gs_ms_ks_lengths, qk_gs_ms_ks_strides); + const auto b1_grid_desc_g_n_l = + DeviceOp::Transform::MakeB1GridDescriptor_G_N_K(v_gs_os_ns_lengths, v_gs_os_ns_strides); + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto compute_base_ptr_of_batch = + typename DeviceOp::ComputeBasePtrOfStridedBatch{a_grid_desc_g_m_k, b0_grid_desc_g_l_k, b1_grid_desc_g_n_l, c_grid_desc_g_m_n}; + index_t 
batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + const auto c0_matrix_mask = typename DeviceOp::C0MatrixMask{b0_grid_desc_g_l_k.GetLength(Number<1>{})}; + + // clang-format on + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + const index_t qkv_gap = __builtin_amdgcn_readfirstlane(head_size); +#ifdef CK_SELF_ATTN_DEBUG + if(get_thread_global_1d_id() == 0) + { + printf("batch_size: %d\n", batch_size); + printf("sequence_length: %d\n", sequence_length); + printf("head_count: %d\n", head_count); + printf("head_size: %d\n", head_size); + printf("qkv_gap: %d\n", qkv_gap); + printf("get_grid_size(): %d\n", get_grid_size()); + printf("batch_count: %d\n", batch_count); + printf("blockid: %d\n", get_block_1d_id()); + printf("num_blocks_per_batch: %d\n", num_blocks_per_batch); + printf("g_idx: %d\n", g_idx); + printf("a_batch_offset: %ld\n", a_batch_offset); + printf("b0_batch_offset: %ld\n", b0_batch_offset); + printf("b1_batch_offset: %ld\n", b1_batch_offset); + } +#endif + GridwiseOp::template Run(p_qkv_grid + 0 * qkv_gap + a_batch_offset, + p_qkv_grid + 1 * qkv_gap + b0_batch_offset, + p_qkv_grid + 2 * qkv_gap + b1_batch_offset, + p_out_grid + c_batch_offset, + p_shared, + a_grid_desc, + b0_grid_desc, + b1_grid_desc, 
+ c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op, + c0_matrix_mask, + block_2_ctile_map); +#else + ignore = p_qkv_grid; + ignore = p_out_grid; + ignore = batch_size; + ignore = sequence_length; + ignore = head_count; + ignore = head_size; + ignore = alpha; +#endif // end of if (defined(__gfx1100__)) +} +// Cross-Attention +// Self-Attention +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_wmma_cross_attention_forward(const QDataType* __restrict__ p_q_grid, + const KVDataType* __restrict__ p_kv_grid, + ODataType* __restrict__ p_out_grid, + index_t batch_size, + index_t q_sequence_length, + index_t kv_sequence_length, + index_t head_count, + index_t head_size, + float alpha) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) + + // clang-format off +// *************************************************** +// Make Tensor Descriptors +// o Self-attention(packed QKV): [batchSize, sequenceLength, headCount, 3, headSize] + constexpr index_t array_size = 4; + std::array q_gs_ms_ks_lengths{batch_size, head_count, q_sequence_length, head_size}; + std::array q_gs_ms_ks_strides{q_sequence_length * head_count * head_size, head_size, head_count * head_size, 1}; + + std::array k_gs_ms_ks_lengths{batch_size, head_count, kv_sequence_length, head_size}; + std::array k_gs_ms_ks_strides{kv_sequence_length * head_count * 2 * head_size, 2 * head_size, head_count * 2 * head_size, 1}; + + std::array v_gs_os_ns_lengths{batch_size, head_count, head_size, kv_sequence_length}; + std::array v_gs_os_ns_strides{kv_sequence_length * head_count * 2 * head_size, 2 * head_size, 1, head_count * 2 * head_size}; + + std::array c_gs_ms_os_lengths{batch_size, head_count, q_sequence_length, head_size}; + std::array c_gs_ms_os_strides{q_sequence_length * head_count * 
head_size, head_size, head_count * head_size, 1}; + + + const auto a_element_op = AElementwiseOperation{}; + const auto b0_element_op = B0ElementwiseOperation{}; + const auto acc0_element_op = AccElementwiseOperation{alpha}; + const auto b1_element_op = B1ElementwiseOperation{}; + const auto c_element_op = CElementwiseOperation{}; + + const auto a_grid_desc = DeviceOp::MakeAGridDescriptor(q_gs_ms_ks_lengths, q_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(k_gs_ms_ks_lengths, k_gs_ms_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(v_gs_os_ns_lengths, v_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n); + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto a_grid_desc_g_m_k = + DeviceOp::Transform::MakeAGridDescriptor_G_M_K(q_gs_ms_ks_lengths, q_gs_ms_ks_strides); + const auto b0_grid_desc_g_l_k = + DeviceOp::Transform::MakeB0GridDescriptor_G_N_K(k_gs_ms_ks_lengths, k_gs_ms_ks_strides); + const auto b1_grid_desc_g_n_l = + DeviceOp::Transform::MakeB1GridDescriptor_G_N_K(v_gs_os_ns_lengths, v_gs_os_ns_strides); + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto compute_base_ptr_of_batch = + typename DeviceOp::ComputeBasePtrOfStridedBatch{a_grid_desc_g_m_k, b0_grid_desc_g_l_k, b1_grid_desc_g_n_l, c_grid_desc_g_m_n}; + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + const auto c0_matrix_mask = typename DeviceOp::C0MatrixMask{b0_grid_desc_g_l_k.GetLength(Number<1>{})}; + + // clang-format on + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + 
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + const index_t kv_gap = __builtin_amdgcn_readfirstlane(head_size); +#ifdef CK_SELF_ATTN_DEBUG + if(get_thread_global_1d_id() == 0) + { + printf("batch_size: %d\n", batch_size); + printf("q_sequence_length: %d\n", q_sequence_length); + printf("k_sequence_length: %d\n", kv_sequence_length); + printf("head_count: %d\n", head_count); + printf("head_size: %d\n", head_size); + printf("kv_gap: %d\n", kv_gap); + printf("get_grid_size(): %d\n", get_grid_size()); + printf("batch_count: %d\n", batch_count); + printf("blockid: %d\n", get_block_1d_id()); + printf("num_blocks_per_batch: %d\n", num_blocks_per_batch); + printf("g_idx: %d\n", g_idx); + printf("a_batch_offset: %ld\n", a_batch_offset); + printf("b0_batch_offset: %ld\n", b0_batch_offset); + printf("b1_batch_offset: %ld\n", b1_batch_offset); + } +#endif + GridwiseOp::template Run(p_q_grid + a_batch_offset, + p_kv_grid + 0 * kv_gap + b0_batch_offset, + p_kv_grid + 1 * kv_gap + b1_batch_offset, + p_out_grid + c_batch_offset, + p_shared, + a_grid_desc, + b0_grid_desc, + b1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op, + c0_matrix_mask, + block_2_ctile_map); +#else + ignore = p_q_grid; + ignore = p_kv_grid; + ignore = p_out_grid; + ignore = 
batch_size; + ignore = q_sequence_length; + ignore = kv_sequence_length; + ignore = head_count; + ignore = head_size; + ignore = alpha; +#endif // end of if (defined(__gfx1100__)) +} // Computes C = A * B0 * B1 // MN = MK * KL * LN // ^^^^^^ (Acc0) @@ -55,7 +470,8 @@ template ; - static auto MakeAGridDescriptor(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + __host__ __device__ static auto MakeAGridDescriptor( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) { if constexpr(AEnableLds) { return Transform::MakeAGridDescriptor_AK0_M_AK1( Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), - Number{}); + Number{}); } else { @@ -184,19 +601,20 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle Number{}, Number{}, Number{}, - Number{}); + Number{}); } } - static auto MakeB0GridDescriptor(const std::vector& b0_gs_ls_ks_lengths_vec, - const std::vector& b0_gs_ls_ks_strides_vec) + __host__ __device__ static auto MakeB0GridDescriptor( + const std::array& b0_gs_ls_ks_lengths_vec, + const std::array& b0_gs_ls_ks_strides_vec) { if constexpr(B0EnableLds) { return Transform::MakeB0GridDescriptor_BK0_N_BK1( Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, b0_gs_ls_ks_strides_vec), - Number{}); + Number{}); } else { @@ -208,12 +626,13 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle Number{}, Number{}, Number{}, - Number{}); + Number{}); } } - static auto MakeB1GridDescriptor(const std::vector& b1_gs_ns_ls_lengths_vec, - const std::vector& b1_gs_ns_ls_strides_vec) + __host__ __device__ static auto MakeB1GridDescriptor( + const std::array& b1_gs_ns_ls_lengths_vec, + const std::array& b1_gs_ns_ls_strides_vec) { if constexpr(B1EnableLds) { @@ -245,7 +664,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); using CGridDesc_G_M_N = 
decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); - constexpr static auto make_MaskOutPredicate() + __host__ __device__ constexpr static auto make_MaskOutPredicate() { if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) { @@ -260,10 +679,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle struct ComputeBasePtrOfStridedBatch { - ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, - const B0GridDesc_G_L_K& b0_grid_desc_g_l_k, - const B1GridDesc_G_N_L& b1_grid_desc_g_n_l, - const CGridDesc_G_M_N& c_grid_desc_g_m_n) + __host__ __device__ ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, + const B0GridDesc_G_L_K& b0_grid_desc_g_l_k, + const B1GridDesc_G_N_L& b1_grid_desc_g_n_l, + const CGridDesc_G_M_N& c_grid_desc_g_m_n) : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), b0_grid_desc_g_l_k_(b0_grid_desc_g_l_k), b1_grid_desc_g_n_l_(b1_grid_desc_g_n_l), @@ -324,7 +743,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle MPerBlock, LPerBlock, KPerBlock, - K1, + AK1, + BK1, NPerBlock, LTilePerBlock, L1, @@ -373,6 +793,323 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle LoopSched, PipelineVer>; + struct RawArg : public BaseArgument + { + RawArg(const ADataType* p_a_grid, + const B0DataType* p_b0_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) + : p_a_grid_{p_a_grid}, + p_b0_grid_{p_b0_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + M_{M}, + N_{N}, + K_{K}, + O_{O}, + G0_{G0}, + G1_{G1}, + alpha_{alpha}, + input_permute_{input_permute}, + output_permute_{output_permute} + { + } + // Pointers + const ADataType* p_a_grid_; + const B0DataType* p_b0_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // Raw Problem Size + index_t M_; + index_t N_; + index_t K_; + index_t O_; + index_t G0_; + index_t G1_; + float alpha_; + bool 
input_permute_; + bool output_permute_; + }; + + static auto MakeArgument(const ADataType* p_a, + const B0DataType* p_b0, + const B1DataType* p_b1, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) + { + return RawArg{ + p_a, p_b0, p_b1, p_c, M, N, K, O, G0, G1, alpha, input_permute, output_permute}; + } + + static bool IsSupportedArgument(const RawArg& arg) + { + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == "gfx1102") + { + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc0 Type err"); + return false; + } + + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc1 Type err"); + return false; + } + } + else + { + printf("DeviceOp: Arch err"); + return false; + } + + constexpr index_t array_size = 4; + ck::index_t G0 = arg.G0_; + ck::index_t G1 = arg.G1_; + ck::index_t M = arg.M_; + ck::index_t N = arg.N_; + ck::index_t K = arg.K_; + ck::index_t O = arg.O_; + bool input_permute = arg.input_permute_; + bool output_permute = arg.output_permute_; + + std::array a_gs_ms_ks_lengths{G0, G1, M, K}; + std::array a_gs_ms_ks_strides = + input_permute ? std::array{M * G1 * K, K, G1 * K, 1} + // A layout [G0, M, G1, K] + : std::array{ + G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::array b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::array b0_gs_ns_ks_strides = + input_permute ? std::array{N * G1 * K, K, G1 * K, 1} + // B0 layout [G0, N, G1, K] + : std::array{ + G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::array b1_gs_os_ns_lengths{G0, G1, O, N}; + std::array b1_gs_os_ns_strides = + input_permute ? std::array{N * G1 * O, O, 1, G1 * O} + // B1 layout [G0, N, G1, O] + : std::array{ + G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::array c_gs_ms_os_lengths{G0, G1, M, O}; + std::array c_gs_ms_os_strides = + output_permute ? 
std::array{M * G1 * O, O, G1 * O, 1} + // C layout [G0, M, G1, O] + : std::array{ + G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + const auto a_grid_desc = + DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + + if(!GridwiseOp::CheckValidity( + a_grid_desc, b0_grid_desc, b1_grid_desc, c_grid_desc_m_n, block_2_ctile_map)) + { + return false; + } + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = c_grid_desc_g_m_n.GetLength(I0); // unpadded + + if(!(c_g == batch_count)) + { + printf("DeviceOp: BatchCount err"); + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = M; + const auto LzRaw = N; + const auto KzRaw = K; + const auto NzRaw = O; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? 
LzRaw : NzRaw; + const auto c_extent_lowest = NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + printf("DeviceOp: Data Transfer Vector scalar err"); + return false; + } + + std::array a_mz_kz_strides_{ + a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}; + std::array b0_lz_kz_strides_{ + b0_gs_ns_ks_strides[NumDimG + NumDimL - 1], + b0_gs_ns_ks_strides[NumDimG + NumDimL + NumDimK - 1]}; + std::array b1_nz_lz_strides_{ + b1_gs_os_ns_strides[NumDimG + NumDimN - 1], + b1_gs_os_ns_strides[NumDimG + NumDimN + NumDimL - 1]}; + std::array c_mz_nz_strides_{ + c_gs_ms_os_strides[NumDimG + NumDimM - 1], + c_gs_ms_os_strides[NumDimG + NumDimM + NumDimN - 1]}; + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? a_mz_kz_strides_[1] : a_mz_kz_strides_[0]; + const auto b0_stride_lowest = + B0BlockTransferSrcVectorDim == 2 ? b0_lz_kz_strides_[1] : b0_lz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? 
b1_nz_lz_strides_[1] : b1_nz_lz_strides_[0]; + const auto c_stride_lowest = c_mz_nz_strides_[1]; + + if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + printf("DeviceOp: Data Vectorize transfer err"); + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + struct SelfAttnArg : public BaseArgument + { + SelfAttnArg(const ADataType* p_qkv_grid, + CDataType* p_out_grid, + index_t batch_size, + index_t sequence_length, + index_t head_count, + index_t head_size, + float alpha) + : p_qkv_grid_{p_qkv_grid}, + p_out_grid_{p_out_grid}, + batch_size_{batch_size}, + sequence_length_{sequence_length}, + head_count_{head_count}, + head_size_{head_size}, + alpha_{alpha} + { + } + // Pointers + const ADataType* p_qkv_grid_; + CDataType* p_out_grid_; + + // Raw Problem Size + index_t batch_size_; + index_t sequence_length_; + index_t head_count_; + index_t head_size_; + float alpha_; + }; + + static auto MakeSelfAttnArgument(const ADataType* p_qkv_grid, + CDataType* p_out_grid, + index_t batch_size, + index_t sequence_length, + index_t head_count, + index_t head_size, + float alpha) + { + return SelfAttnArg{ + p_qkv_grid, p_out_grid, batch_size, sequence_length, head_count, head_size, alpha}; + } + + struct CrossAttnArg : public BaseArgument + { + CrossAttnArg(const ADataType* p_q_grid, + const B0DataType* p_kv_grid, + CDataType* p_out_grid, + index_t batch_size, + index_t q_sequence_length, + index_t kv_sequence_length, + index_t head_count, + index_t head_size, + float alpha) + : p_q_grid_{p_q_grid}, + p_kv_grid_{p_kv_grid}, + p_out_grid_{p_out_grid}, + batch_size_{batch_size}, + q_sequence_length_{q_sequence_length}, + kv_sequence_length_{kv_sequence_length}, + head_count_{head_count}, + head_size_{head_size}, + alpha_{alpha} + { + } + // Pointers + const ADataType* p_q_grid_; + const 
B0DataType* p_kv_grid_; + CDataType* p_out_grid_; + + // Raw Problem Size + index_t batch_size_; + index_t q_sequence_length_; + index_t kv_sequence_length_; + index_t head_count_; + index_t head_size_; + float alpha_; + }; + + static auto MakeCrossAttnArgument(const ADataType* p_q_grid, + const B0DataType* p_kv_grid, + CDataType* p_out_grid, + index_t batch_size, + index_t q_sequence_length, + index_t kv_sequence_length, + index_t head_count, + index_t head_size, + float alpha) + { + return CrossAttnArg{p_q_grid, + p_kv_grid, + p_out_grid, + batch_size, + q_sequence_length, + kv_sequence_length, + head_count, + head_size, + alpha}; + } + // Argument struct Argument : public BaseArgument { @@ -383,14 +1120,14 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle CDataType* p_c_grid, const std::array p_acc0_biases, const std::array p_acc1_biases, - const std::vector& a_gs_ms_ks_lengths, - const std::vector& a_gs_ms_ks_strides, - const std::vector& b0_gs_ls_ks_lengths, - const std::vector& b0_gs_ls_ks_strides, - const std::vector& b1_gs_ns_ls_lengths, - const std::vector& b1_gs_ns_ls_strides, - const std::vector& c_gs_ms_ns_lengths, - const std::vector& c_gs_ms_ns_strides, + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b0_gs_ls_ks_lengths, + const std::array& b0_gs_ls_ks_strides, + const std::array& b1_gs_ns_ls_lengths, + const std::array& b1_gs_ns_ls_strides, + const std::array& c_gs_ms_ns_lengths, + const std::array& c_gs_ms_ns_strides, const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, @@ -497,11 +1234,11 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle // Strides for the last M/N/K dimensions of A/B0/B1/C // for sanity check of vector load/store - std::vector raw_lengths_mz_lz_kz_nz_; - std::vector a_mz_kz_strides_; - std::vector b0_lz_kz_strides_; - std::vector 
b1_nz_lz_strides_; - std::vector c_mz_nz_strides_; + std::array raw_lengths_mz_lz_kz_nz_; + std::array a_mz_kz_strides_; + std::array b0_lz_kz_strides_; + std::array b1_nz_lz_strides_; + std::array c_mz_nz_strides_; index_t batch_count_; // Batch Offset @@ -509,47 +1246,152 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle }; // Invoker + struct SelfAttnInvoker : public BaseInvoker + { + using Argument = DeviceOp::SelfAttnArg; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto M0 = math::integer_divide_ceil(arg.sequence_length_, MPerBlock); + const auto N0 = math::integer_divide_ceil(arg.head_size_, NPerBlock); + + const index_t grid_size = arg.batch_size_ * arg.head_count_ * M0 * N0; + const auto K = arg.head_size_; + + auto launch_kernel = [&](auto has_main_k_block_loop) { + const auto kernel = kernel_wmma_self_attention_forward; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_qkv_grid_, + arg.p_out_grid_, + arg.batch_size_, + arg.sequence_length_, + arg.head_count_, + arg.head_size_, + arg.alpha_); + }; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static auto MakeSelfAttnInvoker() { return SelfAttnInvoker{}; } + + // Invoker + struct CrossAttnInvoker : public BaseInvoker + { + using Argument = DeviceOp::CrossAttnArg; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto M0 = math::integer_divide_ceil(arg.q_sequence_length_, MPerBlock); + const auto N0 = math::integer_divide_ceil(arg.head_size_, NPerBlock); + + const index_t grid_size = arg.batch_size_ * arg.head_count_ * M0 * N0; + 
const auto K = arg.head_size_; + + auto launch_kernel = [&](auto has_main_k_block_loop) { + const auto kernel = kernel_wmma_cross_attention_forward; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_q_grid_, + arg.p_kv_grid_, + arg.p_out_grid_, + arg.batch_size_, + arg.q_sequence_length_, + arg.kv_sequence_length_, + arg.head_count_, + arg.head_size_, + arg.alpha_); + }; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static auto MakeCrossAttnInvoker() { return CrossAttnInvoker{}; } + struct Invoker : public BaseInvoker { - using Argument = DeviceOp::Argument; + using Argument = DeviceOp::RawArg; float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) { - const index_t grid_size = - arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; - - const auto K = [&]() { - if constexpr(AEnableLds) - { - return arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I2); - } - else - { - return arg.a_grid_desc.GetLength(I0) * arg.a_grid_desc.GetLength(I3) * - arg.a_grid_desc.GetLength(I4) * arg.a_grid_desc.GetLength(I6); - } - }(); + const auto M0 = math::integer_divide_ceil(arg.M_, MPerBlock); + const auto N0 = math::integer_divide_ceil(arg.O_, NPerBlock); + const index_t grid_size = arg.G0_ * arg.G1_ * M0 * N0; + const auto K = arg.K_; + // printf("HasKBlockLoop: %d\n", GridwiseOp::CalculateHasMainKBlockLoop(K)); auto launch_kernel = [&](auto has_main_k_block_loop) { - const auto kernel = kernel_batched_gemm_softmax_gemm_wmma_cshuffle< - GridwiseOp, - ADataType, - B0DataType, - B1DataType, - CDataType, - DeviceOp::AGridDesc, - DeviceOp::B0GridDesc, - 
DeviceOp::B1GridDesc, - typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, - AElementwiseOperation, - B0ElementwiseOperation, - AccElementwiseOperation, - B1ElementwiseOperation, - CElementwiseOperation, - ComputeBasePtrOfStridedBatch, - C0MatrixMask, - typename GridwiseOp::DefaultBlock2CTileMap, - has_main_k_block_loop>; + const auto kernel = + kernel_batched_gemm_softmax_gemm_wmma_cshuffle; return launch_and_time_kernel(stream_config, kernel, @@ -560,19 +1402,15 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle arg.p_b0_grid_, arg.p_b1_grid_, arg.p_c_grid_, - arg.a_grid_desc, - arg.b0_grid_desc, - arg.b1_grid_desc, - arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, - arg.a_element_op_, - arg.b0_element_op_, - arg.acc_element_op_, - arg.b1_element_op_, - arg.c_element_op_, - arg.batch_count_, - arg.compute_ptr_offset_of_batch_, - arg.c0_matrix_mask_, - arg.block_2_ctile_map_); + arg.M_, + arg.N_, + arg.K_, + arg.O_, + arg.G0_, + arg.G1_, + arg.alpha_, + arg.input_permute_, + arg.output_permute_); }; if(GridwiseOp::CalculateHasMainKBlockLoop(K)) @@ -598,7 +1436,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle // TODO: properly implement this check return true; } - +#if 0 static bool IsSupportedArgument(const Argument& arg) { if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || @@ -695,14 +1533,14 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle CDataType* p_c, const std::array p_acc0_biases, const std::array p_acc1_biases, - const std::vector& a_gs_ms_ks_lengths, - const std::vector& a_gs_ms_ks_strides, - const std::vector& b0_gs_ls_ks_lengths, - const std::vector& b0_gs_ls_ks_strides, - const std::vector& b1_gs_ns_ls_lengths, - const std::vector& b1_gs_ns_ls_strides, - const std::vector& c_gs_ms_ns_lengths, - const std::vector& c_gs_ms_ns_strides, + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b0_gs_ls_ks_lengths, + const 
std::array& b0_gs_ls_ks_strides, + const std::array& b1_gs_ns_ls_lengths, + const std::array& b1_gs_ns_ls_strides, + const std::array& c_gs_ms_ns_lengths, + const std::array& c_gs_ms_ns_strides, const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, @@ -739,6 +1577,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle b1_element_op, c_element_op}; } +#endif // polymorphic std::unique_ptr MakeArgumentPointer( @@ -766,20 +1605,60 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle B1ElementwiseOperation b1_element_op, CElementwiseOperation c_element_op) override { + std::array a_lengths; + std::array a_strides; + std::array b0_lengths; + std::array b0_strides; + std::array b1_lengths; + std::array b1_strides; + std::array c_lengths; + std::array c_strides; + std::transform(a_gs_ms_ks_lengths.begin(), + a_gs_ms_ks_lengths.end(), + a_lengths.begin(), + [](index_t i) { return i; }); + std::transform(a_gs_ms_ks_strides.begin(), + a_gs_ms_ks_strides.end(), + a_strides.begin(), + [](index_t i) { return i; }); + std::transform(b0_gs_ls_ks_lengths.begin(), + b0_gs_ls_ks_lengths.end(), + b0_lengths.begin(), + [](index_t i) { return i; }); + std::transform(b0_gs_ls_ks_strides.begin(), + b0_gs_ls_ks_strides.end(), + b0_strides.begin(), + [](index_t i) { return i; }); + std::transform(b1_gs_ns_ls_lengths.begin(), + b1_gs_ns_ls_lengths.end(), + b1_lengths.begin(), + [](index_t i) { return i; }); + std::transform(b1_gs_ns_ls_strides.begin(), + b1_gs_ns_ls_strides.end(), + b1_strides.begin(), + [](index_t i) { return i; }); + std::transform(c_gs_ms_ns_lengths.begin(), + c_gs_ms_ns_lengths.end(), + c_lengths.begin(), + [](index_t i) { return i; }); + std::transform(c_gs_ms_ns_strides.begin(), + c_gs_ms_ns_strides.end(), + c_strides.begin(), + [](index_t i) { return i; }); return std::make_unique(static_cast(p_a), static_cast(p_b0), 
static_cast(p_b1), static_cast(p_c), p_acc0_biases, p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b0_gs_ls_ks_lengths, - b0_gs_ls_ks_strides, - b1_gs_ns_ls_lengths, - b1_gs_ns_ls_strides, - c_gs_ms_ns_lengths, - c_gs_ms_ns_strides, + a_lengths, + a_strides, + b0_lengths, + b0_strides, + b1_lengths, + b1_strides, + c_lengths, + c_strides, acc0_biases_gs_ms_ls_lengths, acc0_biases_gs_ms_ls_strides, acc1_biases_gs_ms_ns_lengths, @@ -819,11 +1698,12 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle << MPerBlock << ", " << LPerBlock << ", " << KPerBlock << ", " - << K1 << ", " + << AK1 << ", " + << BK1 << ", " << MPerBlock << ", " << NPerBlock << ", " << LTilePerBlock << ", " - << L1 + << L1 << ", " << getGemmSpecializationString(GemmSpec) << ", " << "ASpec" << getTensorSpecializationString(ASpec) << ", " << "B0Spec" << getTensorSpecializationString(B0Spec) << ", " diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 8a026744b03..58d62bbccbe 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -650,7 +650,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle // check if it's 1x1, stride=1 conv for(index_t i = 0; i < NDimSpatial; ++i) { - const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t X = arg.b_g_k_c_xs_lengths_[i + 3]; const index_t ConvStride = arg.conv_filter_strides_[i]; const index_t LeftPad = arg.input_left_pads_[i]; const index_t RightPad = arg.input_right_pads_[i]; @@ -667,7 +667,7 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle // check if it's 1x1 conv for(index_t i = 0; i < NDimSpatial; ++i) { - const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t X = arg.b_g_k_c_xs_lengths_[i 
+ 3]; const index_t LeftPad = arg.input_left_pads_[i]; const index_t RightPad = arg.input_right_pads_[i]; diff --git a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp index ea0f5897a75..2c6846e1c0a 100644 --- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp +++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp @@ -53,7 +53,10 @@ struct MaskOutUpperTrianglePredicate template struct C0MatrixMask_impl { - C0MatrixMask_impl(index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{}) {} + __host__ __device__ C0MatrixMask_impl(index_t NRaw) + : NRaw_(NRaw), predicate_(MaskOutPredicate{}) + { + } __host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index c0f8ca45105..bc1aa936456 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -18,102 +18,6 @@ namespace ck { -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_batched_gemm_softmax_gemm_wmma_cshuffle( - const ADataType* __restrict__ p_a_grid, - const B0DataType* __restrict__ p_b0_grid, - const B1DataType* __restrict__ p_b1_grid, - CDataType* __restrict__ p_c_grid, - const AGridDesc a_grid_desc, - const B0GridDesc b0_grid_desc, - const B1GridDesc b1_grid_desc, - const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock - c_grid_desc_mblock_mperblock_nblock_nperblock, - const AElementwiseOperation a_element_op, - const B0ElementwiseOperation b0_element_op, - const AccElementwiseOperation acc_element_op, - const B1ElementwiseOperation b1_element_op, - 
const CElementwiseOperation c_element_op, - const index_t batch_count, - const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch, - const C0MatrixMask c0_matrix_mask, - const Block2CTileMap block_2_ctile_map) -{ -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) - __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; - - const index_t num_blocks_per_batch = - __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); - const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); - - const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); - const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx))); - const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); - const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( - static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); - - GridwiseOp::template Run(p_a_grid + a_batch_offset, - p_b0_grid + b0_batch_offset, - p_b1_grid + b1_batch_offset, - p_c_grid + c_batch_offset, - p_shared, - a_grid_desc, - b0_grid_desc, - b1_grid_desc, - c_grid_desc_mblock_mperblock_nblock_nperblock, - a_element_op, - b0_element_op, - acc_element_op, - b1_element_op, - c_element_op, - c0_matrix_mask, - block_2_ctile_map); -#else - ignore = p_a_grid; - ignore = p_b0_grid; - ignore = p_b1_grid; - ignore = p_c_grid; - ignore = a_grid_desc; - ignore = b0_grid_desc; - ignore = b1_grid_desc; - ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; - ignore = a_element_op; - ignore = b0_element_op; - ignore = acc_element_op; - ignore = b1_element_op; - ignore = c_element_op; - ignore = batch_count; - ignore = compute_base_ptr_of_batch; - ignore = c0_matrix_mask; - ignore = block_2_ctile_map; 
-#endif // end of if (defined(__gfx1100__)) -} - // Gemm0: A [M x K] x B0 [K x L] = Acc [M x L] // Gemm1: Acc [M x L] x B1 [L x N] = C [M x N] template {}; static constexpr auto I7 = Number<7>{}; - static constexpr auto AK1 = Number{}; - static constexpr auto BK0 = Number{}; - static constexpr auto BK1 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto BK1 = Number{}; static constexpr auto L0PerBlock = LTilePerBlock / L1Value; static constexpr auto AL0 = Number{}; @@ -714,7 +619,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) { - const index_t num_loop = K / KPerBlock; + const index_t num_loop = math::integer_divide_ceil(K, KPerBlock); return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); } @@ -887,7 +792,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> MRepeat -> MWaves -> WmmaK/K1 -> MPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - constexpr auto K0PerWmma = WmmaK/2/K1Value; + constexpr auto K0PerWmma = WmmaK/2/AK1Value; auto a_block_buf = make_static_buffer( a_block_desc.GetElementSpaceSize()); @@ -903,7 +808,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma Number{}, I1, I1, - Number{}>, + Number{}>, Sequence<0, 1, 2, 3, 4, 5, 6>, 6, ABlockTransferSrcScalarPerVector, @@ -966,7 +871,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma // Thread-wise copy // KPerBlock/WmmaK -> LRepeat -> LWaves -> KRow -> LPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - constexpr auto K0PerWmma = WmmaK/2/K1Value; + constexpr auto K0PerWmma = WmmaK/2/BK1Value; auto b0_block_buf = make_static_buffer( b0_block_desc.GetElementSpaceSize()); @@ -982,7 +887,7 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma Number{}, I1, I1, - Number{}>, + Number{}>, Sequence<0, 1, 2, 3, 4, 5, 6>, 6, B0BlockTransferSrcScalarPerVector, @@ -1009,7 +914,7 @@ struct 
GridwiseBatchedGemmSoftmaxGemm_Wmma /*******************************************************************************/ // Gemm0 - constexpr auto KPack = math::integer_least_multiple(K1Value, WmmaK); + constexpr auto KPack = math::integer_least_multiple(math::integer_least_multiple(AK1Value,BK1Value), WmmaK); auto blockwise_gemm0 = BlockwiseGemmWMMA< BlockSize, diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp index cf9dc8f909c..98e4f5b6f2b 100644 --- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -16,14 +16,15 @@ template -static auto MakeGridDescriptorPair(const std::vector& gs_ms_ns_lengths_vec, - const std::vector& gs_ms_ns_strides_vec) +__host__ __device__ static auto +MakeGridDescriptorPair(const std::array& gs_ms_ns_lengths_vec, + const std::array& gs_ms_ns_strides_vec) { - if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && - gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) - { - throw std::runtime_error("wrong! dimension must match input lengths"); - } + // if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + // gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) + // { + // throw std::runtime_error("wrong! 
dimension must match input lengths"); + // } const auto to_tuple = [&](auto& vec, auto start, auto end) { return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); @@ -143,21 +144,24 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm // // A // - static auto MakeAGridDescriptorPair(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + __host__ __device__ static auto MakeAGridDescriptorPair( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) { return MakeGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec); } // TODO: rename to G_MRaw_KRaw - static auto MakeAGridDescriptor_G_M_K(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + __host__ __device__ static auto MakeAGridDescriptor_G_M_K( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) { return MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).first; } - static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + __host__ __device__ static auto MakeAGridDescriptor_M_K( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) { return matrix_padder.PadADescriptor_M_K( MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).second); @@ -212,21 +216,24 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm // // B (alias of B0) // - static auto MakeB0GridDescriptorPair(const std::vector& b0_gs_ns_ks_lengths_vec, - const std::vector& b0_gs_ns_ks_strides_vec) + __host__ __device__ static auto MakeB0GridDescriptorPair( + const std::array& b0_gs_ns_ks_lengths_vec, + const std::array& b0_gs_ns_ks_strides_vec) { return MakeGridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec); } // TODO: rename to G_MRaw_NRaw - static auto MakeB0GridDescriptor_G_N_K(const std::vector& 
b0_gs_ns_ks_lengths_vec, - const std::vector& b0_gs_ns_ks_strides_vec) + __host__ __device__ static auto MakeB0GridDescriptor_G_N_K( + const std::array& b0_gs_ns_ks_lengths_vec, + const std::array& b0_gs_ns_ks_strides_vec) { return MakeB0GridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec).first; } - static auto MakeB0GridDescriptor_N_K(const std::vector& b0_gs_ns_ks_lengths_vec, - const std::vector& b0_gs_ns_ks_strides_vec) + __host__ __device__ static auto MakeB0GridDescriptor_N_K( + const std::array& b0_gs_ns_ks_lengths_vec, + const std::array& b0_gs_ns_ks_strides_vec) { // alias of matrix_padder.PadB0Descriptor_N_K return matrix_padder.PadBDescriptor_N_K( @@ -282,21 +289,24 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm // // B1 // - static auto MakeB1GridDescriptorPair(const std::vector& b1_gs_os_ns_lengths_vec, - const std::vector& b1_gs_os_ns_strides_vec) + __host__ __device__ static auto MakeB1GridDescriptorPair( + const std::array& b1_gs_os_ns_lengths_vec, + const std::array& b1_gs_os_ns_strides_vec) { return MakeGridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec); } // TODO: rename to G_NRaw_KRaw - static auto MakeB1GridDescriptor_G_N_K(const std::vector& b1_gs_os_ns_lengths_vec, - const std::vector& b1_gs_os_ns_strides_vec) + __host__ __device__ static auto MakeB1GridDescriptor_G_N_K( + const std::array& b1_gs_os_ns_lengths_vec, + const std::array& b1_gs_os_ns_strides_vec) { return MakeB1GridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec).first; } - static auto MakeB1GridDescriptor_N_K(const std::vector& b1_gs_os_ns_lengths_vec, - const std::vector& b1_gs_os_ns_strides_vec) + __host__ __device__ static auto MakeB1GridDescriptor_N_K( + const std::array& b1_gs_os_ns_lengths_vec, + const std::array& b1_gs_os_ns_strides_vec) { // alias of matrix_padder.PadB1Descriptor_O_N return matrix_padder.PadB1Descriptor_N_K( @@ -353,21 +363,24 @@ struct 
TransformBatchedContractionContractionToBatchedGemmGemm // // C // - static auto MakeCGridDescriptorPair(const std::vector& c_gs_ms_os_lengths_vec, - const std::vector& c_gs_ms_os_strides_vec) + __host__ __device__ static auto MakeCGridDescriptorPair( + const std::array& c_gs_ms_os_lengths_vec, + const std::array& c_gs_ms_os_strides_vec) { return MakeGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec); } // TODO: rename to G_MRaw_NRaw - static auto MakeCGridDescriptor_G_M_N(const std::vector& c_gs_ms_os_lengths_vec, - const std::vector& c_gs_ms_os_strides_vec) + __host__ __device__ static auto MakeCGridDescriptor_G_M_N( + const std::array& c_gs_ms_os_lengths_vec, + const std::array& c_gs_ms_os_strides_vec) { return MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).first; } - static auto MakeCGridDescriptor_M_N(const std::vector& c_gs_ms_os_lengths_vec, - const std::vector& c_gs_ms_os_strides_vec) + __host__ __device__ static auto MakeCGridDescriptor_M_N( + const std::array& c_gs_ms_os_lengths_vec, + const std::array& c_gs_ms_os_strides_vec) { return matrix_padder.PadCDescriptor_M_N( MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).second); diff --git a/script/unet_mha.sh b/script/unet_mha.sh new file mode 100644 index 00000000000..ce50aadab51 --- /dev/null +++ b/script/unet_mha.sh @@ -0,0 +1,52 @@ +#!/bin/bash +while getopts e: flag +do + case "${flag}" in + e) executable=${OPTARG};; + esac +done +echo "CK-NAVI31 Performance Test: MHA for AITemplate" + +VERIFICATION=0 +INITIALIZE=1 +TIMING=1 + +ALL_TEST_CASE=0 +SELF_ATTENTION=1 +CROSS_ATTENTION=0 +CAUSAL_MASK=0 +# self attention with causal mask +if [ $ALL_TEST_CASE -eq 1 ] || { [ $SELF_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 1 ]; }; then + echo "Test launched: self attention with causal mask" + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 4096 4096 40 40 2 8 0.158113881945610 1 1 + 
./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 1024 1024 80 80 2 8 0.111803397536277 1 1 + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 256 256 160 160 2 8 0.079056940972805 1 1 + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 64 64 160 160 2 8 0.079056940972805 1 1 +fi + +# cross attention with causal mask +if [ $ALL_TEST_CASE -eq 1 ] || { [ $CROSS_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 1 ]; }; then + echo "Test launched: cross attention with causal mask" + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 4096 64 40 40 2 8 0.158113881945610 1 1 + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 1024 64 80 80 2 8 0.111803397536277 1 1 + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 256 64 160 160 2 8 0.079056940972805 1 1 + ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 64 64 160 160 2 8 0.079056940972805 1 1 +fi + +# self attention without causal mask +if [ $ALL_TEST_CASE -eq 1 ] || { [ $SELF_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 0 ]; }; then + echo "Test launched: self attention without causal mask" + $executable $VERIFICATION $INITIALIZE $TIMING 4096 4096 64 64 2 5 0.125 1 1 + $executable $VERIFICATION $INITIALIZE $TIMING 1024 1024 64 64 2 10 0.125 1 1 + $executable $VERIFICATION $INITIALIZE $TIMING 256 256 64 64 2 20 0.125 1 1 + $executable $VERIFICATION $INITIALIZE $TIMING 64 64 64 64 2 20 0.125 1 1 +fi + +# cross attention without causal mask +if [ $ALL_TEST_CASE -eq 1 ] || { [ $CROSS_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 0 ]; }; then + echo "Test launched: cross attention without causal mask" + $executable $VERIFICATION 1 $TIMING 4096 64 40 40 2 8 0.158113881945610 1 1 
+ $executable $VERIFICATION 1 $TIMING 1024 64 80 80 2 8 0.111803397536277 1 1 + $executable $VERIFICATION 1 $TIMING 256 64 160 160 2 8 0.079056940972805 1 1 + $executable $VERIFICATION 1 $TIMING 64 64 160 160 2 8 0.079056940972805 1 1 +fi \ No newline at end of file From d44f666016c4cce6283cd7a8f94e0475d9675368 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 13 Jun 2023 07:38:50 +0000 Subject: [PATCH 085/118] deprecate inline asm wmma --- .../common_wmma.hpp | 2 +- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 18 +- ...e_scale_softmax_gemm_permute_wmma_fp16.cpp | 10 +- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 2 +- .../gpu/block/blockwise_gemm_wmma.hpp | 505 ------------------ ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 4 +- include/ck/utility/amd_inline_asm.hpp | 12 - include/ck/utility/amd_wmma.hpp | 4 - 8 files changed, 19 insertions(+), 538 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp index 205423a0a48..ae769ff1d38 100644 --- a/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/common_wmma.hpp @@ -39,7 +39,7 @@ using S = ck::Sequence; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 39d960299e1..6205e7db08d 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -56,24 
+56,24 @@ using DeviceConvFwdInstance = 64, // MPerBlock 64, // NPerBlock 64, // KPerBlock - 4, // K1 + 8, // K1 16, // MPerWMMA 16, // NPerWMMA 4, // MRepeat 1, // NRepeat - S<4, 8, 4>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<4, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // ABlockTransferSrcAccessOrder 2, // ABlockTransferSrcVectorDim - 1, // ABlockTransferSrcScalarPerVector - 1, // ABlockTransferDstScalarPerVector_AK1 + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 true, // ABlockLdsExtraM - S<4, 8, 4>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<4, 32, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder S<1, 0, 2>, // BBlockTransferSrcAccessOrder 2, // BBlockTransferSrcVectorDim - 1, // BBlockTransferSrcScalarPerVector - 1, // BBlockTransferDstScalarPerVector_BK1 + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 true, // BBlockLdsExtraN 1, 1, @@ -278,9 +278,9 @@ bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[]) switch(conv_param.num_dim_spatial_) { - case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); + // case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); - case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); + // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); } return false; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp index cbd23d00677..6cacb6416c1 100644 --- 
a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -67,7 +67,8 @@ static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecial static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; -using DeviceGemmInstance = +using DeviceMHAFactory = +std::tuple< ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, @@ -99,7 +100,8 @@ using DeviceGemmInstance = 128, // MPerBlock 64, // LPerBlock 64, // KPerBlock - 8, // K1 + 8, // AK1 + 8, // BK1 // Gemm 1 64, // NPerBlock 64, // LTilePerBlock @@ -136,8 +138,8 @@ using DeviceGemmInstance = 2, // CShuffleNWmmaPerWavePerShuffle S<1, 64, 1, 4>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock 8, // CShuffleBlockTransferScalarPerVector_NPerBlock - MaskingSpec>; // MaskingSpecialization - + MaskingSpec> // MaskingSpecialization + >; // Ref Gemm0: fp16 in, fp32 out using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm::type b_thread_copy_; }; -// block wise level pipe designed for inline asm -template -/* A: K0PerBlock x MPerBlock x K1 - * B: K0PerBlock x NPerBlock x K1 - * C: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs - * KPACK == WMMA_K = 16 - */ -struct BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto WmmaK = Number<16>{}; - - using ThisThreadBlock = ThisThreadBlock; - - // Hardcode of WaveSize, since current HIP 
Runtime(5.4.0-10984) could not return correct one. - static constexpr index_t WaveSize = 32; - - static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); - static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); - static constexpr index_t KPerBlock = - BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); - - static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); - static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); - static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); - static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); - - static constexpr auto wmma_gemm = WmmaGemm{}; - - static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); - static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); - - StaticBufferTupleOfVector - c_thread_buf_; - - __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } - - __device__ static auto GetWaveIdx() - { - const index_t thread_id = ThisThreadBlock::GetThreadId(); - - constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); - } - - __device__ static auto CalculateAThreadOriginDataIndex() - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - - const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); - // |KRepeat |MRepeat|MWave |MLane |KPack - return make_tuple(0, 0, waveId_m, WMMA_a_idx, 0); - } - - __device__ static auto CalculateBThreadOriginDataIndex() - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_n = wave_idx[I1]; - - const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); - // |KRepeat |NRepeat|Nwave |NLane |KPack - return make_tuple(0, 0, waveId_n, WMMA_b_idx, 0); - } - - 
template - __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); - - constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( - make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; - const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( - make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; - - return make_tuple(c_thread_m, c_thread_n); - } - - __host__ __device__ BlockwiseGemmWMMA_k0mk1_k0nk1_m0m1m2n0n1n2m3_CShuffle_FIFO() - { - static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && - BK0NK1BlockDesc::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, - "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); - - static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && - NPerBlock % (NPerWMMA * NRepeat) == 0, - "wrong!"); - } - // Thread level, register decriptor. 
Vector-write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I0]; - constexpr auto NThreadPerSubGroup = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I1]; - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave - // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, - I1, - MSubGroup, - Number{}, - I1, - NThreadPerSubGroup, - MAccVgprs)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - - // Provide dimension size - __host__ __device__ static constexpr auto - GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = - 
make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); - } - - __host__ __device__ static constexpr auto MakeABlockDescriptor_K0_M0_M1_M2_K1() - { - return transform_tensor_descriptor( - AK0MK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - __host__ __device__ static constexpr auto MakeBBlockDescriptor_K0_N0_N1_N2_K1() - { - return transform_tensor_descriptor( - BK0NK1BlockDesc{}, - make_tuple(make_pass_through_transform(Number{}), - make_unmerge_transform( - make_tuple(Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{})); - } - - // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma - static constexpr auto a_block_desc_k0_m0_m1_m2_k1 = MakeABlockDescriptor_K0_M0_M1_M2_K1(); - static constexpr auto b_block_desc_k0_n0_n1_n2_k1 = MakeBBlockDescriptor_K0_N0_N1_N2_K1(); - - template - __device__ void Run(const ABlockBuffer& a_block_buf, - const BBlockBuffer& b_block_buf, - CThreadBuffer& c_thread_buf) const - { - auto a_thread_buf = make_static_buffer( - a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( - b_thread_desc_.GetElementSpaceSize()); - - // TODO: Fix it, MRepeat < NRepeat - constexpr auto RepeatDiff = MRepeat - NRepeat; - - // Read all Mrepeat, Nrepeat - static_for<0, NRepeat, 1>{}([&](auto iN) { - b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1, - make_tuple(I0, Number{}, 
I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); - }); - - static_for<0, MRepeat, 1>{}([&](auto iM) { - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(I0, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - }); - - // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat - static_for<0, RepeatDiff, 1>{}([&](auto iCut) { - static_for<0, NRepeat, 1>{}([&](auto iN) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - // s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - // s_nop(); - }); - if constexpr(KPerBlock > WmmaK) - { - // Read Consumed Next inner loop A - a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - } - }); - - static_for{}([&](auto iWmmaK) { - // Stage 2: Run FIFO fashion loopover in Square - static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) { - // Row Repeatation - static_for{}([&](auto iN) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t 
c_offset = c_thread_desc_.CalculateOffset( - make_tuple(WmmaInnerloop + RepeatDiff, iN, 0)); - // s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - // s_nop(); - }); - - // Read Consumed Next inner loop A - a_thread_copy_.Run( - a_block_desc_k0_m0_m1_m2_k1, - make_tuple( - Number{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - - // Col Repeatation - static_for{}([&](auto iM) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); - // s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - // s_nop(); - }); - // Read Consumed Next inner loop B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, Number{}, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - b_thread_buf); - }); - - // Stage 1: Cut to Repeat Retangle to Square, assume MRepeat > NRepeat - static_for<0, RepeatDiff, 1>{}([&](auto iCut) { - static_for<0, NRepeat, 1>{}([&](auto iN) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr 
index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(iCut, iN, 0)); - // s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - // s_nop(); - }); - if constexpr(KPerBlock > WmmaK) - { - a_thread_copy_.Run( - a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number<(iWmmaK + WmmaK) / A_K1>{}, Number{}, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, Number{}, I0, I0, I0), - a_thread_buf); - } - }); - }); - - // Stage 2: Run FIFO fashion loopover in Square - static_for<0, NRepeat, 1>{}([&](auto WmmaInnerloop) { - // Row Repeatation - static_for{}([&](auto iN) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(WmmaInnerloop + RepeatDiff, iN, 0)); - // s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - // s_nop(); - }); - - // Col Repeatation - static_for{}([&](auto iM) { - vector_type a_thread_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto iK) { - a_thread_vec.template AsType()(iK) = - a_thread_buf[Number{}]; - b_thread_vec.template AsType()(iK) = - b_thread_buf[Number{}]; - }); - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(iM, WmmaInnerloop, 0)); - // s_nop(); - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template 
AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - // s_nop(); - }); - }); - } - - protected: - // A[M0, M1, M2, K0 = WmmaK] - static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); - - // B[N0, N1, N2, K0 = WmmaK] - static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, I1, I1, Number{})); - - // C[M, N, NumRegWMMA] - static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); - - using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - A_K1, - A_K1>; - - using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4>, - 4, - B_K1, - B_K1>; - - AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; - BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; -}; - } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index 3a4ba24d762..54817814bae 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -175,8 +175,8 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle static constexpr auto BEnableLds_auto = MWaves == 1 ? 
false : true; // If true, LDS is used unconditionally - static constexpr auto AEnableLds_manu = false; - static constexpr auto BEnableLds_manu = false; + static constexpr auto AEnableLds_manu = true; + static constexpr auto BEnableLds_manu = true; static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumGemmKPrefetchStage > 1); diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp index 43baa817d36..7c92e75a6a5 100644 --- a/include/ck/utility/amd_inline_asm.hpp +++ b/include/ck/utility/amd_inline_asm.hpp @@ -355,17 +355,5 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a, c3); } -// Ranged input operand -__device__ void amd_assembly_wmma_f32_16x16x16_f16_w32(half16_t a, half16_t b, float8_t& c) -{ -#if defined(__gfx11__) - asm volatile("v_wmma_f32_16x16x16_f16 %0, %1, %2, %0" : "=v"(c) : "v"(a), "v"(b), "0"(c)); -#else - ignore = a; - ignore = b; - ignore = c; -#endif -} - } // namespace ck #endif diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp index dd7f0b770a1..3ab24eed046 100644 --- a/include/ck/utility/amd_wmma.hpp +++ b/include/ck/utility/amd_wmma.hpp @@ -21,10 +21,6 @@ struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> template __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) { - // * Inline assembly need to elimate the duplicated data load, compiler won't help you - // delete them. 
- // amd_assembly_wmma_f32_16x16x16_f16_w32( - // reg_a, reg_b, reg_c.template AsType()(Number<0>{})); #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); From 6c1aa33a03756b7f37fd65a8748e80cf7c81187d Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 13 Jun 2023 09:22:36 +0000 Subject: [PATCH 086/118] Bug fix: double lds skip --- .../tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index b9a1a015157..3ce216e2454 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -495,8 +495,8 @@ struct GridwiseGemmPipeline_v1<1, false, false> CThreadBuffer& c_thread_buf, index_t num_loop) { - constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); - constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0); + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0); + constexpr auto a_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0); auto b_block_buf_switch = b_block_buf; auto a_block_buf_switch = a_block_buf; From 83d926dcb1393a800f16d91d3609901ee87084de Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 15 Jun 2023 03:25:00 +0000 Subject: [PATCH 087/118] clang-format --- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 53 ++++++++++--------- ...e_scale_softmax_gemm_permute_wmma_fp16.cpp | 9 ++-- include/ck/utility/amd_inline_asm.hpp | 12 ++--- 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc 
b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 6205e7db08d..325d42dbe47 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -49,32 +49,32 @@ using DeviceConvFwdInstance = InElementOp, WeiElementOp, OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // Prefetch stage - 128, // BlockSize - 64, // MPerBlock - 64, // NPerBlock - 64, // KPerBlock - 8, // K1 - 16, // MPerWMMA - 16, // NPerWMMA - 4, // MRepeat - 1, // NRepeat + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // Prefetch stage + 128, // BlockSize + 64, // MPerBlock + 64, // NPerBlock + 64, // KPerBlock + 8, // K1 + 16, // MPerWMMA + 16, // NPerWMMA + 4, // MRepeat + 1, // NRepeat S<4, 32, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_AK1 - true, // ABlockLdsExtraM + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + true, // ABlockLdsExtraM S<4, 32, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_BK1 - true, // BBlockLdsExtraN + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // 
BBlockTransferDstScalarPerVector_BK1 + true, // BBlockLdsExtraN 1, 1, S<1, 16, 1, 8>, @@ -279,8 +279,9 @@ bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[]) switch(conv_param.num_dim_spatial_) { // case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); - case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); - // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); + case 2: + return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); + // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); } return false; diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp index 6cacb6416c1..672b27ef60a 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -67,9 +67,8 @@ static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecial static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; -using DeviceMHAFactory = -std::tuple< - ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< +using DeviceMHAFactory = + std::tuple, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock 8, // CShuffleBlockTransferScalarPerVector_NPerBlock - MaskingSpec> // MaskingSpecialization - >; + MaskingSpec> // MaskingSpecialization + >; // Ref Gemm0: fp16 in, fp32 out using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm(a), bit_cast(b0), c0, false); - c1 = __builtin_amdgcn_sdot4(bit_cast(a), 
bit_cast(b1), c1, false); + c0 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b0), c0, false); + c1 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b1), c1, false); #endif } @@ -257,10 +257,10 @@ __device__ void amd_assembly_outer_product_1x4(int8x4_t a, "2"(c2), "3"(c3)); #else - c0 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b0), c0, false); - c1 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b1), c1, false); - c2 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b2), c2, false); - c3 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b3), c3, false); + c0 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b0), c0, false); + c1 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b1), c1, false); + c2 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b2), c2, false); + c3 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b3), c3, false); #endif } From 437779593ed3c4b96aec09f3bdd16352378bf6fa Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 19 Jun 2023 15:18:05 +0800 Subject: [PATCH 088/118] Fix errors in 1. example, fmha 2. gridwise pipeline 3. deviceop, fmha, change some containers from vector to array --- ...atched_gemm_scale_softmax_gemm_permute.inc | 314 +++++++----------- 1 file changed, 126 insertions(+), 188 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 9add86cc1bf..0b876af952f 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
int run(int argc, char* argv[]) { @@ -117,41 +117,6 @@ int run(int argc, char* argv[]) b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); break; - case 4: // A, B0, B1 1 - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); - break; - case 5: // Rand: b1 b0; unit: a - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - break; - case 6: // Rand: a b0 ; unit: B1 - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); - break; - case 7: // Rand: a b1 ; unit: b0 - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - break; - case 8: // Rand: a ; unit: b0 b1 - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); - break; - case 9: // Rand: b0 ; unit: a b1 - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); - break; - case 10: // Rand: b1 ; unit: a b0 - a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); - b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); - b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); - break; default: a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); @@ -175,166 +140,139 @@ int run(int argc, char* argv[]) auto c_element_op = CElementOp{}; // do GEMM - float best_perf = .0; - float 
best_time = .0; - int not_pass = 0; - std::string best_kernel = ""; - printf("Verification: %s\n", do_verification ? "ON" : "OFF"); // TODO ANT: replace array with vector? - ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { - const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); - - using DeviceMHAInstance = ck::remove_cvref_t; - auto gemm = DeviceMHAInstance{}; - auto invoker = gemm.MakeInvoker(); - auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), - static_cast(b0_device_buf.GetDeviceBuffer()), - static_cast(b1_device_buf.GetDeviceBuffer()), - static_cast(c_device_buf.GetDeviceBuffer()), - M, - N, - K, - O, - G0, - G1, - alpha, - input_permute, - output_permute); - - if(!gemm.IsSupportedArgument(argument)) - { - std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( + static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + {}, // std::array p_acc0_biases; + {}, // std::array p_acc1_biases; + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // std::array, 1>{acc0_biases_gs_ms_ns_lengths}, + {}, // std::array, 1>{acc0_biases_gs_ms_ns_strides}, + {}, // std::array, 1>{acc1_biases_gs_ms_os_lengths}, + {}, // std::array, 1>{acc1_biases_gs_ms_os_strides}, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; - // return 0; - } + return 0; + } - ck::index_t BatchCount = G0 * G1; + ck::index_t BatchCount = G0 * G1; - float ave_time = 
invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + - sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * - BatchCount; + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; - float tflops = static_cast(flop) / 1.E9 / ave_time; + float tflops = static_cast(flop) / 1.E9 / ave_time; - float gb_per_sec = num_btype / 1.E6 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec - << " GB/s, " << gemm.GetTypeString() << std::endl; - if(tflops > best_perf) - { - best_perf = tflops; - best_time = ave_time * 1000; - best_kernel = gemm.GetTypeString(); - } - if(do_verification) + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + 
b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = DeviceGemmInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) { - c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); - - Tensor a_g_m_k({BatchCount, M, K}); - Tensor b0_g_k_n({BatchCount, K, N}); - Tensor b1_g_n_o({BatchCount, N, O}); - Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 - Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax - Tensor c_g_m_o_host_result({BatchCount, M, O}); // 
scratch object after gemm1 - - // permute - a_gs_ms_ks.ForEach([&](auto& self, auto idx) { - a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); - }); - b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { - b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); - }); - b1_gs_os_ns.ForEach([&](auto& self, auto idx) { - b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); - }); - - // gemm 0 - auto ref_gemm0 = ReferenceGemm0Instance{}; - auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); - auto ref_gemm0_argument = ref_gemm0.MakeArgument( - a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); - - ref_gemm0_invoker.Run(ref_gemm0_argument); - - // masking - const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); - acc0_g_m_n.ForEach([&](auto& self, auto idx) { - if(mask.IsMaskedElement(idx[1], idx[2])) - self(idx) = -ck::NumericLimits::Infinity(); - }); - - // softmax - auto ref_softmax = ReferenceSoftmaxInstance{}; - auto ref_softmax_invoker = ref_softmax.MakeInvoker(); - auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); - - ref_softmax_invoker.Run(ref_softmax_argument); - - // gemm1 - auto ref_gemm1 = ReferenceGemm1Instance{}; - auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); - auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n, - b1_g_n_o, - c_g_m_o_host_result, - PassThrough{}, - b1_element_op, - c_element_op); - - ref_gemm1_invoker.Run(ref_gemm1_argument); - - // permute - c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { - const size_t& g0 = idx[0]; - const size_t& g1 = idx[1]; - - const size_t g = g0 * G1 + g1; - - self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); - }); - - // default absolute error and relative error is 0.001 - double rtol = 1e-3; - double atol = 1e-3; - - // when BF16 is taken, set absolute error and relative error to 0.01 - if(std::is_same_v && std::is_same_v && - std::is_same_v && std::is_same_v) - { - rtol = 1e-2; - atol = 1e-2; - } - - 
bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, - c_gs_ms_os_host_result.mData, - "Error: Incorrect results!", - rtol, - atol); - printf("Verification: %s, Pass: %s\n", - do_verification ? "ON" : "OFF", - this_run_verification ? "YES" : "NO"); - - if(!this_run_verification) - { - not_pass = 1; - printf("%d th MHA instance verification Failed \n", i.value); - } + rtol = 1e-2; + atol = 1e-2; } - }); - std::cout << "---------------------------------------------------------------------------------" - "-----------" - << std::endl; - std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M - << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; - std::cout << "---------------------------------------------------------------------------------" - "-----------" - << std::endl; - std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time - << " us" << std::endl; - std::cout << "---------------------------------------------------------------------------------" - "-----------" - << std::endl; - return not_pass; + + return ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol) + ? 
0 + : 1; + } + + return 0; } From b010b095f78450112607850c3527f5e18ae57ab6 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 19 Jun 2023 15:18:29 +0800 Subject: [PATCH 089/118] part2 of previous commit --- example/13_pool2d_fwd/pool2d_fwd_common.hpp | 6 +- ...e_scale_softmax_gemm_permute_wmma_fp16.cpp | 2 +- ...m_scale_softmax_gemm_permute_wmma_fp16.cpp | 2 +- .../cross_attention_forward_wmma_fp16.cpp | 2 +- ...d_gemm_scale_softmax_gemm_permute_wmma.inc | 340 ++++++++++++++++++ ...ntion.inc => run_cross_attention_wmma.inc} | 0 ...ention.inc => run_self_attention_wmma.inc} | 0 .../self_attention_forward_wmma_fp16.cpp | 2 +- ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 116 ++++-- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 6 +- .../grid/gridwise_gemm_pipeline_selector.hpp | 6 +- 11 files changed, 439 insertions(+), 43 deletions(-) create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc rename example/32_batched_gemm_scale_softmax_gemm/{run_cross_attention.inc => run_cross_attention_wmma.inc} (100%) rename example/32_batched_gemm_scale_softmax_gemm/{run_self_attention.inc => run_self_attention_wmma.inc} (100%) diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp index 1157ccd3870..5eb41d5d5a9 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -119,9 +119,9 @@ bool pool_test(bool do_verification, {N, C, Hi, Wi}, {Y, X}, {N, C, Ho, Wo}, - {C * Hi * Wi, 1, Wi * C, C}, - {C * Ho * Wo, 1, Wo * C, C}, - {C * Ho * Wo, 1, Wo * C, C}, + {}, + {}, + {}, window_strides, input_left_pads, input_right_pads, diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp index 672b27ef60a..2c7bacfc4eb 100644 --- 
a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -161,6 +161,6 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< B1ElementOp, CElementOp>; -#include "run_batched_gemm_scale_softmax_gemm_permute.inc" +#include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc" int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp index 156cc499c61..d9ab645ee9b 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp @@ -283,6 +283,6 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< B1ElementOp, CElementOp>; -#include "run_batched_gemm_scale_softmax_gemm_permute.inc" +#include "run_batched_gemm_scale_softmax_gemm_permute_wmma.inc" int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp index 8afbd0c13b9..c5b6c7efbe9 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp @@ -327,6 +327,6 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< B1ElementOp, CElementOp>; -#include "run_cross_attention.inc" +#include "run_cross_attention_wmma.inc" int main(int argc, char* argv[]) { return run(argc, argv); } diff --git 
a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc new file mode 100644 index 00000000000..9add86cc1bf --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape for A/B0/B1/C + // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o + ck::index_t M = 120; + ck::index_t N = 1000; + ck::index_t K = 64; + ck::index_t O = 128; + + // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape + // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) + // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t G0 = 7; + ck::index_t G1 = 13; + + float alpha = 1; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + G0 = std::stoi(argv[8]); + G1 = std::stoi(argv[9]); + + alpha = std::stof(argv[10]); + + input_permute = std::stoi(argv[11]); + output_permute = std::stoi(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 11: M, N, K, O, G0, G1\n"); + printf("arg10: scale (alpha)\n"); + printf("arg11 
to 12: input / output permute\n"); + exit(0); + } + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] + : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ? std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] + : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + 
b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + case 4: // A, B0, B1 1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 5: // Rand: b1 b0; unit: a + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 6: // Rand: a b0 ; unit: B1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 7: // Rand: a b1 ; unit: b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 8: // Rand: a ; unit: b0 b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 9: // Rand: b0 ; unit: a b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 10: // Rand: b1 ; unit: a b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + 
b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data()); + b1_device_buf.ToDevice(b1_gs_os_ns.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + float best_perf = .0; + float best_time = .0; + int not_pass = 0; + std::string best_kernel = ""; + printf("Verification: %s\n", do_verification ? "ON" : "OFF"); + // TODO ANT: replace array with vector? 
+ ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { + const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + + using DeviceMHAInstance = ck::remove_cvref_t; + auto gemm = DeviceMHAInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + G0, + G1, + alpha, + input_permute, + output_permute); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + // return 0; + } + + ck::index_t BatchCount = G0 * G1; + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + if(tflops > best_perf) + { + best_perf = tflops; + best_time = ave_time * 1000; + best_kernel = gemm.GetTypeString(); + } + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = 
self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n, + b1_g_n_o, + c_g_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect 
results!", + rtol, + atol); + printf("Verification: %s, Pass: %s\n", + do_verification ? "ON" : "OFF", + this_run_verification ? "YES" : "NO"); + + if(!this_run_verification) + { + not_pass = 1; + printf("%d th MHA instance verification Failed \n", i.value); + } + } + }); + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time + << " us" << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + return not_pass; +} diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/run_cross_attention.inc rename to example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc similarity index 100% rename from example/32_batched_gemm_scale_softmax_gemm/run_self_attention.inc rename to example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc diff --git a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp index 91aebf152dd..3f964908ed7 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp +++ 
b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp @@ -283,6 +283,6 @@ using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm< B1ElementOp, CElementOp>; -#include "run_self_attention.inc" +#include "run_self_attention_wmma.inc" int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index 3fad319e906..b66b983b2ec 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -252,16 +252,16 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1Spec, CSpec>; - static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, - const std::vector& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptor_AK0_M_AK1(const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) { return Transform::MakeAGridDescriptor_AK0_M_AK1( Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), Number{}); } - static auto MakeBGridDescriptor_BK0_N_BK1(const std::vector& b_gs_ns_ks_lengths_vec, - const std::vector& b_gs_ns_ks_strides_vec) + static auto MakeBGridDescriptor_BK0_N_BK1(const std::array& b_gs_ns_ks_lengths_vec, + const std::array& b_gs_ns_ks_strides_vec) { return Transform::MakeB0GridDescriptor_BK0_N_BK1( Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), @@ -269,8 +269,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle } static auto - MakeB1GridDescriptor_BK0_N_BK1(const std::vector& b1_gs_gemm1ns_gemm1ks_lengths_vec, - const std::vector& b1_gs_gemm1ns_gemm1ks_strides_vec) + MakeB1GridDescriptor_BK0_N_BK1(const 
std::array& b1_gs_gemm1ns_gemm1ks_lengths_vec, + const std::array& b1_gs_gemm1ns_gemm1ks_strides_vec) { return Transform::MakeB1GridDescriptor_BK0_N_BK1( Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, @@ -453,14 +453,14 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle CDataType* p_c_grid, const std::array p_acc0_biases, const std::array p_acc1_biases, - const std::vector& a_gs_ms_ks_lengths, - const std::vector& a_gs_ms_ks_strides, - const std::vector& b_gs_ns_ks_lengths, - const std::vector& b_gs_ns_ks_strides, - const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b_gs_ns_ks_lengths, + const std::array& b_gs_ns_ks_strides, + const std::array& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::array& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::array& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::array& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides const std::array, NumD0Tensor>& acc0_biases_gs_ms_ns_lengths, const std::array, NumD0Tensor>& acc0_biases_gs_ms_ns_strides, const std::array, NumD1Tensor>& @@ -835,20 +835,48 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1ElementwiseOperation b1_element_op, C1DEElementwiseOperation c1de_element_op) { + constexpr auto dimension = NumDimG + NumDimM + NumDimN; + + std::array a_gs_ms_ks_lengths_{}; + std::array a_gs_ms_ks_strides_{}; + std::array b_gs_ns_ks_lengths_{}; + std::array b_gs_ns_ks_strides_{}; + std::array b1_gs_gemm1ns_gemm1ks_lengths_{}; // b1_gs_os_ns_lengths + std::array b1_gs_gemm1ns_gemm1ks_strides_{}; // b1_gs_os_ns_strides + std::array c_gs_ms_gemm1ns_lengths_{}; // 
c_gs_ms_os_lengths + std::array c_gs_ms_gemm1ns_strides_{}; // c_gs_ms_os_strides + + std::copy(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.begin()+dimension, a_gs_ms_ks_lengths_.begin()); + std::copy(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.begin()+dimension, a_gs_ms_ks_strides_.begin()); + std::copy(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.begin()+dimension, b_gs_ns_ks_lengths_.begin()); + std::copy(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.begin()+dimension, b_gs_ns_ks_strides_.begin()); + std::copy(b1_gs_gemm1ns_gemm1ks_lengths.begin(), + b1_gs_gemm1ns_gemm1ks_lengths.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_lengths_.begin()); // b1_gs_os_ns_lengths + std::copy(b1_gs_gemm1ns_gemm1ks_strides.begin(), + b1_gs_gemm1ns_gemm1ks_strides.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_strides_.begin()); // b1_gs_os_ns_strides + std::copy(c_gs_ms_gemm1ns_lengths.begin(), + c_gs_ms_gemm1ns_lengths.begin()+dimension, + c_gs_ms_gemm1ns_lengths_.begin()); // c_gs_ms_os_lengths + std::copy(c_gs_ms_gemm1ns_strides.begin(), + c_gs_ms_gemm1ns_strides.begin()+dimension, + c_gs_ms_gemm1ns_strides_.begin()); // c_gs_ms_os_strides + return Argument{p_a, p_b, p_b1, p_c, p_acc0_biases, p_acc1_biases, - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b_gs_ns_ks_lengths, - b_gs_ns_ks_strides, - b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + a_gs_ms_ks_lengths_, + a_gs_ms_ks_strides_, + b_gs_ns_ks_lengths_, + b_gs_ns_ks_strides_, + b1_gs_gemm1ns_gemm1ks_lengths_, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides_, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths_, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides_, // c_gs_ms_os_strides acc0_biases_gs_ms_ns_lengths, acc0_biases_gs_ms_ns_strides, acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths @@ -891,20 +919,48 @@ struct 
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1ElementwiseOperation b1_element_op, C1DEElementwiseOperation c1de_element_op) override { + constexpr auto dimension = NumDimG + NumDimM + NumDimN; + + std::array a_gs_ms_ks_lengths_{}; + std::array a_gs_ms_ks_strides_{}; + std::array b_gs_ns_ks_lengths_{}; + std::array b_gs_ns_ks_strides_{}; + std::array b1_gs_gemm1ns_gemm1ks_lengths_{}; // b1_gs_os_ns_lengths + std::array b1_gs_gemm1ns_gemm1ks_strides_{}; // b1_gs_os_ns_strides + std::array c_gs_ms_gemm1ns_lengths_{}; // c_gs_ms_os_lengths + std::array c_gs_ms_gemm1ns_strides_{}; // c_gs_ms_os_strides + + std::copy(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.begin()+dimension, a_gs_ms_ks_lengths_.begin()); + std::copy(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.begin()+dimension, a_gs_ms_ks_strides_.begin()); + std::copy(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.begin()+dimension, b_gs_ns_ks_lengths_.begin()); + std::copy(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.begin()+dimension, b_gs_ns_ks_strides_.begin()); + std::copy(b1_gs_gemm1ns_gemm1ks_lengths.begin(), + b1_gs_gemm1ns_gemm1ks_lengths.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_lengths_.begin()); // b1_gs_os_ns_lengths + std::copy(b1_gs_gemm1ns_gemm1ks_strides.begin(), + b1_gs_gemm1ns_gemm1ks_strides.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_strides_.begin()); // b1_gs_os_ns_strides + std::copy(c_gs_ms_gemm1ns_lengths.begin(), + c_gs_ms_gemm1ns_lengths.begin()+dimension, + c_gs_ms_gemm1ns_lengths_.begin()); // c_gs_ms_os_lengths + std::copy(c_gs_ms_gemm1ns_strides.begin(), + c_gs_ms_gemm1ns_strides.begin()+dimension, + c_gs_ms_gemm1ns_strides_.begin()); // c_gs_ms_os_strides + return std::make_unique(static_cast(p_a), static_cast(p_b), static_cast(p_b1), static_cast(p_c), p_acc0_biases, // cast in struct Argument p_acc1_biases, // cast in struct Argument - a_gs_ms_ks_lengths, - a_gs_ms_ks_strides, - b_gs_ns_ks_lengths, - b_gs_ns_ks_strides, - b1_gs_gemm1ns_gemm1ks_lengths, // 
b1_gs_os_ns_lengths - b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + a_gs_ms_ks_lengths_, + a_gs_ms_ks_strides_, + b_gs_ns_ks_lengths_, + b_gs_ns_ks_strides_, + b1_gs_gemm1ns_gemm1ks_lengths_, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides_, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths_, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides_, // c_gs_ms_os_strides acc0_biases_gs_ms_ns_lengths, acc0_biases_gs_ms_ns_strides, acc1_biases_gs_ms_gemm1ns_lengths, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index bc1aa936456..ef7f91ab8d5 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -119,10 +119,10 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma using GridwiseGemmPipe = remove_cvref_t())>; + LoopSched, + AEnableLds, + B0EnableLds>())>; __host__ __device__ static constexpr auto MakeABlockDescriptor() { diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index dbdf8e1ff41..48bd22a764a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -15,10 +15,10 @@ enum struct PipelineVersion }; template + LoopScheduler LoopSched = LoopScheduler::Default, + bool AEnableLds = true, + bool BEnableLds = true> constexpr auto GridwiseGemmPipeline_Selector() { if constexpr(PipelineVer == PipelineVersion::v1) From 35e5c53294b0161426ed7a5b465f01d5ee4ab78b Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 19 Jun 2023 17:15:02 +0800 Subject: [PATCH 090/118] clang 
format --- ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 78 ++++++++++++------- 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index b66b983b2ec..e8fc49a1c52 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -252,25 +252,27 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1Spec, CSpec>; - static auto MakeAGridDescriptor_AK0_M_AK1(const std::array& a_gs_ms_ks_lengths_vec, - const std::array& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptor_AK0_M_AK1( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) { return Transform::MakeAGridDescriptor_AK0_M_AK1( Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), Number{}); } - static auto MakeBGridDescriptor_BK0_N_BK1(const std::array& b_gs_ns_ks_lengths_vec, - const std::array& b_gs_ns_ks_strides_vec) + static auto MakeBGridDescriptor_BK0_N_BK1( + const std::array& b_gs_ns_ks_lengths_vec, + const std::array& b_gs_ns_ks_strides_vec) { return Transform::MakeB0GridDescriptor_BK0_N_BK1( Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), Number{}); } - static auto - MakeB1GridDescriptor_BK0_N_BK1(const std::array& b1_gs_gemm1ns_gemm1ks_lengths_vec, - const std::array& b1_gs_gemm1ns_gemm1ks_strides_vec) + static auto MakeB1GridDescriptor_BK0_N_BK1( + const std::array& b1_gs_gemm1ns_gemm1ks_lengths_vec, + const std::array& b1_gs_gemm1ns_gemm1ks_strides_vec) { return Transform::MakeB1GridDescriptor_BK0_N_BK1( Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, @@ -457,10 +459,14 @@ struct 
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle const std::array& a_gs_ms_ks_strides, const std::array& b_gs_ns_ks_lengths, const std::array& b_gs_ns_ks_strides, - const std::array& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - const std::array& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - const std::array& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - const std::array& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array& + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::array& + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::array& + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::array& + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides const std::array, NumD0Tensor>& acc0_biases_gs_ms_ns_lengths, const std::array, NumD0Tensor>& acc0_biases_gs_ms_ns_strides, const std::array, NumD1Tensor>& @@ -836,7 +842,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle C1DEElementwiseOperation c1de_element_op) { constexpr auto dimension = NumDimG + NumDimM + NumDimN; - + std::array a_gs_ms_ks_lengths_{}; std::array a_gs_ms_ks_strides_{}; std::array b_gs_ns_ks_lengths_{}; @@ -846,21 +852,29 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle std::array c_gs_ms_gemm1ns_lengths_{}; // c_gs_ms_os_lengths std::array c_gs_ms_gemm1ns_strides_{}; // c_gs_ms_os_strides - std::copy(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.begin()+dimension, a_gs_ms_ks_lengths_.begin()); - std::copy(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.begin()+dimension, a_gs_ms_ks_strides_.begin()); - std::copy(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.begin()+dimension, b_gs_ns_ks_lengths_.begin()); - std::copy(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.begin()+dimension, b_gs_ns_ks_strides_.begin()); + std::copy(a_gs_ms_ks_lengths.begin(), + a_gs_ms_ks_lengths.begin() + dimension, + a_gs_ms_ks_lengths_.begin()); + std::copy(a_gs_ms_ks_strides.begin(), + a_gs_ms_ks_strides.begin() + 
dimension, + a_gs_ms_ks_strides_.begin()); + std::copy(b_gs_ns_ks_lengths.begin(), + b_gs_ns_ks_lengths.begin() + dimension, + b_gs_ns_ks_lengths_.begin()); + std::copy(b_gs_ns_ks_strides.begin(), + b_gs_ns_ks_strides.begin() + dimension, + b_gs_ns_ks_strides_.begin()); std::copy(b1_gs_gemm1ns_gemm1ks_lengths.begin(), - b1_gs_gemm1ns_gemm1ks_lengths.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_lengths.begin() + dimension, b1_gs_gemm1ns_gemm1ks_lengths_.begin()); // b1_gs_os_ns_lengths std::copy(b1_gs_gemm1ns_gemm1ks_strides.begin(), - b1_gs_gemm1ns_gemm1ks_strides.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_strides.begin() + dimension, b1_gs_gemm1ns_gemm1ks_strides_.begin()); // b1_gs_os_ns_strides std::copy(c_gs_ms_gemm1ns_lengths.begin(), - c_gs_ms_gemm1ns_lengths.begin()+dimension, + c_gs_ms_gemm1ns_lengths.begin() + dimension, c_gs_ms_gemm1ns_lengths_.begin()); // c_gs_ms_os_lengths std::copy(c_gs_ms_gemm1ns_strides.begin(), - c_gs_ms_gemm1ns_strides.begin()+dimension, + c_gs_ms_gemm1ns_strides.begin() + dimension, c_gs_ms_gemm1ns_strides_.begin()); // c_gs_ms_os_strides return Argument{p_a, @@ -930,21 +944,29 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle std::array c_gs_ms_gemm1ns_lengths_{}; // c_gs_ms_os_lengths std::array c_gs_ms_gemm1ns_strides_{}; // c_gs_ms_os_strides - std::copy(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.begin()+dimension, a_gs_ms_ks_lengths_.begin()); - std::copy(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.begin()+dimension, a_gs_ms_ks_strides_.begin()); - std::copy(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.begin()+dimension, b_gs_ns_ks_lengths_.begin()); - std::copy(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.begin()+dimension, b_gs_ns_ks_strides_.begin()); + std::copy(a_gs_ms_ks_lengths.begin(), + a_gs_ms_ks_lengths.begin() + dimension, + a_gs_ms_ks_lengths_.begin()); + std::copy(a_gs_ms_ks_strides.begin(), + a_gs_ms_ks_strides.begin() + dimension, + a_gs_ms_ks_strides_.begin()); + 
std::copy(b_gs_ns_ks_lengths.begin(), + b_gs_ns_ks_lengths.begin() + dimension, + b_gs_ns_ks_lengths_.begin()); + std::copy(b_gs_ns_ks_strides.begin(), + b_gs_ns_ks_strides.begin() + dimension, + b_gs_ns_ks_strides_.begin()); std::copy(b1_gs_gemm1ns_gemm1ks_lengths.begin(), - b1_gs_gemm1ns_gemm1ks_lengths.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_lengths.begin() + dimension, b1_gs_gemm1ns_gemm1ks_lengths_.begin()); // b1_gs_os_ns_lengths std::copy(b1_gs_gemm1ns_gemm1ks_strides.begin(), - b1_gs_gemm1ns_gemm1ks_strides.begin()+dimension, + b1_gs_gemm1ns_gemm1ks_strides.begin() + dimension, b1_gs_gemm1ns_gemm1ks_strides_.begin()); // b1_gs_os_ns_strides std::copy(c_gs_ms_gemm1ns_lengths.begin(), - c_gs_ms_gemm1ns_lengths.begin()+dimension, + c_gs_ms_gemm1ns_lengths.begin() + dimension, c_gs_ms_gemm1ns_lengths_.begin()); // c_gs_ms_os_lengths std::copy(c_gs_ms_gemm1ns_strides.begin(), - c_gs_ms_gemm1ns_strides.begin()+dimension, + c_gs_ms_gemm1ns_strides.begin() + dimension, c_gs_ms_gemm1ns_strides_.begin()); // c_gs_ms_os_strides return std::make_unique(static_cast(p_a), From b37706389dde3e660a2f6f0ea53b4a9baaad9d4f Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 20 Jun 2023 14:37:56 +0000 Subject: [PATCH 091/118] API fix of gridwisegemmpipeline --- ..._batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp | 2 +- .../gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp index 59d6bad5d97..1302cf1fd0a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp @@ -114,7 +114,7 @@ struct 
GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle using ThisThreadBlock = ThisThreadBlock; - using GridwiseGemmPipe = GridwiseGemmPipeline_v1; + using GridwiseGemmPipe = GridwiseGemmPipeline_v1; // ck::Tuple static constexpr auto MakeD0sGridPointer() diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp index e7577bdcbd5..5fdf3078aeb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp @@ -93,7 +93,7 @@ struct GridwiseGemmSplitKMultipleD_xdl_cshuffle using ThisThreadBlock = ThisThreadBlock; - using GridwiseGemmPipe = GridwiseGemmPipeline_v1; + using GridwiseGemmPipe = GridwiseGemmPipeline_v1; __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() { From 8053bca3696fd0baa6a191ebeb45c50917dc866c Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 20 Jun 2023 15:20:38 +0000 Subject: [PATCH 092/118] separate array base and vector base attention tensor transformation --- ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 5 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 4 +- ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 147 ++----- .../transform_contraction_to_gemm.hpp | 165 ++------ ...ransform_contraction_to_gemm_arraybase.hpp | 391 ++++++++++++++++++ 5 files changed, 461 insertions(+), 251 deletions(-) create mode 100644 include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index 30e29cc8e2b..ab6d2716c04 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp 
+++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -44,7 +44,7 @@ __global__ void const CElementwiseOperation c_element_op) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__)) + defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; const index_t block_id = get_block_1d_id(); @@ -682,7 +682,8 @@ struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle static bool IsSupportedArgument(const Argument& arg) { if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" || - ck::get_device_name() == "gfx940")) + ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" || + ck::get_device_name() == "gfx942")) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index a39b71d0b1d..378232d9f0c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -18,7 +18,7 @@ #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp" -#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp" +#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -572,7 +572,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 
1); static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1); - using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma< Sequence, Sequence, GemmSpec, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp index e8fc49a1c52..73b12bfb474 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -68,7 +68,7 @@ __global__ void const C0MatrixMask c0_matrix_mask) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \ - defined(__gfx940__)) + defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; const index_t num_blocks_per_batch = __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); @@ -252,27 +252,25 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1Spec, CSpec>; - static auto MakeAGridDescriptor_AK0_M_AK1( - const std::array& a_gs_ms_ks_lengths_vec, - const std::array& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) { return Transform::MakeAGridDescriptor_AK0_M_AK1( Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), Number{}); } - static auto MakeBGridDescriptor_BK0_N_BK1( - const std::array& b_gs_ns_ks_lengths_vec, - const std::array& b_gs_ns_ks_strides_vec) + static auto MakeBGridDescriptor_BK0_N_BK1(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) { return 
Transform::MakeB0GridDescriptor_BK0_N_BK1( Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), Number{}); } - static auto MakeB1GridDescriptor_BK0_N_BK1( - const std::array& b1_gs_gemm1ns_gemm1ks_lengths_vec, - const std::array& b1_gs_gemm1ns_gemm1ks_strides_vec) + static auto + MakeB1GridDescriptor_BK0_N_BK1(const std::vector& b1_gs_gemm1ns_gemm1ks_lengths_vec, + const std::vector& b1_gs_gemm1ns_gemm1ks_strides_vec) { return Transform::MakeB1GridDescriptor_BK0_N_BK1( Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, @@ -455,18 +453,14 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle CDataType* p_c_grid, const std::array p_acc0_biases, const std::array p_acc1_biases, - const std::array& a_gs_ms_ks_lengths, - const std::array& a_gs_ms_ks_strides, - const std::array& b_gs_ns_ks_lengths, - const std::array& b_gs_ns_ks_strides, - const std::array& - b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths - const std::array& - b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides - const std::array& - c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths - const std::array& - c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides const std::array, NumD0Tensor>& acc0_biases_gs_ms_ns_lengths, const std::array, NumD0Tensor>& acc0_biases_gs_ms_ns_strides, const std::array, NumD1Tensor>& @@ -730,7 +724,8 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle #endif if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" || - ck::get_device_name() == "gfx940")) + 
ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" || + ck::get_device_name() == "gfx942")) { return false; } @@ -791,12 +786,10 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle if(arg.d0s_nl_ns_lengths_strides_[i][1] == 1 && arg.d0s_nl_ns_lengths_strides_[i][0] % D0sTransferSrcScalarPerVector != 0) { - std::cout << "first" << std::endl; return false; } if(arg.d0s_nl_ns_lengths_strides_[i][1] != 1 && D0sTransferSrcScalarPerVector != 1) { - std::cout << "second" << std::endl; return false; } } @@ -841,56 +834,20 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1ElementwiseOperation b1_element_op, C1DEElementwiseOperation c1de_element_op) { - constexpr auto dimension = NumDimG + NumDimM + NumDimN; - - std::array a_gs_ms_ks_lengths_{}; - std::array a_gs_ms_ks_strides_{}; - std::array b_gs_ns_ks_lengths_{}; - std::array b_gs_ns_ks_strides_{}; - std::array b1_gs_gemm1ns_gemm1ks_lengths_{}; // b1_gs_os_ns_lengths - std::array b1_gs_gemm1ns_gemm1ks_strides_{}; // b1_gs_os_ns_strides - std::array c_gs_ms_gemm1ns_lengths_{}; // c_gs_ms_os_lengths - std::array c_gs_ms_gemm1ns_strides_{}; // c_gs_ms_os_strides - - std::copy(a_gs_ms_ks_lengths.begin(), - a_gs_ms_ks_lengths.begin() + dimension, - a_gs_ms_ks_lengths_.begin()); - std::copy(a_gs_ms_ks_strides.begin(), - a_gs_ms_ks_strides.begin() + dimension, - a_gs_ms_ks_strides_.begin()); - std::copy(b_gs_ns_ks_lengths.begin(), - b_gs_ns_ks_lengths.begin() + dimension, - b_gs_ns_ks_lengths_.begin()); - std::copy(b_gs_ns_ks_strides.begin(), - b_gs_ns_ks_strides.begin() + dimension, - b_gs_ns_ks_strides_.begin()); - std::copy(b1_gs_gemm1ns_gemm1ks_lengths.begin(), - b1_gs_gemm1ns_gemm1ks_lengths.begin() + dimension, - b1_gs_gemm1ns_gemm1ks_lengths_.begin()); // b1_gs_os_ns_lengths - std::copy(b1_gs_gemm1ns_gemm1ks_strides.begin(), - b1_gs_gemm1ns_gemm1ks_strides.begin() + dimension, - b1_gs_gemm1ns_gemm1ks_strides_.begin()); // b1_gs_os_ns_strides - 
std::copy(c_gs_ms_gemm1ns_lengths.begin(), - c_gs_ms_gemm1ns_lengths.begin() + dimension, - c_gs_ms_gemm1ns_lengths_.begin()); // c_gs_ms_os_lengths - std::copy(c_gs_ms_gemm1ns_strides.begin(), - c_gs_ms_gemm1ns_strides.begin() + dimension, - c_gs_ms_gemm1ns_strides_.begin()); // c_gs_ms_os_strides - return Argument{p_a, p_b, p_b1, p_c, p_acc0_biases, p_acc1_biases, - a_gs_ms_ks_lengths_, - a_gs_ms_ks_strides_, - b_gs_ns_ks_lengths_, - b_gs_ns_ks_strides_, - b1_gs_gemm1ns_gemm1ks_lengths_, // b1_gs_os_ns_lengths - b1_gs_gemm1ns_gemm1ks_strides_, // b1_gs_os_ns_strides - c_gs_ms_gemm1ns_lengths_, // c_gs_ms_os_lengths - c_gs_ms_gemm1ns_strides_, // c_gs_ms_os_strides + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides acc0_biases_gs_ms_ns_lengths, acc0_biases_gs_ms_ns_strides, acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths @@ -933,56 +890,20 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle B1ElementwiseOperation b1_element_op, C1DEElementwiseOperation c1de_element_op) override { - constexpr auto dimension = NumDimG + NumDimM + NumDimN; - - std::array a_gs_ms_ks_lengths_{}; - std::array a_gs_ms_ks_strides_{}; - std::array b_gs_ns_ks_lengths_{}; - std::array b_gs_ns_ks_strides_{}; - std::array b1_gs_gemm1ns_gemm1ks_lengths_{}; // b1_gs_os_ns_lengths - std::array b1_gs_gemm1ns_gemm1ks_strides_{}; // b1_gs_os_ns_strides - std::array c_gs_ms_gemm1ns_lengths_{}; // c_gs_ms_os_lengths - std::array c_gs_ms_gemm1ns_strides_{}; // c_gs_ms_os_strides - - std::copy(a_gs_ms_ks_lengths.begin(), - a_gs_ms_ks_lengths.begin() + dimension, - a_gs_ms_ks_lengths_.begin()); - std::copy(a_gs_ms_ks_strides.begin(), - a_gs_ms_ks_strides.begin() + dimension, - a_gs_ms_ks_strides_.begin()); - 
std::copy(b_gs_ns_ks_lengths.begin(), - b_gs_ns_ks_lengths.begin() + dimension, - b_gs_ns_ks_lengths_.begin()); - std::copy(b_gs_ns_ks_strides.begin(), - b_gs_ns_ks_strides.begin() + dimension, - b_gs_ns_ks_strides_.begin()); - std::copy(b1_gs_gemm1ns_gemm1ks_lengths.begin(), - b1_gs_gemm1ns_gemm1ks_lengths.begin() + dimension, - b1_gs_gemm1ns_gemm1ks_lengths_.begin()); // b1_gs_os_ns_lengths - std::copy(b1_gs_gemm1ns_gemm1ks_strides.begin(), - b1_gs_gemm1ns_gemm1ks_strides.begin() + dimension, - b1_gs_gemm1ns_gemm1ks_strides_.begin()); // b1_gs_os_ns_strides - std::copy(c_gs_ms_gemm1ns_lengths.begin(), - c_gs_ms_gemm1ns_lengths.begin() + dimension, - c_gs_ms_gemm1ns_lengths_.begin()); // c_gs_ms_os_lengths - std::copy(c_gs_ms_gemm1ns_strides.begin(), - c_gs_ms_gemm1ns_strides.begin() + dimension, - c_gs_ms_gemm1ns_strides_.begin()); // c_gs_ms_os_strides - return std::make_unique(static_cast(p_a), static_cast(p_b), static_cast(p_b1), static_cast(p_c), p_acc0_biases, // cast in struct Argument p_acc1_biases, // cast in struct Argument - a_gs_ms_ks_lengths_, - a_gs_ms_ks_strides_, - b_gs_ns_ks_lengths_, - b_gs_ns_ks_strides_, - b1_gs_gemm1ns_gemm1ks_lengths_, // b1_gs_os_ns_lengths - b1_gs_gemm1ns_gemm1ks_strides_, // b1_gs_os_ns_strides - c_gs_ms_gemm1ns_lengths_, // c_gs_ms_os_lengths - c_gs_ms_gemm1ns_strides_, // c_gs_ms_os_strides + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides acc0_biases_gs_ms_ns_lengths, acc0_biases_gs_ms_ns_strides, acc1_biases_gs_ms_gemm1ns_lengths, diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp index c1c1fd48755..ea27a40ce3c 100644 --- 
a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -16,15 +16,14 @@ template -__host__ __device__ static auto -MakeGridDescriptorPair(const std::array& gs_ms_ns_lengths_vec, - const std::array& gs_ms_ns_strides_vec) +static auto MakeGridDescriptorPair(const std::vector& gs_ms_ns_lengths_vec, + const std::vector& gs_ms_ns_strides_vec) { - // if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && - // gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) - // { - // throw std::runtime_error("wrong! dimension must match input lengths"); - // } + if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) + { + throw std::runtime_error("wrong! dimension must match input lengths"); + } const auto to_tuple = [&](auto& vec, auto start, auto end) { return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); @@ -144,24 +143,21 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm // // A // - __host__ __device__ static auto MakeAGridDescriptorPair( - const std::array& a_gs_ms_ks_lengths_vec, - const std::array& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptorPair(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) { return MakeGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec); } // TODO: rename to G_MRaw_KRaw - __host__ __device__ static auto MakeAGridDescriptor_G_M_K( - const std::array& a_gs_ms_ks_lengths_vec, - const std::array& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptor_G_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) { return MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).first; } - __host__ __device__ static auto MakeAGridDescriptor_M_K( - const std::array& a_gs_ms_ks_lengths_vec, - const 
std::array& a_gs_ms_ks_strides_vec) + static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) { return matrix_padder.PadADescriptor_M_K( MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).second); @@ -183,57 +179,24 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } - template - __host__ __device__ static constexpr auto - MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AK0PerWmma_AKRow_MPerWmma_AK1( - const AGridDesc_M_K& a_grid_desc_m_k, - const WmmaK&, - const MRepeat&, - const MWaves&, - const MPerWmma&, - const AK1&) - { - const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlock; - const auto K = a_grid_desc_m_k.GetLength(I1); - const auto AKWmma = K / WmmaK{}; - constexpr auto AKRow = 2; - constexpr auto AK0PerWmma = WmmaK{} / AKRow / AK1{}; - - return transform_tensor_descriptor( - a_grid_desc_m_k, - make_tuple(make_unmerge_transform( - make_tuple(AKWmma, Number{}, Number{}, AK1{})), - make_unmerge_transform(make_tuple(M0 * MRepeat{}, MWaves{}, MPerWmma{}))), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); - } - // // B (alias of B0) // - __host__ __device__ static auto MakeB0GridDescriptorPair( - const std::array& b0_gs_ns_ks_lengths_vec, - const std::array& b0_gs_ns_ks_strides_vec) + static auto MakeB0GridDescriptorPair(const std::vector& b0_gs_ns_ks_lengths_vec, + const std::vector& b0_gs_ns_ks_strides_vec) { return MakeGridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec); } // TODO: rename to G_MRaw_NRaw - __host__ __device__ static auto MakeB0GridDescriptor_G_N_K( - const std::array& b0_gs_ns_ks_lengths_vec, - const std::array& b0_gs_ns_ks_strides_vec) + static auto MakeB0GridDescriptor_G_N_K(const std::vector& b0_gs_ns_ks_lengths_vec, + const std::vector& b0_gs_ns_ks_strides_vec) { return 
MakeB0GridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec).first; } - __host__ __device__ static auto MakeB0GridDescriptor_N_K( - const std::array& b0_gs_ns_ks_lengths_vec, - const std::array& b0_gs_ns_ks_strides_vec) + static auto MakeB0GridDescriptor_N_K(const std::vector& b0_gs_ns_ks_lengths_vec, + const std::vector& b0_gs_ns_ks_strides_vec) { // alias of matrix_padder.PadB0Descriptor_N_K return matrix_padder.PadBDescriptor_N_K( @@ -256,57 +219,24 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } - template - __host__ __device__ static constexpr auto - MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BK0PerWmma_BKRow_LPerWmma_BK1( - const BGridDesc_L_K& b_grid_desc_l_k, - const WmmaK&, - const LRepeat&, - const LWaves&, - const LPerWmma&, - const BK1&) - { - const auto L0 = b_grid_desc_l_k.GetLength(I0) / NPerBlock; - const auto K = b_grid_desc_l_k.GetLength(I1); - const auto BKWmma = K / WmmaK{}; - constexpr auto BKRow = 2; - constexpr auto BK0PerWmma = WmmaK{} / BKRow / BK1{}; - - return transform_tensor_descriptor( - b_grid_desc_l_k, - make_tuple(make_unmerge_transform( - make_tuple(BKWmma, Number{}, Number{}, BK1{})), - make_unmerge_transform(make_tuple(L0 * LRepeat{}, LWaves{}, LPerWmma{}))), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); - } - // // B1 // - __host__ __device__ static auto MakeB1GridDescriptorPair( - const std::array& b1_gs_os_ns_lengths_vec, - const std::array& b1_gs_os_ns_strides_vec) + static auto MakeB1GridDescriptorPair(const std::vector& b1_gs_os_ns_lengths_vec, + const std::vector& b1_gs_os_ns_strides_vec) { return MakeGridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec); } // TODO: rename to G_NRaw_KRaw - __host__ __device__ static auto MakeB1GridDescriptor_G_N_K( - const std::array& b1_gs_os_ns_lengths_vec, - const std::array& b1_gs_os_ns_strides_vec) + static auto 
MakeB1GridDescriptor_G_N_K(const std::vector& b1_gs_os_ns_lengths_vec, + const std::vector& b1_gs_os_ns_strides_vec) { return MakeB1GridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec).first; } - __host__ __device__ static auto MakeB1GridDescriptor_N_K( - const std::array& b1_gs_os_ns_lengths_vec, - const std::array& b1_gs_os_ns_strides_vec) + static auto MakeB1GridDescriptor_N_K(const std::vector& b1_gs_os_ns_lengths_vec, + const std::vector& b1_gs_os_ns_strides_vec) { // alias of matrix_padder.PadB1Descriptor_O_N return matrix_padder.PadB1Descriptor_N_K( @@ -330,57 +260,24 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm make_tuple(Sequence<0, 2>{}, Sequence<1>{})); } - template - __host__ __device__ static constexpr auto - MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves__BL0PerWmma_BLRow_NPerWmma_BL1( - const BGridDesc_N_L& b_grid_desc_n_l, - const WmmaL&, - const NRepeat&, - const NWaves&, - const NPerWmma&, - const BL1&) - { - const auto N0 = b_grid_desc_n_l.GetLength(I0) / OPerBlock; - const auto L = b_grid_desc_n_l.GetLength(I1); - const auto BLWmma = L / WmmaL{}; - constexpr auto BLRow = 2; - constexpr auto BL0PerWmma = WmmaL{} / BLRow / BL1{}; - - return transform_tensor_descriptor( - b_grid_desc_n_l, - make_tuple(make_unmerge_transform( - make_tuple(BLWmma, Number{}, Number{}, BL1{})), - make_unmerge_transform(make_tuple(N0 * NRepeat{}, NWaves{}, NPerWmma{}))), - make_tuple(Sequence<1>{}, Sequence<0>{}), - make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); - } - // // C // - __host__ __device__ static auto MakeCGridDescriptorPair( - const std::array& c_gs_ms_os_lengths_vec, - const std::array& c_gs_ms_os_strides_vec) + static auto MakeCGridDescriptorPair(const std::vector& c_gs_ms_os_lengths_vec, + const std::vector& c_gs_ms_os_strides_vec) { return MakeGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec); } // TODO: rename to G_MRaw_NRaw - __host__ __device__ static auto MakeCGridDescriptor_G_M_N( - 
const std::array& c_gs_ms_os_lengths_vec, - const std::array& c_gs_ms_os_strides_vec) + static auto MakeCGridDescriptor_G_M_N(const std::vector& c_gs_ms_os_lengths_vec, + const std::vector& c_gs_ms_os_strides_vec) { return MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).first; } - __host__ __device__ static auto MakeCGridDescriptor_M_N( - const std::array& c_gs_ms_os_lengths_vec, - const std::array& c_gs_ms_os_strides_vec) + static auto MakeCGridDescriptor_M_N(const std::vector& c_gs_ms_os_lengths_vec, + const std::vector& c_gs_ms_os_strides_vec) { return matrix_padder.PadCDescriptor_M_N( MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).second); diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp new file mode 100644 index 00000000000..56181d38c87 --- /dev/null +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" + +namespace ck { +namespace tensor_operation { + +// assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] +template +__host__ __device__ static auto +MakeGridDescriptorPair(const std::array& gs_ms_ns_lengths_vec, + const std::array& gs_ms_ns_strides_vec) +{ + // if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + // gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) + // { + // throw std::runtime_error("wrong! 
dimension must match input lengths"); + // } + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto gs_ms_ns_lengths = + to_tuple(gs_ms_ns_lengths_vec, Number<0>{}, Number{}); + const auto gs_ms_ns_strides = + to_tuple(gs_ms_ns_strides_vec, Number<0>{}, Number{}); + + // dimension Ids for G0, G1, ... + constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{}; + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = + typename arithmetic_sequence_gen::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for G0, G1, ... + const auto gLengths = get_container_subset(gs_ms_ns_lengths, gDimIds); + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(gs_ms_ns_lengths, mDimIds); + + // lengths for N0, N1, ... + const auto nLengths = get_container_subset(gs_ms_ns_lengths, nDimIds); + + if constexpr(TensorSpec == device::TensorSpecialization::Packed) + { + auto G = container_reduce(gLengths, math::multiplies{}, Number<1>{}); + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto grid_desc_g_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(G, M, N), + make_tuple(gs_ms_ns_strides[Number{}], + gs_ms_ns_strides[Number{}], + gs_ms_ns_strides[Number{}])); + + const auto grid_desc_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(M, N), + make_tuple(gs_ms_ns_strides[Number{}], + gs_ms_ns_strides[Number{}])); + + return std::make_pair(grid_desc_g_mraw_nraw, grid_desc_mraw_nraw); + } + else + { + // naive tensor C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + const auto grid_desc_gs_ms_ns = + make_naive_tensor_descriptor(gs_ms_ns_lengths, gs_ms_ns_strides); + + // transformed tensor C[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... 
, NRaw = N0 * N1 * + // N2 * ...] + // Note: This does not require padding as it only provides G offset calculation. Technically + // descriptor for only G is needed. Here we opt for backward compatibility purpose to return + // G_M_N + const auto grid_desc_g_mraw_nraw = + transform_tensor_descriptor(grid_desc_gs_ms_ns, + make_tuple(make_merge_transform(gLengths), + make_merge_transform(mLengths), + make_merge_transform(nLengths)), + make_tuple(gDimIds, mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto c_ms_ns_lengths = to_tuple( + gs_ms_ns_lengths_vec, Number{}, Number{}); + const auto c_ms_ns_strides = to_tuple( + gs_ms_ns_strides_vec, Number{}, Number{}); + + // transformed tensor C[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * + // N2 * ...] + const auto grid_desc_ms_ns = make_naive_tensor_descriptor(c_ms_ns_lengths, c_ms_ns_strides); + + const auto grid_desc_mraw_nraw = transform_tensor_descriptor( + grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds - Number{}, nDimIds - Number{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return std::make_pair(grid_desc_g_mraw_nraw, grid_desc_mraw_nraw); + } +} + +template + typename PerBlock_M_N_K_O, // Sequence<> + device::GemmSpecialization GemmSpec, + device::TensorSpecialization ASpec, + device::TensorSpecialization B0Spec, + device::TensorSpecialization B1Spec, + device::TensorSpecialization CSpec> +struct TransformBatchedContractionContractionToBatchedGemmGemm_Wmma +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + + static constexpr index_t NumDimG = NumDims_G_M_N_K_O::At(I0); + static constexpr index_t NumDimM = NumDims_G_M_N_K_O::At(I1); + static constexpr index_t NumDimN = NumDims_G_M_N_K_O::At(I2); + static constexpr index_t NumDimK = 
NumDims_G_M_N_K_O::At(I3); + static constexpr index_t NumDimO = NumDims_G_M_N_K_O::At(I4); + + static constexpr index_t MPerBlock = PerBlock_M_N_K_O::At(I0); + static constexpr index_t NPerBlock = PerBlock_M_N_K_O::At(I1); + static constexpr index_t KPerBlock = PerBlock_M_N_K_O::At(I2); + static constexpr index_t OPerBlock = PerBlock_M_N_K_O::At(I3); + + static constexpr auto matrix_padder = + device::GemmGemmPadder{ + MPerBlock, NPerBlock, KPerBlock, OPerBlock}; + + // + // A + // + __host__ __device__ static auto MakeAGridDescriptorPair( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) + { + return MakeGridDescriptorPair(a_gs_ms_ks_lengths_vec, + a_gs_ms_ks_strides_vec); + } + + // TODO: rename to G_MRaw_KRaw + __host__ __device__ static auto MakeAGridDescriptor_G_M_K( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) + { + return MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).first; + } + __host__ __device__ static auto MakeAGridDescriptor_M_K( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) + { + return matrix_padder.PadADescriptor_M_K( + MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).second); + } + + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k, const Number& AK1) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / AK1; + + return transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AK0PerWmma_AKRow_MPerWmma_AK1( + const AGridDesc_M_K& a_grid_desc_m_k, + const 
WmmaK&, + const MRepeat&, + const MWaves&, + const MPerWmma&, + const AK1&) + { + const auto M0 = a_grid_desc_m_k.GetLength(I0) / MPerBlock; + const auto K = a_grid_desc_m_k.GetLength(I1); + const auto AKWmma = K / WmmaK{}; + constexpr auto AKRow = 2; + constexpr auto AK0PerWmma = WmmaK{} / AKRow / AK1{}; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform( + make_tuple(AKWmma, Number{}, Number{}, AK1{})), + make_unmerge_transform(make_tuple(M0 * MRepeat{}, MWaves{}, MPerWmma{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); + } + + // + // B (alias of B0) + // + __host__ __device__ static auto MakeB0GridDescriptorPair( + const std::array& b0_gs_ns_ks_lengths_vec, + const std::array& b0_gs_ns_ks_strides_vec) + { + return MakeGridDescriptorPair(b0_gs_ns_ks_lengths_vec, + b0_gs_ns_ks_strides_vec); + } + + // TODO: rename to G_MRaw_NRaw + __host__ __device__ static auto MakeB0GridDescriptor_G_N_K( + const std::array& b0_gs_ns_ks_lengths_vec, + const std::array& b0_gs_ns_ks_strides_vec) + { + return MakeB0GridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec).first; + } + __host__ __device__ static auto MakeB0GridDescriptor_N_K( + const std::array& b0_gs_ns_ks_lengths_vec, + const std::array& b0_gs_ns_ks_strides_vec) + { + // alias of matrix_padder.PadB0Descriptor_N_K + return matrix_padder.PadBDescriptor_N_K( + MakeB0GridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec).second); + } + + template + __host__ __device__ static constexpr auto + MakeB0GridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k, const Number& BK1) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / BK1; + + return transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, 
Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + __host__ __device__ static constexpr auto + MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BK0PerWmma_BKRow_LPerWmma_BK1( + const BGridDesc_L_K& b_grid_desc_l_k, + const WmmaK&, + const LRepeat&, + const LWaves&, + const LPerWmma&, + const BK1&) + { + const auto L0 = b_grid_desc_l_k.GetLength(I0) / NPerBlock; + const auto K = b_grid_desc_l_k.GetLength(I1); + const auto BKWmma = K / WmmaK{}; + constexpr auto BKRow = 2; + constexpr auto BK0PerWmma = WmmaK{} / BKRow / BK1{}; + + return transform_tensor_descriptor( + b_grid_desc_l_k, + make_tuple(make_unmerge_transform( + make_tuple(BKWmma, Number{}, Number{}, BK1{})), + make_unmerge_transform(make_tuple(L0 * LRepeat{}, LWaves{}, LPerWmma{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); + } + + // + // B1 + // + __host__ __device__ static auto MakeB1GridDescriptorPair( + const std::array& b1_gs_os_ns_lengths_vec, + const std::array& b1_gs_os_ns_strides_vec) + { + return MakeGridDescriptorPair(b1_gs_os_ns_lengths_vec, + b1_gs_os_ns_strides_vec); + } + + // TODO: rename to G_NRaw_KRaw + __host__ __device__ static auto MakeB1GridDescriptor_G_N_K( + const std::array& b1_gs_os_ns_lengths_vec, + const std::array& b1_gs_os_ns_strides_vec) + { + return MakeB1GridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec).first; + } + __host__ __device__ static auto MakeB1GridDescriptor_N_K( + const std::array& b1_gs_os_ns_lengths_vec, + const std::array& b1_gs_os_ns_strides_vec) + { + // alias of matrix_padder.PadB1Descriptor_O_N + return matrix_padder.PadB1Descriptor_N_K( + MakeB1GridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec).second); + } + + template + __host__ __device__ static constexpr auto + MakeB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K& b1_grid_desc_n_k, const Number& B1K1) + { + const auto N = b1_grid_desc_n_k.GetLength(I0); + const auto K = 
b1_grid_desc_n_k.GetLength(I1); + + const auto B1K0 = K / B1K1; + + return transform_tensor_descriptor( + b1_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + __host__ __device__ static constexpr auto + MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves__BL0PerWmma_BLRow_NPerWmma_BL1( + const BGridDesc_N_L& b_grid_desc_n_l, + const WmmaL&, + const NRepeat&, + const NWaves&, + const NPerWmma&, + const BL1&) + { + const auto N0 = b_grid_desc_n_l.GetLength(I0) / OPerBlock; + const auto L = b_grid_desc_n_l.GetLength(I1); + const auto BLWmma = L / WmmaL{}; + constexpr auto BLRow = 2; + constexpr auto BL0PerWmma = WmmaL{} / BLRow / BL1{}; + + return transform_tensor_descriptor( + b_grid_desc_n_l, + make_tuple(make_unmerge_transform( + make_tuple(BLWmma, Number{}, Number{}, BL1{})), + make_unmerge_transform(make_tuple(N0 * NRepeat{}, NWaves{}, NPerWmma{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); + } + + // + // C + // + __host__ __device__ static auto MakeCGridDescriptorPair( + const std::array& c_gs_ms_os_lengths_vec, + const std::array& c_gs_ms_os_strides_vec) + { + return MakeGridDescriptorPair(c_gs_ms_os_lengths_vec, + c_gs_ms_os_strides_vec); + } + + // TODO: rename to G_MRaw_NRaw + __host__ __device__ static auto MakeCGridDescriptor_G_M_N( + const std::array& c_gs_ms_os_lengths_vec, + const std::array& c_gs_ms_os_strides_vec) + { + return MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).first; + } + __host__ __device__ static auto MakeCGridDescriptor_M_N( + const std::array& c_gs_ms_os_lengths_vec, + const std::array& c_gs_ms_os_strides_vec) + { + return matrix_padder.PadCDescriptor_M_N( + MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).second); + } +}; + +} // namespace tensor_operation 
+} // namespace ck From fd9e80c5104bb8aea61d6b350b6f963a5b7a2ac3 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Sun, 25 Jun 2023 05:16:03 +0000 Subject: [PATCH 093/118] fix gemm --- .../gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 6 +++--- .../ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index 3172e117cb6..377b7dedd93 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -349,10 +349,10 @@ struct GridwiseGemmMultipleD_Wmma using GridwiseGemmPipe = remove_cvref_t())>; + LoopSched, + AEnableLds, + BEnableLds>())>; // Describe how data store to (LDS/VGPR) buffer from Global memory __host__ __device__ static constexpr auto MakeABlockDescriptor() diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index da3cb330a01..2b5d5db94f5 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -141,11 +141,11 @@ struct GridwiseGemm_Wmma using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = - remove_cvref_t())>; + LoopSched, + AEnableLds, + BEnableLds>())>; // Describe how data store to (LDS/VGPR) buffer from Global memory __host__ __device__ static constexpr auto MakeABlockDescriptor() From 1fb4a4740fdd81899521a0344f76503a16783292 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 26 Jun 2023 03:51:08 +0000 Subject: [PATCH 094/118] clang format --- include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 2b5d5db94f5..8e542457b26 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -141,7 +141,7 @@ struct GridwiseGemm_Wmma using ThisThreadBlock = ThisThreadBlock; using GridwiseGemmPipe = - remove_cvref_t Date: Fri, 7 Jul 2023 11:13:53 +0000 Subject: [PATCH 095/118] add gemm fp16 instances --- .../tensor_operation_instance/gpu/gemm.hpp | 24 +++ .../gpu/gemm/CMakeLists.txt | 4 + ...emm_wmma_f16_f16_f16_km_kn_mn_instance.cpp | 79 +++++++++ ...emm_wmma_f16_f16_f16_km_nk_mn_instance.cpp | 79 +++++++++ ...emm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp | 159 ++++++++++++++++++ ...emm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp | 78 +++++++++ 6 files changed, 423 insertions(+) create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp index adac7d0dcf4..966f4d01ddc 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -264,6 +264,26 @@ void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( DeviceGemm>>& instances); +void add_device_gemm_wmma_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void 
add_device_gemm_wmma_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + template && is_same_v && is_same_v) @@ -346,6 +367,7 @@ struct DeviceOperationInstanceFactory< add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs); add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs); add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) @@ -354,6 +376,7 @@ struct DeviceOperationInstanceFactory< add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs); add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs); + add_device_gemm_wmma_f16_f16_f16_km_kn_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) @@ -362,6 +385,7 @@ struct DeviceOperationInstanceFactory< add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs); add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs); + add_device_gemm_wmma_f16_f16_f16_km_nk_mn_instances(op_ptrs); } } else if constexpr(is_same_v && is_same_v && diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index d66010af734..c14ee1d885c 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -48,4 +48,8 @@ add_instance_library(device_gemm_instance device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp + device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp + 
device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000000..e757049b4e1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_wmma_f16_f16_f16_km_kn_mn_instances = + std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| 
BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, 
device_gemm_wmma_f16_f16_f16_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp new file mode 100644 index 00000000000..b399c9768a9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_wmma_f16_f16_f16_km_nk_mn_instances = + std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| 
BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 32, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, 
device_gemm_wmma_f16_f16_f16_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000000..8f3f95864ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_wmma_f16_f16_f16_mk_kn_mn_instances = + std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| 
BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> +#if 0 + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 
2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 
8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency 
hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, 
PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> +#endif + // clang-format on + >; + +void add_device_gemm_wmma_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_wmma_f16_f16_f16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000000..eed856b6cab --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| + //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| + //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | 
Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | + //######################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + /* Prefetch 2, consume enormous vgpr resource*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 256, 128, 128, 32, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 128, 128, 64, 64, 8, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 64, 64, 32, 32, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 2, 32, 16, 16, 32, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + /* Prefetch 1, prefer larger KPerBlock value for better latency hiding*/ + // 8 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 256, 64, 8, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, 
PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 128, 160, 64, 8, 16, 16, 2, 5, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 4>, 8>, + // 4 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 256, 64, 64, 8, 16, 16, 8, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 256, 64, 8, 16, 16, 2, 8, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 128, 64, 80, 64, 8, 16, 16, 1, 5, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 64, 1, 2>, 8>, + // 2 Waves + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 64, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + 
DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 64, 32, 64, 8, 16, 16, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 32, 64, 64, 8, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + // 1 Wave + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmWmma_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> + // clang-format on + >; + +void add_device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_wmma_f16_f16_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck From febd76e4cad54c6f0cd6a958154da9f48287e4f8 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 20 Jul 2023 05:20:14 +0000 Subject: [PATCH 096/118] Temp save --- example/49_fpAintB_gemm/CMakeLists.txt | 5 + .../49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 75 ++ example/49_fpAintB_gemm/run_gemm_example.inc | 167 +++ .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 555 +++++++++ .../gpu/block/blockwise_gemm_wmma.hpp | 4 +- .../gpu/device/device_gemm_dequantB.hpp | 46 + .../device/impl/device_fpAintB_gemm_wmma.hpp | 
660 ++++++++++ .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 1104 +++++++++++++++++ .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 108 ++ 9 files changed, 2722 insertions(+), 2 deletions(-) create mode 100644 example/49_fpAintB_gemm/CMakeLists.txt create mode 100644 example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp create mode 100644 example/49_fpAintB_gemm/run_gemm_example.inc create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm_dequantB.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp diff --git a/example/49_fpAintB_gemm/CMakeLists.txt b/example/49_fpAintB_gemm/CMakeLists.txt new file mode 100644 index 00000000000..34059c7ff90 --- /dev/null +++ b/example/49_fpAintB_gemm/CMakeLists.txt @@ -0,0 +1,5 @@ +if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") + add_custom_target(example_fpAintB_gemm_wmma) + add_example_executable(example_fp16int8_gemm_wmma fp16int8_gemm_wmma.cpp) + add_dependencies(example_fpAintB_gemm_wmma example_fp16int8_gemm_wmma) +endif() diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp new file mode 100644 index 00000000000..96e4f747816 --- /dev/null +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp" + +using ADataType = ck::half_t; +using BDataType = int8_t; +using ScaleDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = float; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding; // NOTE(review): named GemmDefault but selects MNKPadding + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_CShuffle + < ALayout, + BLayout, + CLayout, + ADataType, + BDataType, + ScaleDataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CElementOp, + GemmDefault, + 2, // Prefetch stage + 128, // BlockSize + 128, // MPerBlock + 64, // NPerBlock + 64, // KPerBlock + 8, // K1 + 16, // MPerWmma + 16, // NPerWmma + 4, // MRepeat: MWaves = MPerBlock / (MRepeat * MPerWmma) = 128 / (4 * 16) = 2 + 2, // NRepeat: NWaves = NPerBlock / (NRepeat * NPerWmma) = 64 / (2 * 16) = 2 + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + 1, // C shuffle (M Repeat) Per store + 1, // C shuffle (N Repeat) Per store + S<1, 32, 1, 4>, + 8>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/49_fpAintB_gemm/run_gemm_example.inc b/example/49_fpAintB_gemm/run_gemm_example.inc new file mode 100644 index 00000000000..7d06ec4cb01 --- /dev/null +++ b/example/49_fpAintB_gemm/run_gemm_example.inc @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) +{ +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + + using namespace ck::literals; + + auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + switch(config.init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + case 2: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + break; + case 3: + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + case 4: + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); + break; + case 5: + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(b_k_n); + break; + default: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << 
c_m_n_host_result.mDesc << std::endl; + +#ifdef BUILD_INT4_EXAMPLE + DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) * + c_m_n_device_result.mDesc.GetElementSpaceSize()); + + const Tensor a_m_k_converted(a_m_k); + const Tensor b_k_n_converted(b_k_n); + + a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data()); +#else + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); +#endif + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( +#ifdef BUILD_INT4_EXAMPLE + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), +#else + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), +#endif + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return true; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + 
sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + if(config.do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + +#ifdef BUILD_INT4_EXAMPLE + Tensor c_m_n_device_result_converted(c_m_n_host_result.mDesc); + + c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data()); + + c_m_n_device_result = c_m_n_device_result_converted.CopyAsType(); + + return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result); +#else + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); +#endif + } + + return true; +} + +bool run_gemm_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); +} diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp new file mode 100644 index 00000000000..283f5f87dae --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +#define CK_MNK_LOOP + +namespace ck { + +template +/* Option: Read from LDS, big buffer hold all threads required data + * Source + * A: K0PerBlock x MPerBlock x K1 + * B: K0PerBlock x NPerBlock x K1 + * Destination + * C, non-transpose + * thread level: MRepeat x NRepeat x MAccVgprs + * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs + * KPACK == WMMA_K = 16 + * + * Option: Read from VMEM, small buffer hold each thread own required data (Skip LDS) + * Source: + * A(if skip LDS): MRepeat x KPack + * B(if skip LDS): NRepeat x KPack + * Destination + * C, non-transpose + * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs + */ +struct Blockwise_fpAintB_GemmWMMA +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto WmmaK = Number<16>{}; + + using ThisThreadBlock = ThisThreadBlock; + + // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. + static constexpr index_t WaveSize = 32; + + // When use LDS, each Row(16 consecutive lanes) read whole data from source buffer + // When not use LDS, each Row read half of whole data from source buffer, exchange the data via + // permutation + static constexpr index_t A_KRow = AEnableLds ? 1 : 2; + static constexpr index_t B_KRow = BEnableLds ? 
1 : 2; + static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5); + static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5); + + static constexpr auto wmma_gemm = + WmmaGemm{}; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + // Default, Block buffer in LDS, thread level offset enabled + __device__ static auto CalculateAThreadOriginDataIndex() + { + if constexpr(AEnableLds) + { + const auto wave_idx = GetWaveIdx(); + const auto waveId_m = wave_idx[I0]; + const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); + + // |KRepeat |MRepeat|MWave |KRow |MLane |KPack + return make_tuple(0, 0, waveId_m, 0, WMMA_a_idx, 0); + } + else + { + return make_tuple(0, 0, 0, 0, 0, 0); + } + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + if constexpr(BEnableLds) + { + const auto wave_idx = GetWaveIdx(); + const auto waveId_n = wave_idx[I1]; + const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); + + // |KRepeat |NRepeat|Nwave |KRow |NLane |KPack + return make_tuple(0, 0, waveId_n, 0, WMMA_b_idx, 0); + } + else + { + return make_tuple(0, 0, 0, 0, 0, 0); + } + } + + template + __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = 
wmma_gemm.GetBeginOfThreadBlk(); + + constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + template + __device__ static auto CalculateCThreadOriginDataIndex7D(Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk3D(); + + return make_tuple( + Number{}, waveId_m, blk_idx[I0], Number{}, waveId_n, blk_idx[I1], blk_idx[I2]); + } + + using Tuple6 = decltype(CalculateAThreadOriginDataIndex()); // 6-element origin index type used for both the A and B thread copies + __host__ __device__ Blockwise_fpAintB_GemmWMMA(Tuple6 a_origin = CalculateAThreadOriginDataIndex(), + Tuple6 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + { // constructor (must carry the struct's own name): seeds the A/B thread copies, then checks the tile/wave configuration at compile time + static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && + NPerBlock % (NPerWMMA * NRepeat) == 0, + "wrong!"); + } + + // transposed WMMA output C' = B' * A' + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + + return make_naive_tensor_descriptor_packed( + // |MRepeat |MWave |MThreadPerSubGroup |NRepeat |NWave + // |NSubGroup |NAccVgprs + make_tuple(Number{}, I1, I1, Number{}, I1, I1, NAccVgprs)); + } + + // Thread level, register descriptor. Vector-write + __host__ __device__ static constexpr auto + GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = + wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); + + constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; + constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3]; + return make_naive_tensor_descriptor( + // |MRepeat |MWave |MSubGroup |NRepeat |NWave + // |NThreadPerSubGroup |MAccVgprs + make_tuple(Number{}, I1, I1, Number{}, I1, I1, MAccVgprs), + make_tuple(Number{} * MAccVgprs * AccStride, + Number{} * MAccVgprs * AccStride, + Number{} * MAccVgprs * AccStride, + MAccVgprs * AccStride, + MAccVgprs * AccStride, + MAccVgprs * AccStride, + AccStride)); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + const 
CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple( + make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); + } + + // transposed WMMA output C' = B' * A' + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() + { + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MThreadPerSubGroup_NBlockxRepeat_NWave_NSubGroup_NAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + + // Provide dimension size + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() + { + constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return wmma_gemm + .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( + c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); + } + + // Describe how data allocated in thread copy src buffer + // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma + static 
constexpr ABlockDesc a_block_desc_k0_m0_m1_m2_k1; + static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1; + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + const ScaleBlockBuffer& scale_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + auto scale_thread_buf = make_static_buffer( + scale_thread_desc_.GetElementSpaceSize()); + auto converted_b_thread_buf = b_thread_buf; + + static constexpr auto dequantizer = Dequantizer{}; + + // Loop order depends on the repeat counts: this branch iterates K -> M -> N, the else branch iterates N -> M -> K + if constexpr(MRepeat < NRepeat) + { + static_for<0, KPerBlock / WmmaK, 1>{}( + [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + b_thread_buf); + // read weight scale. NOTE(review): this copy reads b_block_desc/b_block_buf instead of scale_block_buf, and b_scale_thread_desc_ / b_scale_thread_buf are not declared in the visible code (the local above is scale_thread_buf) - confirm + scale_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_scale_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + b_scale_thread_buf); + + // convert B from int8 to fp16 + converted_b_thread_buf = type_convert(b_thread_buf); + + // multiply scale. 
NOTE(review): calls dequantize(...), while the object declared above is named dequantizer - confirm + dequantize(converted_b_thread_buf, scale_thread_buf); + + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + converted_b_thread_buf[Number{}]; + }); + + using wmma_input_type_a = 
typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + else + { + static_for<0, NRepeat, 1>{}([&](auto n0) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of + // k=0,kpack*1, .. + // read B. NOTE(review): unlike the branch above, this path passes b_thread_buf to the WMMA without the scale read / convert / dequantize steps - confirm + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + b_thread_buf); + // read A + a_thread_copy_.Run( + a_block_desc_k0_m0_m1_m2_k1, + make_tuple(Number{}, m0, I0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, m0, I0, I0, I0, I0), + a_thread_buf); + + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + }); + + using wmma_input_type_a = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + wmma_gemm.template Run( + a_thread_vec.template AsType()(Number<0>{}), + b_thread_vec.template AsType()(Number<0>{}), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + } + + protected: + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{}), + make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number<1>{})); + + static 
constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{}), + 
make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number<1>{})); + + // C[M, N, NumRegWMMA] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); + + template + struct AThreadCopySelector; + + template <> + struct AThreadCopySelector + { + using type = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + A_K1, + A_K1>; + }; + + template <> + struct AThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< + ADataType, + ADataType, + decltype(a_block_desc_k0_m0_m1_m2_k1), + decltype(a_thread_desc_), + tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + A_K1, + 0x76543210, + 0xfedcba98, + TransposeC ? false : true>; + }; + + template + struct BThreadCopySelector; + + template <> + struct BThreadCopySelector + { + using type = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B_K1, + B_K1>; + }; + + template <> + struct BThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< + BDataType, + BDataType, + decltype(b_block_desc_k0_n0_n1_n2_k1), + decltype(b_thread_desc_), + tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B_K1, + 0x76543210, + 0xfedcba98, + TransposeC ? 
true : false>; + }; + + typename AThreadCopySelector::type a_thread_copy_; + typename BThreadCopySelector::type b_thread_copy_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 576a83f6b67..679da465dab 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -362,11 +362,11 @@ struct BlockwiseGemmWMMA } else { - static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of - // k=0,kpack*1, ... read B + // k=0,kpack*1, .. + // read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, make_tuple(Number{}, n0, I0, I0, I0, I0), diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_dequantB.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_dequantB.hpp new file mode 100644 index 00000000000..acb18efabfe --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm_dequantB.hpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Dequantization of input tensor could not be decoupled from gridwisegemm pipeline +// As input tensor thread buffer declared inside blockwise-gemm pipeline. 
+ +template +struct DeviceGemm_dequantB : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_scale, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp new file mode 100644 index 00000000000..41ecbbb5321 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -0,0 +1,660 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_dequantB.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// 1. DequantB(K, N) = int2fp(B(K, N)) * scale(1, N) +// 2. 
C(M, N) = A(M, K) * DequantB(K, N) + +template +struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + // K1 = Max Vector Access Pixels + static constexpr auto K1Number = Number{}; + + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + + static constexpr auto AEnableLds_auto = NWaves == 1 ? false : true; + static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true; + + // If true, LDS is used unconditionally + static constexpr auto AEnableLds_manu = false; + static constexpr auto BEnableLds_manu = false; + + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); + static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + using DeviceOp = DeviceFpAintBGemm_Wmma_CShuffle; + + // Describe how data read from Global memory + static auto MakeAGridDescriptor(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + const auto a_grid_desc_mraw_kraw = + make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), make_tuple(StrideA, I1)); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + else if constexpr(is_same::value) + { + const auto a_grid_desc_mraw_kraw = + make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), make_tuple(I1, StrideA)); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + }(); + + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = 
a_grid_desc_m_k.GetLength(I1); + assert(K % K1 == 0); + + if constexpr(AEnableLds) + { + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto A_KRow = 2; + constexpr auto A_K0PerWmma = WmmaK / A_KRow / K1Number; + const auto A_KWmma = K / WmmaK; + + const auto M0 = M / MPerBlock; + // 0 1 0 1 2 3 4 5 6 + // M - K <-> A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple( + A_KWmma, Number{}, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(M0 * MRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); + } + } + + static auto MakeBGridDescriptor(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_n_k = [&]() { + if constexpr(is_same::value) + { + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(I1, StrideB)); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + else if constexpr(is_same_v) + { + const auto b_grid_desc_nraw_kraw = + make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), make_tuple(StrideB, I1)); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + }(); + + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + // When K = 1, it might be scale tensor. 
+ assert(K % K1 == 0 && K != 1 ); + + if constexpr(BEnableLds) + { + const index_t K0 = K / K1; + + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto B_KRow = 2; + constexpr auto B_K0PerWmma = WmmaK / B_KRow / K1Number; + const auto B_KWmma = K / WmmaK; + + const auto N0 = N / NPerBlock; + // 0 1 0 1 2 3 4 5 6 + // N - K <-> B_KWmma - NBlock*NRepeat - NWaves - B_K0PerWmma - B_KRow - NPerWmma - B_K1 + return transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple( + B_KWmma, Number{}, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(N0 * NRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 4, 6>{}, Sequence<1, 2, 5>{})); + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw); + } + + // Gridwise descriptor, mapping to the whole given problem.
+ using AGridDesc = decltype(MakeAGridDescriptor(1, 1, 1)); + using BGridDesc = decltype(MakeBGridDescriptor(1, 1, 1)); + using ScaleGridDesc = decltype(MakeBGridDescriptor(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = + GridwiseFpAintBGemm_Wmma; + + // Argument: grid pointers (A, B, scale, C), grid descriptors, tile map and element-wise ops + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + const ScaleDataType* p_scale, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_scale_grid_{p_scale}, + p_c_grid_{p_c_grid}, + a_grid_desc_{}, + b_grid_desc_{}, + scale_grid_desc_{}, + c_grid_desc_m_n_{}, + c_grid_desc_mblock_mperblock_nblock_nperblock{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + MRaw_{M}, + NRaw_{N}, + KRaw_{K} + { + a_grid_desc_ = DeviceOp::MakeAGridDescriptor(M, K, StrideA); + b_grid_desc_ = DeviceOp::MakeBGridDescriptor(K, N, StrideB); + scale_grid_desc_ = DeviceOp::MakeBGridDescriptor(1, N, 1); + c_grid_desc_m_n_ = DeviceOp::MakeCGridDescriptor_M_N(M, N, StrideC); + + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01); + + if(GridwiseGemm::CheckValidity( + a_grid_desc_, b_grid_desc_, c_grid_desc_m_n_, block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + const ScaleDataType* p_scale_grid_; + CDataType* p_c_grid_; + AGridDesc a_grid_desc_; + BGridDesc b_grid_desc_; + ScaleGridDesc scale_grid_desc_; +
CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + // for checking vector load/store + index_t MRaw_; + index_t NRaw_; + index_t KRaw_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_, + arg.b_grid_desc_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_k0mk1_k0nk1_m0nm1_wmma_v1r1 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = [&]() { + if constexpr(AEnableLds) + { + return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I2); + } + else + { + return arg.a_grid_desc_.GetLength(I0) * arg.a_grid_desc_.GetLength(I3) * + arg.a_grid_desc_.GetLength(I4) * arg.a_grid_desc_.GetLength(I6); + } + }(); + auto launch_kernel = [&](auto has_main_k_block_loop) { + const auto kernel = kernel_fpAintB_gemm_wmma< + GridwiseGemm, + ADataType, + BDataType, + ScaleDataType, + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + has_main_k_block_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_, + arg.b_grid_desc_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock, + 
arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == "gfx1102") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v)) + { + printf("DeviceOp err: AccDataType"); + return false; + } + } + else + { + printf("DeviceOp err: Arch"); + return false; + } + + // check vector load/store + { + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + // check vector load of A + if constexpr(is_same_v && ABlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && ABlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector laod of B + if constexpr(is_same_v && BBlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && BBlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector store of C + // only support RowMajor for now + if constexpr(is_same_v) + { + if(arg.NRaw_ % 
CShuffleBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } + } + else + { + return false; + } + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_, + arg.b_grid_desc_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceFpAintBGemm_Wmma_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << 
KPerBlock << ", " + << K1 << ", " + << MPerWmma << ", " + << NPerWmma << ", " + << MRepeat << ", " + << NRepeat + << ">" + << " AEnableLds: " + << AEnableLds << ", " + << "BEnableLds: " + << BEnableLds << ", " + << "NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp new file mode 100644 index 00000000000..2ded2c0d1bb --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -0,0 +1,1104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_wmma(const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + CDataType* __restrict__ 
p_c_grid, + const AGridDesc a_grid_desc, + const BGridDesc b_grid_desc, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) + __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + a_grid_desc, + b_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc; + ignore = b_grid_desc; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx1100__)) +} + +template +struct GridwiseFpAintBGemm_Wmma +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // FIX ME: To be deprecated + static constexpr auto K1 = Number{}; + + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + static constexpr auto WmmaK = 16; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = GridwiseGemmPipeline_v1_dequant; + + // Describe how data store to (LDS/VGPR) buffer from Global memory + __host__ __device__ static constexpr auto MakeABlockDescriptor() + { + constexpr auto a_block_desc 
= [&]() { + if constexpr(AEnableLds) + { + // K0->M->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + constexpr auto max_lds_align = K1; + + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->MRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + Number{} * K1, + K1, + K1, + K1, + I1)); + } + }(); + + return a_block_desc; + } + + __host__ __device__ static constexpr auto MakeBBlockDescriptor() + { + constexpr auto b_block_desc = [&]() { + if constexpr(BEnableLds) + { + // K0->N->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + constexpr auto max_lds_align = K1; + + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread + return make_naive_tensor_descriptor( + make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + Number{} * K1, + K1, + K1, + K1, + I1)); + } + }(); + + return b_block_desc; + } + + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() + { + constexpr auto a_block_copy_step = [&]() { + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = 
KPerBlock / K1; + + return make_multi_index(K0PerBlock, 0, 0); + } + else + { + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); + } + }(); + + return a_block_copy_step; + } + + __host__ __device__ static constexpr auto MakeBBlockSliceCopyStep() + { + constexpr auto b_block_copy_step = [&]() { + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock / K1; + + return make_multi_index(K0PerBlock, 0, 0); + } + else + { + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + + return make_multi_index(KWmmaPerBlock, 0, 0, 0, 0, 0, 0); + } + }(); + + return b_block_copy_step; + } + + // Describe how data read from (LDS/VGPR) buffer + template + __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&) + { + + constexpr auto a_wave_desc = [&]() { + if constexpr(AEnableLds) + { + // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1 + constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2); + constexpr auto A_KRow = I1; + return transform_tensor_descriptor( + ABlockDesc_{}, + make_tuple(make_unmerge_transform(make_tuple(Number{}, A_KRow)), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); + } + else + { + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ABlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = ABlockDesc_{}.GetLength(I3); + constexpr auto A_KRow = ABlockDesc_{}.GetLength(I4); + constexpr auto A_K1 = ABlockDesc_{}.GetLength(I6); + + // Err: merge transform cause non-constexpr issue + + // return transform_tensor_descriptor( + // ABlockDesc_{}, + // make_tuple(make_merge_transform(make_tuple(Number{}, I1)), + // make_pass_through_transform(Number{}), + // 
make_pass_through_transform(I1), + // make_pass_through_transform(I1), + // make_pass_through_transform(Number{})), + // make_tuple(Sequence<0, 3>{}, + // Sequence<1>{}, + // Sequence<2>{}, + // Sequence<4>{}, + // Sequence<5>{}), + // make_tuple( + // Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, + // Sequence<4>{})); + + // Workaround, Freeze transform + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); + } + }(); + + return a_wave_desc; + } + + template + __host__ __device__ static constexpr auto MakeBWaveDescriptor(const BBlockDesc_&) + { + constexpr auto b_wave_desc = [&]() { + if constexpr(BEnableLds) + { + // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 + constexpr auto B_K0 = BBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I2); + constexpr auto B_KRow = I1; + return transform_tensor_descriptor( + BBlockDesc_{}, + make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); + } + else + { + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = BBlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = BBlockDesc_{}.GetLength(I3); + constexpr auto B_KRow = BBlockDesc_{}.GetLength(I4); + constexpr auto B_K1 = BBlockDesc_{}.GetLength(I6); + + // Workaround, Freeze transform + return make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{})); + } + }(); + + return b_wave_desc; + } + + __host__ __device__ static constexpr auto + // *Caution Here repeat is shuffle repeat + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + { + constexpr auto 
c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat; + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, + const BGridDesc& b_grid_desc, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerWmma)) == 0, + "Invalid tuning param!"); + + const auto GetAProblemsizeMK = [&]() { + if constexpr(AEnableLds) + { + return make_tuple(a_grid_desc.GetLength(I1), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * + a_grid_desc.GetLength(I5), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6)); + } + }; + + const auto GetBProblemsizeNK = [&]() { + if constexpr(BEnableLds) + { + return make_tuple(b_grid_desc.GetLength(I1), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * + b_grid_desc.GetLength(I5), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * + b_grid_desc.GetLength(I4) * b_grid_desc.GetLength(I6)); + } + }; + + const auto M = GetAProblemsizeMK()[I0]; + const auto N = GetBProblemsizeNK()[I0]; + const auto K = GetAProblemsizeMK()[I1]; + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K == GetBProblemsizeNK()[I1])) + { + printf("A: MxK = %d x %d, B: NxK = %d x %d, C: MxN = %d x %d\n", + GetAProblemsizeMK()[I0], + GetAProblemsizeMK()[I1], + 
GetBProblemsizeNK()[I0], + GetBProblemsizeNK()[I1], + c_grid_desc_m_n.GetLength(I0), + c_grid_desc_m_n.GetLength(I1)); + printf("GridwiseOp err: ProblemSize check"); + return false; + } + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + { + printf("GridwiseOp err: ProblemSize division"); + return false; + } + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + printf("GridwiseOp err: Pipeline not support this k_loop"); + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + if(!(a_grid_desc.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB && + b_grid_desc.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB)) + { + return false; + } + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + 
const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using DefaultBlock2CTileMap = + remove_cvref_t; + + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + + static constexpr auto max_lds_align = K1; + + static constexpr auto a_block_space_size_aligned = + AEnableLds ? math::integer_least_multiple(MakeABlockDescriptor().GetElementSpaceSize(), + max_lds_align) + : 0; + static constexpr auto b_block_space_size_aligned = + BEnableLds ? math::integer_least_multiple(MakeBBlockDescriptor().GetElementSpaceSize(), + max_lds_align) + : 0; + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b_block_space_offset = a_block_space_size_aligned; + + // LDS allocation for C shuffle in LDS + static constexpr auto c_shuffle_block_space_size = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() + .GetElementSpaceSize(); + + static constexpr auto c_shuffle_block_space_offset = 0; + + static constexpr auto lds_size = + math::max(c_shuffle_block_space_size * sizeof(CShuffleDataType), + a_block_space_size_aligned * sizeof(ADataType) + + b_block_space_size_aligned * sizeof(BDataType)); + }; + + template + __device__ static void Run(const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + const ScaleDataType* __restrict__ p_scale_grid, + CDataType* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AGridDesc& a_grid_desc, + const BGridDesc& b_grid_desc, + const ScaleGridDesc& scale_grid_desc, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) + { + // clang-format off 
+/*******************************************************************************/ +// Memory buffer zone. + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc.GetElementSpaceSize()); + const auto scale_grid_buf = make_dynamic_buffer( + p_scale_grid, scale_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + +/*******************************************************************************/ +// BlockIdx.x -> [BlockId.m, BlockId.n] + const auto block_work_idx = block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { return; } + + // Store BlockId into SGPR + const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + +/*******************************************************************************/ +// BlockLevel, A/B Matrix ThreadMapping in WMMA Source buffer, As Destinaion of BlockWise_Copy + const auto K = [&](){ + if constexpr(AEnableLds){ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2); + } + else{ + return a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) + * a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6); + } + }(); + + constexpr auto a_block_desc = MakeABlockDescriptor(); + constexpr auto b_block_desc = MakeBBlockDescriptor(); + constexpr auto scale_block_desc = MakeBBlockDescriptor(); + + auto a_block_trait = [&](){ + // A matrix blockwise copy + if constexpr(AEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto a_block_buf = 
make_dynamic_buffer( + static_cast(p_shared), + SharedMemTrait::a_block_space_size_aligned); + + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, +/* typename ThreadClusterLengths, */ ABlockTransferThreadClusterLengths_K0_M_K1, +/* typename ThreadClusterArrangeOrder, */ ABlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ ADataType, +/* typename DstData, */ ADataType, +/* typename SrcDesc, */ decltype(a_grid_desc), +/* typename DstDesc, */ decltype(a_block_desc), +/* typename SrcDimAccessOrder, */ ABlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, +/* index_t SrcVectorDim, */ ABlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ ABlockTransferSrcScalarPerVector, +/* index_t DstScalarPerVector, */ ABlockTransferDstScalarPerVector_K1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool ThreadTransferSrcResetCoordinateAfterRun, */ AThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, + NumGemmKPrefetchStage>( + a_grid_desc, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + return make_tuple(a_block_buf, a_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> MRepeat -> MWaves -> K0PerWmma -> KRow -> MPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; + auto a_block_buf = make_static_buffer( + a_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto a_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + Number{}, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + ABlockTransferSrcScalarPerVector, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_grid_desc, + 
make_multi_index(0, + m_block_data_idx_on_grid/(MWaves * MPerWmma), + get_thread_local_1d_id() / 32, + 0, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(a_block_buf, a_blockwise_copy); + } + }; + + auto b_block_trait = [&](){ + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + SharedMemTrait::b_block_space_size_aligned); + + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BDataType, + BDataType, + decltype(b_grid_desc), + decltype(b_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + return make_tuple(b_block_buf, b_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; + auto b_block_buf = make_static_buffer( + b_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto b_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + Number{}, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + BBlockTransferSrcScalarPerVector, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc, + make_multi_index(0, + n_block_data_idx_on_grid/(NWaves * NPerWmma), + get_thread_local_1d_id() / 32, + 0, + (get_thread_local_1d_id() % 32 )/ 16, 
+ get_thread_local_1d_id() % 16, + 0)); + + return make_tuple(b_block_buf, b_blockwise_copy, scale_blockwise_copy); + } + }; + + auto scale_block_trait = [&](){ + if constexpr(BEnableLds) + { + constexpr auto K0PerBlock = KPerBlock/ K1; + auto scale_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::scale_block_space_offset, + SharedMemTrait::scale_block_space_size_aligned); + + auto scale_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + ScaleDataType, + ScaleDataType, + decltype(scale_grid_desc), + decltype(scale_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + 1>( + scale_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + ck::tensor_operation::element_wise::PassThrough{}, + scale_block_desc, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + return make_tuple(scale_block_buf, scale_blockwise_copy); + } + else + { + // Thread-wise copy + // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 + constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK/2/K1Value; + auto scale_block_buf = make_static_buffer( + scale_block_desc.GetElementSpaceSize()); + + // Limitation: NumDim of Src and Dst descriptor should be identical + auto scale_blockwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + Number{}, + I1, + Number{}, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + BBlockTransferSrcScalarPerVector, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + scale_grid_desc, + make_multi_index(0, + n_block_data_idx_on_grid/(NWaves * NPerWmma), + get_thread_local_1d_id() / 32, + 0, + (get_thread_local_1d_id() % 32 )/ 16, + get_thread_local_1d_id() % 16, + 
0)); + + return make_tuple(scale_block_buf, scale_blockwise_copy); + } + }; + + auto a_block_buf = a_block_trait()[I0]; + auto a_blockwise_copy = a_block_trait()[I1]; + + auto b_block_buf = b_block_trait()[I0]; + auto b_blockwise_copy = b_block_trait()[I1]; + + auto scale_block_buf = scale_block_trait()[I0]; + auto scale_blockwise_copy = scale_block_trait()[I1]; +/*******************************************************************************/ + // GEMM + constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); + + auto blockwise_gemm = + Blockwise_fpAintB_GemmWMMA{}; + + // Prepare Register for C matrix + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + +/*******************************************************************************/ + // Shift Per SUB_K + constexpr auto a_block_slice_copy_step = MakeABlockSliceCopyStep(); + constexpr auto b_block_slice_copy_step = MakeBBlockSliceCopyStep(); + + // gridwise GEMM pipeline + const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock); + /* + scale_blockwise_copy + */ + GridwiseGemmPipe::template Run(a_grid_desc, + a_block_desc, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc, + b_block_desc, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + scale_grid_desc, + scale_block_desc, + scale_blockwise_copy, + scale_grid_buf, + scale_block_buf, + blockwise_gemm, + c_thread_buf, + KBlockMainLoop); +/*******************************************************************************/ + // write out to C, implement shuffle + { + // C mapping in single thread. 
+ constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = + blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + // C mapping in single block + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp = + blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs(); + + constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I1); + constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I2); + constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I4); + constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I5); + constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(I6); + + // LDS descriptor, shuffle and write out in MRepeat x NRepeat times + constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat = + GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::c_shuffle_block_space_offset, + SharedMemTrait::c_shuffle_block_space_size); + + constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs = transform_tensor_descriptor( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // MRepeat per shuffle repeat + MWave, // MWave + MSubGroup, // MSubGroup * MAccVgprs = MPerWmma + MAccVgprs)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // NRepeat per 
shuffle repeat + NWave, // NWave + NThreadPerSubGroup))), // NThreadPerSubGroup = NPerWmma + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0, 1, 2, 6>{}, Sequence<>{}, Sequence<3, 4, 5>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MRepeat, MWave, MSubGroup, MAccVgprs))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(NRepeat, NWave, NThreadPerSubGroup))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6>, + 6, + 1, // vector write pixel + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + make_multi_index(0, + m_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 0, + n_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I2], + 
m_thread_data_on_block_idx[I3]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + CDataType, // typename DstData, + decltype(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for local reg & global memory + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerWmma, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto 
access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + // clang-format on + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 3ce216e2454..dd4112939db 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -550,6 +550,114 @@ struct GridwiseGemmPipeline_v1<1, false, false> } }; +template +struct GridwiseGemmPipeline_v1_dequant; + +template <> +struct GridwiseGemmPipeline_v1_dequant<1, true, true> +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + 
ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const ScaleGridDesc& scale_grid_desc, + const ScaleBlockDesc& scale_block_desc, + const ScaleGridBuffer& scale_grid_buf, + ScaleBlockBuffer& scale_block_buf, + ScaleBlockTransfer& scale_blockwise_copy, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // preload data into LDS + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + scale_blockwise_copy.RunRead(scale_grid_desc, scale_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + scale_blockwise_copy.RunWrite(scale_block_desc, scale_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + template struct GridwiseGemmPipelineInterwave_v1; 
From 0c51a35ea8a60adb8feb2fc7da876ea45c272730 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 25 Jul 2023 08:46:39 +0000 Subject: [PATCH 097/118] fpAintB kernel compile pass --- example/49_fpAintB_gemm/common.hpp | 89 +++++++++ .../49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 12 +- example/49_fpAintB_gemm/run_gemm_example.inc | 14 +- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 110 ++++++++--- .../device/impl/device_fpAintB_gemm_wmma.hpp | 155 +++++++-------- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 99 +++++++--- .../grid/gridwise_gemm_pipeline_selector.hpp | 5 + .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 4 +- include/ck/utility/data_type.hpp | 10 + .../cpu/reference_fpAintB_gemm.hpp | 177 ++++++++++++++++++ ...emm_wmma_f16_f16_f16_km_kn_mn_instance.cpp | 9 +- ...emm_wmma_f16_f16_f16_km_nk_mn_instance.cpp | 9 +- ...emm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp | 7 +- 13 files changed, 558 insertions(+), 142 deletions(-) create mode 100644 example/49_fpAintB_gemm/common.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp diff --git a/example/49_fpAintB_gemm/common.hpp b/example/49_fpAintB_gemm/common.hpp new file mode 100644 index 00000000000..1f67d53de2b --- /dev/null +++ b/example/49_fpAintB_gemm/common.hpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp" + +struct ProblemSize final +{ + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +inline bool +parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config) +{ + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + problem_size.M = std::stoi(argv[4]); + problem_size.N = std::stoi(argv[5]); + problem_size.K = std::stoi(argv[6]); + + problem_size.StrideA = std::stoi(argv[7]); + problem_size.StrideB = std::stoi(argv[8]); + problem_size.StrideC = std::stoi(argv[9]); + } + else + { + std::cerr << "arg1: verification 
(0=no, 1=yes)" << std::endl + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" + << std::endl + << "arg3: time kernel (0=no, 1=yes)" << std::endl + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl; + return false; + } + + return true; +} diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp index 96e4f747816..618f1e90982 100644 --- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -37,7 +37,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_ BElementOp, CElementOp, GemmDefault, - 2, // Prefetch stage + 1, // Prefetch stage 128, // BlockSize 128, // MPerBlock 64, // NPerBlock @@ -67,8 +67,14 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_ 8>; // clang-format on -using ReferenceGemmInstance = ck::tensor_operation::host:: - ReferenceGemm; +using ReferenceGemmInstance = ck::tensor_operation::host::ReferencefpAintBGemm; #include "run_gemm_example.inc" diff --git a/example/49_fpAintB_gemm/run_gemm_example.inc b/example/49_fpAintB_gemm/run_gemm_example.inc index 7d06ec4cb01..d50b592fec1 100644 --- a/example/49_fpAintB_gemm/run_gemm_example.inc +++ b/example/49_fpAintB_gemm/run_gemm_example.inc @@ -27,6 +27,8 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + // assume scale tensor is [1, n] + Tensor scale_k_n(f_host_tensor_descriptor(K, N, 0, BLayout{})); switch(config.init_method) { @@ -34,26 +36,32 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) case 1: ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 
5.f}(scale_k_n); break; case 2: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + ck::utils::FillUniformDistribution{-1.f, 1.f}(scale_k_n); break; case 3: ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(scale_k_n); break; case 4: ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(scale_k_n); break; case 5: ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(scale_k_n); break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + ck::utils::FillUniformDistribution{-1.f, 1.f}(scale_k_n); } Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); @@ -61,6 +69,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "scale_k_n: " << scale_k_n.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; #ifdef BUILD_INT4_EXAMPLE @@ -77,10 +86,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) #else DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem scale_k_n_device_buf(sizeof(ScaleDataType) * scale_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); 
b_k_n_device_buf.ToDevice(b_k_n.mData.data()); + scale_k_n_device_buf.ToDevice(scale_k_n.mData.data()); #endif auto a_element_op = AElementOp{}; @@ -98,6 +109,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) #else static_cast(a_m_k_device_buf.GetDeviceBuffer()), static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(scale_k_n_device_buf.GetDeviceBuffer()), static_cast(c_m_n_device_buf.GetDeviceBuffer()), #endif M, @@ -136,7 +148,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_argument = ref_gemm.MakeArgument( - a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + a_m_k, b_k_n, scale_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); ref_invoker.Run(ref_argument); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp index 283f5f87dae..472d6154a95 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -20,7 +20,7 @@ template {}; + WmmaGemm{}; static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); @@ -178,9 +179,10 @@ struct Blockwise_fpAintB_GemmWMMA } using Tuple6 = decltype(CalculateAThreadOriginDataIndex()); - __host__ __device__ BlockwiseGemmWMMA(Tuple6 a_origin = CalculateAThreadOriginDataIndex(), - Tuple6 b_origin = CalculateBThreadOriginDataIndex()) - : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + __host__ __device__ + Blockwise_fpAintB_GemmWMMA(Tuple6 a_origin = CalculateAThreadOriginDataIndex(), + Tuple6 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin), scale_thread_copy_(b_origin) { static_assert(ABlockDesc::IsKnownAtCompileTime() && 
BBlockDesc::IsKnownAtCompileTime(), "wrong! Desc should be known at compile-time"); @@ -290,8 +292,12 @@ struct Blockwise_fpAintB_GemmWMMA // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma static constexpr ABlockDesc a_block_desc_k0_m0_m1_m2_k1; static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1; + static constexpr ScaleBlockDesc scale_block_desc_1_n0_n1_n2_1; - template + template __device__ void Run(const ABlockBuffer& a_block_buf, const BBlockBuffer& b_block_buf, const ScaleBlockBuffer& scale_block_buf, @@ -305,8 +311,6 @@ struct Blockwise_fpAintB_GemmWMMA scale_thread_desc_.GetElementSpaceSize()); auto converted_b_thread_buf = b_thread_buf; - static constexpr auto dequantizer = Dequantizer{}; - // basic intrinsic to determine loopover direction if constexpr(MRepeat < NRepeat) { @@ -333,21 +337,22 @@ struct Blockwise_fpAintB_GemmWMMA b_thread_buf); // read weight scale scale_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, + scale_block_desc_1_n0_n1_n2_1, make_tuple(Number{}, n0, I0, I0, I0, I0), - b_block_buf, - b_scale_thread_desc_, + scale_block_buf, + scale_thread_desc_, make_tuple(I0, n0, I0, I0, I0, I0), - b_scale_thread_buf); + scale_thread_buf); - // convert B from int8 to fp16 - converted_b_thread_buf = type_convert(b_thread_buf); - - // multiply scale - dequantize(converted_b_thread_buf, scale_thread_buf); + // convert B from int8 to fp16, multiply scale + static_for<0, b_thread_buf.size(), 1>{}([&](auto i) { + converted_b_thread_buf(i) = + scale_thread_buf[i / WmmaK] * + type_convert(b_thread_buf[i]); + }); vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { a_thread_vec.template AsType()(i) = @@ -358,7 +363,7 @@ struct Blockwise_fpAintB_GemmWMMA (i / A_K1) % A_KRow, 0, i % A_K1))>{}]; - b_thread_vec.template AsType()(i) = + b_thread_vec.template AsType()(i) = converted_b_thread_buf[Number::type; - using wmma_input_type_b = typename vector_type::type; + 
using wmma_input_type_b = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -396,6 +401,20 @@ struct Blockwise_fpAintB_GemmWMMA b_thread_desc_, make_tuple(I0, n0, I0, I0, I0, I0), b_thread_buf); + // read weight scale + scale_thread_copy_.Run( + scale_block_desc_1_n0_n1_n2_1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + scale_block_buf, + scale_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + scale_thread_buf); + + // convert B from int8 to fp16, multiply scale + static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) { + converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] * + type_convert(b_thread_buf[i]); + }); // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, @@ -406,11 +425,11 @@ struct Blockwise_fpAintB_GemmWMMA a_thread_buf); vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { - b_thread_vec.template AsType()(i) = - b_thread_buf[Number()(i) = + converted_b_thread_buf[Number::type; - using wmma_input_type_b = typename vector_type::type; + using wmma_input_type_b = typename vector_type::type; constexpr index_t c_offset = c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); @@ -472,6 +491,15 @@ struct Blockwise_fpAintB_GemmWMMA Number{}, Number<1>{})); + static constexpr auto scale_thread_desc_ = + make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{}), + make_tuple(I0, I1, I0, I0, I0, I0)); + // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); @@ -548,8 +576,42 @@ struct Blockwise_fpAintB_GemmWMMA TransposeC ? 
true : false>; }; + template + struct ScaleThreadCopySelector; + + template <> + struct ScaleThreadCopySelector + { + using type = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + B_K1, + B_K1>; + }; + + template <> + struct ScaleThreadCopySelector + { + using type = ThreadwiseTensorSliceTransfer_StaticToStatic< + ScaleDataType, + ScaleDataType, + decltype(scale_block_desc_1_n0_n1_n2_1), + decltype(scale_thread_desc_), + tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<0, 1, 2, 3, 4, 5>, + 5, + 1>; + }; + typename AThreadCopySelector::type a_thread_copy_; typename BThreadCopySelector::type b_thread_copy_; + typename ScaleThreadCopySelector::type scale_thread_copy_; }; } // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 41ecbbb5321..0cff0aae769 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -22,7 +22,7 @@ namespace tensor_operation { namespace device { // 1. DequantB(K, N) = int2fp(B(K, N)) * scale(1, N) -// 2. C(M, N) = A(M, K) * DequantB(K, N) +// 2. 
C(M, N) = A(M, K) * DequantB(K, N) template + ck::PipelineVersion PipelineVer = ck::PipelineVersion::dequant_v1> struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB + BLayout, + CLayout, + ADataType, + BDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation> { static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -103,7 +103,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB{MPerBlock, NPerBlock, KPerBlock}; - + using DeviceOp = DeviceFpAintBGemm_Wmma_CShuffle; // Describe how data read from Global memory @@ -183,7 +183,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB; + using GridwiseGemm = GridwiseFpAintBGemm_Wmma< + BlockSize, + ADataType, + BDataType, + ScaleDataType, + AccDataType, + CShuffleDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc, + BGridDesc, + ScaleGridDesc, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + KPerBlock, + MPerWmma, + NPerWmma, + K1, + MRepeat, + NRepeat, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + AEnableLds, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BEnableLds, + BBlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + NumPrefetch, + LoopSched, + PipelineVer>; // Argument struct 
Argument : public BaseArgument { Argument(const ADataType* p_a_grid, const BDataType* p_b_grid, - const ScaleDataType* p_scale, + const ScaleDataType* p_scale_grid, CDataType* p_c_grid, index_t M, index_t N, @@ -310,7 +311,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB, remove_reference_t, - remove_reference_t, + remove_reference_t, remove_reference_t< typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock>, AElementwiseOperation, @@ -422,9 +423,11 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB MakeArgumentPointer(const void* p_a, const void* p_b, + const void* p_scale, void* p_c, index_t M, index_t N, @@ -595,6 +599,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB(static_cast(p_a), static_cast(p_b), + static_cast(p_scale), static_cast(p_c), M, N, @@ -623,8 +628,10 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB LoopSchedToString{ {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; - std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, - {PipelineVersion::v2, "v2"}}; + std::map PipelineVersionToString{ + {PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}, + {PipelineVersion::dequant_v1, "dequant_v1"}}; // clang-format off str << "DeviceFpAintBGemm_Wmma_CShuffle" diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index 2ded2c0d1bb..3f5af4bf9d7 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -20,9 +20,11 @@ namespace ck { template (p_a_grid, p_b_grid, + p_scale_grid, p_c_grid, p_shared, a_grid_desc, b_grid_desc, + scale_grid_desc, c_grid_desc_mblock_mperblock_nblock_nperblock, a_element_op, b_element_op, @@ -63,9 +69,11 @@ __global__ void #else ignore = p_a_grid; ignore = p_b_grid; + ignore = 
p_scale_grid; ignore = p_c_grid; ignore = a_grid_desc; ignore = b_grid_desc; + ignore = scale_grid_desc; ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; ignore = a_element_op; ignore = b_element_op; @@ -77,12 +85,14 @@ __global__ void template + PipelineVersion PipelineVer = PipelineVersion::dequant_v1> struct GridwiseFpAintBGemm_Wmma { static constexpr auto I0 = Number<0>{}; @@ -140,7 +150,12 @@ struct GridwiseFpAintBGemm_Wmma using ThisThreadBlock = ThisThreadBlock; - using GridwiseGemmPipe = GridwiseGemmPipeline_v1_dequant; + using GridwiseGemmPipe = + remove_cvref_t())>; // Describe how data store to (LDS/VGPR) buffer from Global memory __host__ __device__ static constexpr auto MakeABlockDescriptor() @@ -237,6 +252,38 @@ struct GridwiseFpAintBGemm_Wmma return b_block_desc; } + __host__ __device__ static constexpr auto MakeScaleBlockDescriptor() + { + // Scale [1, N], all K related dimension reduce to 1 + constexpr auto scale_block_desc = [&]() { + if constexpr(BEnableLds) + { + // K0->N->K1 Per Block + constexpr auto K0PerBlock = KPerBlock / K1; + + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(I0, I1, I0)); + } + else + { + constexpr auto KWmmaPerblock = KPerBlock / WmmaK; + constexpr auto K0PerWmma = WmmaK / 2 / K1; + // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread + return make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + I1, + K1), + make_tuple(I0, I1, I0, I0, I0, I0, I0)); + } + }(); + + return scale_block_desc; + } + __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() { constexpr auto a_block_copy_step = [&]() { @@ -537,9 +584,15 @@ struct GridwiseFpAintBGemm_Wmma BEnableLds ? math::integer_least_multiple(MakeBBlockDescriptor().GetElementSpaceSize(), max_lds_align) : 0; + static constexpr auto scale_block_space_size_aligned = + BEnableLds ? 
math::integer_least_multiple( + MakeScaleBlockDescriptor().GetElementSpaceSize(), max_lds_align) + : 0; static constexpr auto a_block_space_offset = 0; static constexpr auto b_block_space_offset = a_block_space_size_aligned; + static constexpr auto scale_block_space_offset = + b_block_space_offset + b_block_space_size_aligned; // LDS allocation for C shuffle in LDS static constexpr auto c_shuffle_block_space_size = @@ -551,7 +604,8 @@ struct GridwiseFpAintBGemm_Wmma static constexpr auto lds_size = math::max(c_shuffle_block_space_size * sizeof(CShuffleDataType), a_block_space_size_aligned * sizeof(ADataType) + - b_block_space_size_aligned * sizeof(BDataType)); + b_block_space_size_aligned * sizeof(BDataType) + + scale_block_space_size_aligned * sizeof(ScaleDataType)); }; template @@ -609,7 +663,7 @@ struct GridwiseFpAintBGemm_Wmma constexpr auto a_block_desc = MakeABlockDescriptor(); constexpr auto b_block_desc = MakeBBlockDescriptor(); - constexpr auto scale_block_desc = MakeBBlockDescriptor(); + constexpr auto scale_block_desc = MakeScaleBlockDescriptor(); auto a_block_trait = [&](){ // A matrix blockwise copy @@ -768,7 +822,7 @@ struct GridwiseFpAintBGemm_Wmma get_thread_local_1d_id() % 16, 0)); - return make_tuple(b_block_buf, b_blockwise_copy, scale_blockwise_copy); + return make_tuple(b_block_buf, b_blockwise_copy); } }; @@ -776,13 +830,14 @@ struct GridwiseFpAintBGemm_Wmma if constexpr(BEnableLds) { constexpr auto K0PerBlock = KPerBlock/ K1; + auto scale_block_buf = make_dynamic_buffer( static_cast(p_shared) + SharedMemTrait::scale_block_space_offset, SharedMemTrait::scale_block_space_size_aligned); auto scale_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1, @@ -802,10 +857,10 @@ struct GridwiseFpAintBGemm_Wmma 1, BThreadTransferSrcResetCoordinateAfterRun, true, - 1>( + NumGemmKPrefetchStage>( scale_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), - ck::tensor_operation::element_wise::PassThrough{}, + b_element_op, scale_block_desc, 
make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); @@ -815,13 +870,12 @@ struct GridwiseFpAintBGemm_Wmma else { // Thread-wise copy - // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; constexpr auto K0PerWmma = WmmaK/2/K1Value; + // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 auto scale_block_buf = make_static_buffer( scale_block_desc.GetElementSpaceSize()); - // Limitation: NumDim of Src and Dst descriptor should be identical auto scale_blockwise_copy = ThreadwiseTensorSliceTransfer_v2(a_grid_desc, a_block_desc, a_blockwise_copy, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index 48bd22a764a..48295b638cf 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -12,6 +12,7 @@ enum struct PipelineVersion { v1, v2, + dequant_v1, }; template {}; + } else { std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index dd4112939db..cf5c9066b9a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -600,9 +600,9 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true> const BBlockTransferStep& b_block_copy_step, const ScaleGridDesc& scale_grid_desc, const ScaleBlockDesc& scale_block_desc, + ScaleBlockTransfer& scale_blockwise_copy, const ScaleGridBuffer& scale_grid_buf, ScaleBlockBuffer& scale_block_buf, - ScaleBlockTransfer& scale_blockwise_copy, const BlockwiseGemm& blockwise_gemm, CThreadBuffer& c_thread_buf, index_t num_loop) @@ -653,7 +653,7 @@ struct 
GridwiseGemmPipeline_v1_dequant<1, true, true> { block_sync_lds(); - blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); } } }; diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 8d3f2dbd633..0e07c20ae55 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -1090,6 +1090,16 @@ inline __host__ __device__ constexpr bhalf_t type_convert(int8_ return type_convert(x_fp32); } +// convert int8 to fp16 via fp32 +template <> +inline __host__ __device__ constexpr half_t type_convert(int8_t x) +{ + // TODO: replace it with fast_converter + float x_fp32 = static_cast(x); + + return type_convert(x_fp32); +} + // Declare a template function for bf16 conversion using RTN template __host__ __device__ constexpr Y bf16_convert_rtn(X x); diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp new file mode 100644 index 00000000000..ac392f09069 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferencefpAintBGemm : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k, + const Tensor& b_k_n, + const Tensor& scale_k_n, + Tensor& c_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_m_k_{a_m_k}, + b_k_n_{b_k_n}, + scale_k_n_{scale_k_n}, + c_m_n_{c_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_m_k_; + const Tensor& b_k_n_; + const Tensor& scale_k_n_; + Tensor& c_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferencefpAintBGemm::Argument; + + float Run(const Argument& arg) + { + auto f_mk_kn_mn = [&](auto m, auto n) { + const int K = arg.a_m_k_.mDesc.GetLengths()[1]; + + AccDataType v_acc = 0; + + for(int k = 0; k < K; ++k) + { + ADataType v_a; + BDataType v_b; + ScaleDataType v_scale; + ADataType v_converted_b; + + // use PassThrough instead of ConvertBF16RTN for reference calculation + if constexpr(is_same_v) + { + ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k)); + } + else + { + arg.a_element_op_(v_a, arg.a_m_k_(m, k)); + } + + // same for B matrix + if constexpr(is_same_v) + { + ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n)); + } + else + { + arg.b_element_op_(v_b, arg.b_k_n_(k, n)); + } + + // same for scale matrix + if constexpr(is_same_v) + { + ck::tensor_operation::element_wise::PassThrough{}(v_scale, + 
arg.scale_k_n_(k, n)); + } + else + { + arg.b_element_op_(v_scale, arg.scale_k_n_(k, n)); + } + + v_converted_b = type_convert(v_b) * v_scale; + v_acc += ck::type_convert(v_a) * + ck::type_convert(v_converted_b); + } + + AccDataType v_c; + + arg.c_element_op_(v_c, v_acc); + + arg.c_m_n_(m, n) = ck::type_convert(v_c); + }; + + make_ParallelTensorFunctor( + f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k, + const Tensor& b_k_n, + const Tensor& scale_k_n, + Tensor& c_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{a_m_k, b_k_n, scale_k_n, c_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceGemm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp index e757049b4e1..f3665eb8d8e 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp @@ -29,9 +29,8 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; // Compilation parameters for a[k, m] * b[k, n] = c[m, n] -using device_gemm_wmma_f16_f16_f16_km_kn_mn_instances = - std::tuple< - // clang-format off +using device_gemm_wmma_f16_f16_f16_km_kn_mn_instances = std::tuple< + // clang-format off //######################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumPrefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| M| N| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer| //######################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise|Specialization| | Size| Block| Block| Block| | WMMA| WMMA| Repeat| Repeat| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MRepeat| MRepeat| ClusterLengths| ScalarPerVector| //######################| | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerStore| PerStore| MBlock_MPerBlock| | @@ -62,8 +61,8 @@ using device_gemm_wmma_f16_f16_f16_km_kn_mn_instances = // 1 Wave DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 32, 64, 8, 16, 16, 1, 2, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 
1, 1, S<1, 16, 1, 2>, 8>, DeviceGemmWmma_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 2>, 8> - // clang-format on - >; + // clang-format on + >; void add_device_gemm_wmma_f16_f16_f16_km_kn_mn_instances( std::vector, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8>, DeviceGemmWmma_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> - // clang-format on - >; + // clang-format on + >; void add_device_gemm_wmma_f16_f16_f16_km_nk_mn_instances( std::vector, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<2, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 1, 1, S<1, 16, 1, 2>, 8> #endif // clang-format on - >; + >; void add_device_gemm_wmma_f16_f16_f16_mk_kn_mn_instances( std::vector Date: Fri, 28 Jul 2023 07:29:32 +0000 Subject: [PATCH 098/118] Sanity pass. 
--- .../49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 2 +- example/49_fpAintB_gemm/run_gemm_example.inc | 48 +++++++- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 104 +++++++++++++--- .../device/impl/device_fpAintB_gemm_wmma.hpp | 53 ++++++++- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 75 ++++++++++-- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 112 +++++++++++++++++- 6 files changed, 359 insertions(+), 35 deletions(-) diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp index 618f1e90982..8ff1077da4a 100644 --- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -9,7 +9,7 @@ using ADataType = ck::half_t; using BDataType = int8_t; using ScaleDataType = ck::half_t; using AccDataType = float; -using CShuffleDataType = float; +using CShuffleDataType = ck::half_t; using CDataType = ck::half_t; using ALayout = Row; diff --git a/example/49_fpAintB_gemm/run_gemm_example.inc b/example/49_fpAintB_gemm/run_gemm_example.inc index d50b592fec1..913a18d7a4b 100644 --- a/example/49_fpAintB_gemm/run_gemm_example.inc +++ b/example/49_fpAintB_gemm/run_gemm_example.inc @@ -28,7 +28,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); // assume scale tensor is [1, n] - Tensor scale_k_n(f_host_tensor_descriptor(K, N, 0, BLayout{})); + Tensor scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{})); switch(config.init_method) { @@ -51,7 +51,7 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) case 4: ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); - ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(scale_k_n); + ck::utils::FillUniformDistributionIntegerValue{2.f, 2.f}(scale_k_n); break; case 5: 
ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k); @@ -64,6 +64,50 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) ck::utils::FillUniformDistribution{-1.f, 1.f}(scale_k_n); } +#if 0 + printf("Matrix A:\n"); + for (int im = 0; im < M; im++) + { + for (int ik = 0; ik < K; ik++) + { + if(ik % 16 == 0){ + printf("|"); + } + + printf(" %04x", *(reinterpret_cast(&a_m_k(im,ik)))); + } + printf("\n"); + } + + printf("Matrix B:\n"); + for (int in = 0; in < N; in++) + { + for (int ik = 0; ik < K; ik++) + { + if(ik % 16 == 0){ + printf("|"); + } + + printf(" %02x", b_k_n(ik,in)); + } + printf("\n"); + } + + printf("Matrix Scale:\n"); + for (int in = 0; in < N; in++) + { + for (int ik = 0; ik < K; ik++) + { + if(ik % 16 == 0){ + printf("|"); + } + + printf(" %04x", *(reinterpret_cast(&scale_k_n(ik,in)))); + } + printf("\n"); + } + #endif + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp index 472d6154a95..434b69d5786 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -309,7 +309,8 @@ struct Blockwise_fpAintB_GemmWMMA b_thread_desc_.GetElementSpaceSize()); auto scale_thread_buf = make_static_buffer( scale_thread_desc_.GetElementSpaceSize()); - auto converted_b_thread_buf = b_thread_buf; + auto converted_b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); // basic intrinsic to determine loopover direction if constexpr(MRepeat < NRepeat) @@ -345,7 +346,7 @@ struct Blockwise_fpAintB_GemmWMMA scale_thread_buf); // convert B from int8 to fp16, multiply scale - static_for<0, b_thread_buf.size(), 1>{}([&](auto i) { + static_for<0, 
b_thread_buf.Size(), 1>{}([&](auto i) { converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] * type_convert(b_thread_buf[i]); @@ -390,6 +391,20 @@ struct Blockwise_fpAintB_GemmWMMA else { static_for<0, NRepeat, 1>{}([&](auto n0) { + // read weight scale + scale_thread_copy_.Run( + scale_block_desc_1_n0_n1_n2_1, + make_tuple(I0, n0, I0, I0, I0, I0), + scale_block_buf, + scale_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + scale_thread_buf); +#if 0 + printf("Tid: %03d, n: %02d, scale_thread_buf: %04x\n", + get_thread_local_1d_id(), n0.value, + *(reinterpret_cast(&scale_thread_buf[n0])) + ); +#endif static_for<0, MRepeat, 1>{}([&](auto m0) { static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of // k=0,kpack*1, .. @@ -400,16 +415,7 @@ struct Blockwise_fpAintB_GemmWMMA b_block_buf, b_thread_desc_, make_tuple(I0, n0, I0, I0, I0, I0), - b_thread_buf); - // read weight scale - scale_thread_copy_.Run( - scale_block_desc_1_n0_n1_n2_1, - make_tuple(Number{}, n0, I0, I0, I0, I0), - scale_block_buf, - scale_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - scale_thread_buf); - + b_thread_buf); // convert B from int8 to fp16, multiply scale static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) { converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] * @@ -423,7 +429,71 @@ struct Blockwise_fpAintB_GemmWMMA a_thread_desc_, make_tuple(I0, m0, I0, I0, I0, I0), a_thread_buf); - + if (true){ +#if 0 + printf("Tid: %03d, m, n, k: %02d, %02d, %02d, a_thread_buf: %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x|\n", + get_thread_local_1d_id(), m0.value, n0.value, k.value, + *(reinterpret_cast(&a_thread_buf[Number<0>{}])), + *(reinterpret_cast(&a_thread_buf[Number<1>{}])), + *(reinterpret_cast(&a_thread_buf[Number<2>{}])), + *(reinterpret_cast(&a_thread_buf[Number<3>{}])), + *(reinterpret_cast(&a_thread_buf[Number<4>{}])), + *(reinterpret_cast(&a_thread_buf[Number<5>{}])), + *(reinterpret_cast(&a_thread_buf[Number<6>{}])), 
+ *(reinterpret_cast(&a_thread_buf[Number<7>{}])), + *(reinterpret_cast(&a_thread_buf[Number<8>{}])), + *(reinterpret_cast(&a_thread_buf[Number<9>{}])), + *(reinterpret_cast(&a_thread_buf[Number<10>{}])), + *(reinterpret_cast(&a_thread_buf[Number<11>{}])), + *(reinterpret_cast(&a_thread_buf[Number<12>{}])), + *(reinterpret_cast(&a_thread_buf[Number<13>{}])), + *(reinterpret_cast(&a_thread_buf[Number<14>{}])), + *(reinterpret_cast(&a_thread_buf[Number<15>{}])) + ); +#endif +#if 0 + printf("Tid: %03d, m, n, k: %02d, %02d, %02d, b_thread_buf: %02x %02x %02x %02x| %02x %02x %02x %02x| %02x %02x %02x %02x| %02x %02x %02x %02x|\n", + get_thread_local_1d_id(), m0.value, n0.value, k.value, + b_thread_buf[Number<0>{}], + b_thread_buf[Number<1>{}], + b_thread_buf[Number<2>{}], + b_thread_buf[Number<3>{}], + b_thread_buf[Number<4>{}], + b_thread_buf[Number<5>{}], + b_thread_buf[Number<6>{}], + b_thread_buf[Number<7>{}], + b_thread_buf[Number<8>{}], + b_thread_buf[Number<9>{}], + b_thread_buf[Number<10>{}], + b_thread_buf[Number<11>{}], + b_thread_buf[Number<12>{}], + b_thread_buf[Number<13>{}], + b_thread_buf[Number<14>{}], + b_thread_buf[Number<15>{}] + ); +#endif +#if 0 + printf("Tid: %03d, m, n, k: %02d, %02d, %02d, converted_b_thread_buf: %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x|\n", + get_thread_local_1d_id(), m0.value, n0.value, k.value, + *(reinterpret_cast(&converted_b_thread_buf[Number<0>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<1>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<2>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<3>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<4>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<5>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<6>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<7>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<8>{}])), + 
*(reinterpret_cast(&converted_b_thread_buf[Number<9>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<10>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<11>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<12>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<13>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<14>{}])), + *(reinterpret_cast(&converted_b_thread_buf[Number<15>{}])) + ); +#endif + } vector_type a_thread_vec; vector_type b_thread_vec; @@ -497,7 +567,7 @@ struct Blockwise_fpAintB_GemmWMMA I1, Number{}, I1, - Number{}), + I1), make_tuple(I0, I1, I0, I0, I0, I0)); // C[M, N, NumRegWMMA] @@ -587,11 +657,11 @@ struct Blockwise_fpAintB_GemmWMMA ScaleDataType, decltype(scale_block_desc_1_n0_n1_n2_1), decltype(scale_thread_desc_), - Sequence, + Sequence, Sequence<0, 1, 2, 3, 4, 5>, 5, - B_K1, - B_K1>; + 1, + 1>; }; template <> diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 0cff0aae769..cb6678e391b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -182,8 +182,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + constexpr auto B_KRow = 2; + constexpr auto B_K0PerWmma = WmmaK / B_KRow / K1Number; + const auto B_KWmma = K / WmmaK; + + const auto N0 = N / NPerBlock; + // 0 1 0 1 2 3 4 5 6 + // M - K <-> A_KWmma - MBlock*MRepeat - MWaves - A_K0PerWmma - A_KRow - MPerWmma - A_K1 + return transform_tensor_descriptor( + scale_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple( + B_KWmma, Number{}, Number{}, K1Number)), + make_unmerge_transform( + make_tuple(N0 * NRepeat, Number{}, Number{}))), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 3, 4, 6>{}, 
Sequence<1, 2, 5>{})); + } + } + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) { const auto c_grid_desc_mraw_nraw = [&]() { @@ -237,7 +282,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB(p_a_grid, p_b_grid, @@ -262,7 +268,7 @@ struct GridwiseFpAintBGemm_Wmma constexpr auto K0PerBlock = KPerBlock / K1; return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, K1), + make_tuple(Number{}, Number{}, I1), make_tuple(I0, I1, I0)); } else @@ -276,7 +282,7 @@ struct GridwiseFpAintBGemm_Wmma Number{}, I1, I1, - K1), + I1), make_tuple(I0, I1, I0, I0, I0, I0, I0)); } }(); @@ -424,6 +430,52 @@ struct GridwiseFpAintBGemm_Wmma return b_wave_desc; } + template + __host__ __device__ static constexpr auto MakeScaleWaveDescriptor(const ScaleBlockDesc_&) + { + constexpr auto scale_wave_desc = [&]() { + if constexpr(BEnableLds) + { + // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 + constexpr auto B_K0 = ScaleBlockDesc_{}.GetLength(I0); + constexpr auto B_K1 = ScaleBlockDesc_{}.GetLength(I2); + constexpr auto B_KRow = I1; + return transform_tensor_descriptor( + ScaleBlockDesc_{}, + make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); + } + else + { + // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 + constexpr auto KWmma = ScaleBlockDesc_{}.GetLength(I0); + constexpr auto K0PerWmma = ScaleBlockDesc_{}.GetLength(I3); + constexpr auto B_KRow = ScaleBlockDesc_{}.GetLength(I4); + constexpr auto B_K1 = ScaleBlockDesc_{}.GetLength(I6); + + // Workaround, Freeze transform + return make_naive_tensor_descriptor(make_tuple(Number{}, + Number{}, + I1, + Number{}, + I1, + Number{}), + make_tuple(I0, + I1, + I0, + I0, + I0, + I0)); + } + }(); + 
+ return scale_wave_desc; + } + __host__ __device__ static constexpr auto // *Caution Here repeat is shuffle repeat GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() @@ -590,9 +642,10 @@ struct GridwiseFpAintBGemm_Wmma : 0; static constexpr auto a_block_space_offset = 0; - static constexpr auto b_block_space_offset = a_block_space_size_aligned; + static constexpr auto b_block_space_offset = + (a_block_space_offset + a_block_space_size_aligned) * sizeof(ADataType)/sizeof(BDataType); static constexpr auto scale_block_space_offset = - b_block_space_offset + b_block_space_size_aligned; + (b_block_space_offset + b_block_space_size_aligned) * sizeof(BDataType)/sizeof(ScaleDataType); // LDS allocation for C shuffle in LDS static constexpr auto c_shuffle_block_space_size = @@ -753,7 +806,7 @@ struct GridwiseFpAintBGemm_Wmma auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + SharedMemTrait::b_block_space_offset, SharedMemTrait::b_block_space_size_aligned); - + // printf("b_lds_offset: %lu\n", SharedMemTrait::b_block_space_offset); auto b_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1( static_cast(p_shared) + SharedMemTrait::scale_block_space_offset, SharedMemTrait::scale_block_space_size_aligned); - + // printf("scale_lds_offset: %lu\n", SharedMemTrait::scale_block_space_offset); + auto scale_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1, + // Reduce slice length K1 to 1 + Sequence, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, ScaleDataType, @@ -851,10 +906,10 @@ struct GridwiseFpAintBGemm_Wmma Sequence<0, 1, 2>, BBlockTransferSrcVectorDim, 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, 1, 1, + 1, // no effect + 1, // no effect BThreadTransferSrcResetCoordinateAfterRun, true, NumGemmKPrefetchStage>( @@ -926,7 +981,7 @@ struct GridwiseFpAintBGemm_Wmma AccDataType, decltype(MakeAWaveDescriptor(a_block_desc)), decltype(MakeBWaveDescriptor(b_block_desc)), - 
decltype(MakeBWaveDescriptor(scale_block_desc)), + decltype(MakeScaleWaveDescriptor(scale_block_desc)), MPerBlock, NPerBlock, KPerBlock, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index cf5c9066b9a..3a04213a9a2 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -581,9 +581,9 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true> typename BBlockTransferStep, typename ScaleGridDesc, typename ScaleBlockDesc, + typename ScaleBlockTransfer, typename ScaleGridBuffer, typename ScaleBlockBuffer, - typename ScaleBlockTransfer, typename BlockwiseGemm, typename CThreadBuffer> __device__ static void Run(const AGridDesc& a_grid_desc, @@ -658,6 +658,116 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, true> } }; +template <> +struct GridwiseGemmPipeline_v1_dequant<1, true, false> +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const ScaleGridDesc& scale_grid_desc, + const ScaleBlockDesc& scale_block_desc, + ScaleBlockTransfer& scale_blockwise_copy, + const ScaleGridBuffer& scale_grid_buf, + ScaleBlockBuffer& scale_block_buf, + const BlockwiseGemm& blockwise_gemm, 
+ CThreadBuffer& c_thread_buf, + index_t num_loop) + { + constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0); + auto b_block_buf_switch = b_block_buf; + + // preload data into LDS + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.Run( + b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf); + scale_blockwise_copy.Run(scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + b_blockwise_copy.Run( + b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch); + + block_sync_lds(); + + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + + b_block_buf = b_block_buf_switch; + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); + + block_sync_lds(); + } + } +}; + template struct GridwiseGemmPipelineInterwave_v1; From 32bac6f3bc432beeda4a9033170b33bf06c7a493 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 1 Aug 2023 06:40:46 +0000 Subject: [PATCH 099/118] Temp save --- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 2 +- .../gpu/thread/threadwise_tensor_slice_transfer.hpp | 9 ++++----- .../thread/threadwise_tensor_slice_transfer_v3r1.hpp | 10 ++++++++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git 
a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp index 434b69d5786..cfd49668597 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -419,7 +419,7 @@ struct Blockwise_fpAintB_GemmWMMA // convert B from int8 to fp16, multiply scale static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) { converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] * - type_convert(b_thread_buf[i]); + type_convert(b_thread_buf[i]); // call byte permute }); // read A a_thread_copy_.Run( diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 570d4e725bd..3832b522ef4 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1143,7 +1143,9 @@ struct ThreadwiseTensorSliceTransfer_v4 const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( src_desc, src_data_coord); - +#if 0 + printf("Tid: %03d, LDS read offset: %d\n", get_thread_local_1d_id(), src_data_coord.GetOffset()); +#endif // copy data from src_buf into src_tmp_vector if constexpr(SrcBuffer::IsDynamicBuffer()) { @@ -1417,10 +1419,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow 1, 0); v_theother_row = type_convert_sp(temp); - // if (get_thread_local_1d_id() == 0){ - // printf("src_offset:%d, dst_offset for this row: %d, dst_offset - // for the other row: %d \n", - // src_offset, dst_offset, dst_offset+DstScalarPerVector);} + if(get_thread_local_1d_id() % 32 < 16) { // apply type convert diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp 
index 6665d765f81..78f25091eac 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -207,7 +207,11 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // copy data from src_buf into src_vector_container auto src_vector_container = src_vector_type{ src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; - + if (false){ + printf("Tid: %03d, a_grid_buf: %04x\n", + get_thread_local_1d_id(), + *(reinterpret_cast(&src_vector_container.template AsType()[Number<0>{}]))); + } // copy data from src_vector_container into src_thread_scratch_ src_thread_scratch_tuple_(thread_scratch_id) .template SetAsType( @@ -442,7 +446,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1 const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); - +#if 0 + printf("Tid: %03d, LDS write offset: %d\n", get_thread_local_1d_id(), dst_coord_.GetOffset()); +#endif using dst_vector_type = vector_type_maker_t; using dst_vector_t = typename dst_vector_type::type; From 5cf73a5e3a42175aa1fa6d7e40113dd97a7b6f7e Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 3 Aug 2023 01:37:00 +0000 Subject: [PATCH 100/118] debug code enabled --- example/49_fpAintB_gemm/common.hpp | 34 +++++ .../49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 18 ++- example/49_fpAintB_gemm/run_gemm_example.inc | 58 +++++--- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 125 +++++++++--------- .../device/impl/device_fpAintB_gemm_wmma.hpp | 5 +- .../element/unary_element_wise_operation.hpp | 91 +++++++++++++ .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 31 +++-- .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 3 +- .../threadwise_tensor_slice_transfer.hpp | 4 +- .../threadwise_tensor_slice_transfer_v3r1.hpp | 8 +- include/ck/utility/amd_buffer_addressing.hpp | 111 +++++++++++++++- include/ck/utility/data_type.hpp | 16 +++ script/clang-format-overwrite.sh | 4 +- 13 files 
changed, 402 insertions(+), 106 deletions(-) diff --git a/example/49_fpAintB_gemm/common.hpp b/example/49_fpAintB_gemm/common.hpp index 1f67d53de2b..4fb4c41d056 100644 --- a/example/49_fpAintB_gemm/common.hpp +++ b/example/49_fpAintB_gemm/common.hpp @@ -48,6 +48,40 @@ using Col = ck::tensor_layout::gemm::ColumnMajor; using PassThrough = ck::tensor_operation::element_wise::PassThrough; +template +struct UnsignedWeightPreprocessor +{ +}; + +template <> +struct UnsignedWeightPreprocessor +{ + using UnsignedWeight = Tensor; + using SignedWeight = Tensor; + static UnsignedWeight convert(SignedWeight const& Input) + { + + UnsignedWeight Output = Input.template CopyAsType(); + + auto f_kn = [&](auto k, auto n) { + const uint8_t adder = 128; + int8_t v_signed_weight; + uint8_t v_unsigned_weight; + + ck::tensor_operation::element_wise::PassThrough{}(v_signed_weight, Input(k, n)); + v_unsigned_weight = ck::type_convert(v_signed_weight) + adder; + Output(k, n) = v_unsigned_weight; + }; + + make_ParallelTensorFunctor(f_kn, Input.mDesc.GetLengths()[0], Input.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return Output; + } + + UnsignedWeight operator()(SignedWeight const& Input) { return convert(Input); } +}; + inline bool parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config) { diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp index 8ff1077da4a..e8776a94bcd 100644 --- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -5,8 +5,18 @@ #include "ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp" +// Implementation follows the paper: +// Kim, Young Jin, Rawn Henry, Raffy Fahim, and Hany Hassan Awadalla. “Who Says Elephants Can’t Run: +// Bringing Large Scale MoE Models into Cloud Scale Production.” arXiv, November 17, 2022. +// https://doi.org/10.48550/arXiv.2211.10017. 
Assume weight (Matrix B) is preprocessed by adding +// an offset to make it unsigned. + +// The DeviceOp is CDataType = ADataType * Dequant(BDataType) * ScaleDataType +// The HostRef is CDataType = ADataType * Dequant(QuantDataType) * ScaleDataType + using ADataType = ck::half_t; -using BDataType = int8_t; +using QuantDataType = int8_t; +using BDataType = uint8_t; using ScaleDataType = ck::half_t; using AccDataType = float; using CShuffleDataType = ck::half_t; @@ -40,13 +50,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_ 1, // Prefetch stage 128, // BlockSize 128, // MPerBlock - 64, // NPerBlock + 128, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave - 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, @@ -68,7 +78,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_ // clang-format on using ReferenceGemmInstance = ck::tensor_operation::host::ReferencefpAintBGemm a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); - Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor quant_b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); // assume scale tensor is [1, n] Tensor scale_k_n(f_host_tensor_descriptor(K, N, 0, Row{})); @@ -35,35 +35,38 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) case 0: break; case 1: ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(quant_b_k_n); ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(scale_k_n); break; case 2: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + ck::utils::FillUniformDistribution{-1.f, 1.f}(quant_b_k_n); ck::utils::FillUniformDistribution{-1.f, 
1.f}(scale_k_n); break; case 3: ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(quant_b_k_n); ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(scale_k_n); break; case 4: ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(quant_b_k_n); ck::utils::FillUniformDistributionIntegerValue{2.f, 2.f}(scale_k_n); break; case 5: ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(quant_b_k_n); ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(scale_k_n); break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); - ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + ck::utils::FillUniformDistribution{-1.f, 1.f}(quant_b_k_n); ck::utils::FillUniformDistribution{-1.f, 1.f}(scale_k_n); } + UnsignedWeightPreprocessor preprocessor; + Tensor b_k_n = preprocessor(quant_b_k_n); + #if 0 printf("Matrix A:\n"); for (int im = 0; im < M; im++) @@ -78,8 +81,9 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) } printf("\n"); } - - printf("Matrix B:\n"); +#endif +#if 0 + printf("Matrix QuantB:\n"); for (int in = 0; in < N; in++) { for (int ik = 0; ik < K; ik++) @@ -88,12 +92,29 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) printf("|"); } - printf(" %02x", b_k_n(ik,in)); + printf(" %02x", *(reinterpret_cast(&quant_b_k_n(ik,in)))); } printf("\n"); } - +#endif +#if 0 printf("Matrix Scale:\n"); + for(int in = 0; in < N; in++) + { + for(int ik = 0; ik < 1; ik++) + { + if(ik % 16 == 0) + { + printf("|"); + } + + printf(" %04x", *(reinterpret_cast(&scale_k_n(ik, in)))); + } + 
printf("\n"); + } +#endif +#if 0 + printf("Matrix B:\n"); for (int in = 0; in < N; in++) { for (int ik = 0; ik < K; ik++) @@ -102,12 +123,12 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) printf("|"); } - printf(" %04x", *(reinterpret_cast(&scale_k_n(ik,in)))); + printf(" %02x", b_k_n(ik,in)); } printf("\n"); } - #endif - +#endif + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); @@ -191,8 +212,13 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); - auto ref_argument = ref_gemm.MakeArgument( - a_m_k, b_k_n, scale_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + auto ref_argument = ref_gemm.MakeArgument(a_m_k, + quant_b_k_n, + scale_k_n, + c_m_n_host_result, + a_element_op, + b_element_op, + c_element_op); ref_invoker.Run(ref_argument); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp index cfd49668597..981fa70a69b 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -309,8 +309,10 @@ struct Blockwise_fpAintB_GemmWMMA b_thread_desc_.GetElementSpaceSize()); auto scale_thread_buf = make_static_buffer( scale_thread_desc_.GetElementSpaceSize()); - auto converted_b_thread_buf = make_static_buffer( - b_thread_desc_.GetElementSpaceSize()); + // auto converted_b_thread_buf = make_static_buffer( + // b_thread_desc_.GetElementSpaceSize()); + tensor_operation::element_wise::FastNumericArrayConverter + fast_numeric_converter; // basic intrinsic to determine loopover direction if constexpr(MRepeat < NRepeat) @@ -345,15 +347,29 @@ struct Blockwise_fpAintB_GemmWMMA make_tuple(I0, n0, I0, I0, I0, 
I0), scale_thread_buf); - // convert B from int8 to fp16, multiply scale - static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) { - converted_b_thread_buf(i) = - scale_thread_buf[i / WmmaK] * - type_convert(b_thread_buf[i]); + vector_type b_int_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + b_int_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + // convert B from uint8 to fp16, multiply scale + b_thread_vec = fast_numeric_converter(b_int_vec); + static_for<0, WmmaK, 1>{}([&](auto i) { + b_thread_vec.template AsType()(i) = + scale_thread_buf[n0] * + b_thread_vec.template AsType()(i); }); vector_type a_thread_vec; - vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { a_thread_vec.template AsType()(i) = @@ -364,14 +380,6 @@ struct Blockwise_fpAintB_GemmWMMA (i / A_K1) % A_KRow, 0, i % A_K1))>{}]; - b_thread_vec.template AsType()(i) = - converted_b_thread_buf[Number{}]; }); using wmma_input_type_a = typename vector_type::type; @@ -390,37 +398,48 @@ struct Blockwise_fpAintB_GemmWMMA } else { - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read weight scale - scale_thread_copy_.Run( - scale_block_desc_1_n0_n1_n2_1, - make_tuple(I0, n0, I0, I0, I0, I0), - scale_block_buf, - scale_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - scale_thread_buf); + static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of + // k=0,kpack*1, .. + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read weight scale + scale_thread_copy_.Run(scale_block_desc_1_n0_n1_n2_1, + make_tuple(I0, n0, I0, I0, I0, I0), + scale_block_buf, + scale_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + scale_thread_buf); #if 0 printf("Tid: %03d, n: %02d, scale_thread_buf: %04x\n", get_thread_local_1d_id(), n0.value, *(reinterpret_cast(&scale_thread_buf[n0])) ); #endif - static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of - // k=0,kpack*1, .. 
- // read B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - b_thread_buf); - // convert B from int8 to fp16, multiply scale - static_for<0, b_thread_buf.Size(), 1>{}([&](auto i) { - converted_b_thread_buf(i) = scale_thread_buf[i / WmmaK] * - type_convert(b_thread_buf[i]); // call byte permute - }); + // read B + b_thread_copy_.Run( + b_block_desc_k0_n0_n1_n2_k1, + make_tuple(Number{}, n0, I0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, n0, I0, I0, I0, I0), + b_thread_buf); + + vector_type b_int_vec; + vector_type b_thread_vec; + + static_for<0, WmmaK, 1>{}([&](auto i) { + b_int_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + // convert B from uint8 to fp16, multiply scale + b_thread_vec = fast_numeric_converter(b_int_vec); + static_for<0, WmmaK, 1>{}([&](auto i) { + b_thread_vec.template AsType()(i) = + scale_thread_buf[n0] * b_thread_vec.template AsType()(i); + }); + + static_for<0, MRepeat, 1>{}([&](auto m0) { // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, @@ -429,7 +448,8 @@ struct Blockwise_fpAintB_GemmWMMA a_thread_desc_, make_tuple(I0, m0, I0, I0, I0, I0), a_thread_buf); - if (true){ + if(true) + { #if 0 printf("Tid: %03d, m, n, k: %02d, %02d, %02d, a_thread_buf: %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x|\n", get_thread_local_1d_id(), m0.value, n0.value, k.value, @@ -495,17 +515,8 @@ struct Blockwise_fpAintB_GemmWMMA #endif } vector_type a_thread_vec; - vector_type b_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { - b_thread_vec.template AsType()(i) = - converted_b_thread_buf[Number{}]; a_thread_vec.template AsType()(i) = a_thread_buf[Number{}, Number<1>{})); - static constexpr auto scale_thread_desc_ = - make_naive_tensor_descriptor(make_tuple(Number{}, - Number{}, - I1, - Number{}, - I1, - I1), - make_tuple(I0, I1, I0, I0, I0, I0)); + static 
constexpr auto scale_thread_desc_ = make_naive_tensor_descriptor( + make_tuple( + Number{}, Number{}, I1, Number{}, I1, I1), + make_tuple(I0, I1, I0, I0, I0, I0)); // C[M, N, NumRegWMMA] static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index cb6678e391b..64aaaf034c2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -95,8 +95,9 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB 1); static constexpr auto BEnableLds = BEnableLds_auto || BEnableLds_manu || (NumPrefetch > 1); diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index c3e7706ef3f..57aa8638a31 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -6,6 +6,7 @@ #include "ck/utility/data_type.hpp" #include "ck/utility/math.hpp" #include "ck/utility/math_v2.hpp" +#include "ck/utility/get_id.hpp" namespace ck { namespace tensor_operation { @@ -68,6 +69,12 @@ struct PassThrough y = x; } + template <> + __host__ __device__ void operator()(uint8_t& y, const uint8_t& x) const + { + y = x; + } + template <> __host__ __device__ void operator()(int8_t& y, const int32_t& x) const { @@ -371,6 +378,90 @@ struct Swish float beta_ = 1.0f; }; +// support fastconvert of int8 to fp16 + +template +struct FastNumericArrayConverter +{ +}; + +template <> +struct FastNumericArrayConverter +{ + using InputArray = vector_type; + using OutputArray = vector_type; + + __device__ static OutputArray convert(InputArray const& Input) + { + OutputArray Output; + + uint32_t* half_2 = 
reinterpret_cast(&Output); + uint32_t const uint8_4 = reinterpret_cast(Input); + + // printf("Tid: %03d, uint8_4: %08x\n", + // get_thread_local_1d_id(), + // uint8_4); + + static constexpr uint32_t byte_selector_01 = 0x05010500; + static constexpr uint32_t byte_selector_23 = 0x05030502; + static constexpr uint32_t fp16_adder = 0x64646464; + half_2[0] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01); + half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23); + + // printf("Tid: %03d, Part1 converted: %08x | %08x\n", + // get_thread_local_1d_id(), + // half_2[Number<0>{}], + // half_2[Number<1>{}]); + + // Lastly, we subtract 1152 from our constructed number using fp16 math to get our signed + // integer as fp16. + static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; + asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" + : "=v"(half_2[0]) + : "v"(half_2[0]), "s"(I8s_TO_F16s_MAGIC_NUM)); + asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" + : "=v"(half_2[1]) + : "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM)); + // printf("Tid: %03d, Part2 converted: %08x | %08x\n", + // get_thread_local_1d_id(), + // half_2[Number<0>{}], + // half_2[Number<1>{}]); + return Output; + } + + __device__ OutputArray operator()(InputArray const& Input) { return convert(Input); } +}; + +template +struct FastNumericArrayConverter +{ + static constexpr int VEC_WIDTH = 4; + static_assert(!(N % VEC_WIDTH), "N must be multiple of 4."); + + using InputArray = vector_type; + using OutputArray = vector_type; + + __device__ static OutputArray convert(InputArray const& Input) + { + FastNumericArrayConverter converter; + + OutputArray Output; + + using Vec_InputArray = vector_type; + using Vec_OutputArray = vector_type; + + Vec_OutputArray* half_4_ptr = reinterpret_cast(&Output); + Vec_InputArray const* uint8_4_ptr = reinterpret_cast(&Input); + + static_for<0, N / VEC_WIDTH, 1>{}( + [&](auto i) { half_4_ptr[i] = 
converter(uint8_4_ptr[i]); }); + + return Output; + } + + __device__ OutputArray operator()(InputArray const& Input) { return convert(Input); } +}; + } // namespace element_wise } // namespace tensor_operation } // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index f205b3a18f2..8010550e040 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -52,11 +52,13 @@ __global__ void #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ defined(__gfx1102__)) __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size]; - if (false && get_thread_local_1d_id()==0){ + if(false && get_thread_local_1d_id() == 0) + { printf("lds_size: %lu\n", GridwiseGemm::SharedMemTrait::lds_size); printf("lds_a_size: %d\n", GridwiseGemm::SharedMemTrait::a_block_space_size_aligned); printf("lds_b_size: %d\n", GridwiseGemm::SharedMemTrait::b_block_space_size_aligned); - printf("lds_scale_size: %d\n", GridwiseGemm::SharedMemTrait::scale_block_space_size_aligned); + printf("lds_scale_size: %d\n", + GridwiseGemm::SharedMemTrait::scale_block_space_size_aligned); } GridwiseGemm::template Run(p_a_grid, @@ -459,17 +461,12 @@ struct GridwiseFpAintBGemm_Wmma // Workaround, Freeze transform return make_naive_tensor_descriptor(make_tuple(Number{}, - Number{}, - I1, - Number{}, - I1, - Number{}), - make_tuple(I0, - I1, - I0, - I0, - I0, - I0)); + Number{}, + I1, + Number{}, + I1, + Number{}), + make_tuple(I0, I1, I0, I0, I0, I0)); } }(); @@ -642,10 +639,12 @@ struct GridwiseFpAintBGemm_Wmma : 0; static constexpr auto a_block_space_offset = 0; - static constexpr auto b_block_space_offset = - (a_block_space_offset + a_block_space_size_aligned) * sizeof(ADataType)/sizeof(BDataType); + static constexpr auto b_block_space_offset = + (a_block_space_offset 
+ a_block_space_size_aligned) * sizeof(ADataType) / + sizeof(BDataType); static constexpr auto scale_block_space_offset = - (b_block_space_offset + b_block_space_size_aligned) * sizeof(BDataType)/sizeof(ScaleDataType); + (b_block_space_offset + b_block_space_size_aligned) * sizeof(BDataType) / + sizeof(ScaleDataType); // LDS allocation for C shuffle in LDS static constexpr auto c_shuffle_block_space_size = diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 3a04213a9a2..0ff11a531f8 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -719,7 +719,8 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, false> a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); b_blockwise_copy.Run( b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf); - scale_blockwise_copy.Run(scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf); + scale_blockwise_copy.Run( + scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf); a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp index 3832b522ef4..5f350c98564 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -1145,7 +1145,7 @@ struct ThreadwiseTensorSliceTransfer_v4 src_desc, src_data_coord); #if 0 printf("Tid: %03d, LDS read offset: %d\n", get_thread_local_1d_id(), src_data_coord.GetOffset()); -#endif +#endif // copy data from src_buf into src_tmp_vector if 
constexpr(SrcBuffer::IsDynamicBuffer()) { @@ -1419,7 +1419,7 @@ struct ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow 1, 0); v_theother_row = type_convert_sp(temp); - + if(get_thread_local_1d_id() % 32 < 16) { // apply type convert diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp index 78f25091eac..096e93bf202 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -207,10 +207,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1 // copy data from src_buf into src_vector_container auto src_vector_container = src_vector_type{ src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; - if (false){ + if(false) + { printf("Tid: %03d, a_grid_buf: %04x\n", - get_thread_local_1d_id(), - *(reinterpret_cast(&src_vector_container.template AsType()[Number<0>{}]))); + get_thread_local_1d_id(), + *(reinterpret_cast( + &src_vector_container.template AsType()[Number<0>{}]))); } // copy data from src_vector_container into src_thread_scratch_ src_thread_scratch_tuple_(thread_scratch_id) diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index 38ee76d8836..f9bb7d0fa2c 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -312,7 +312,8 @@ __device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_w (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), 
"wrong! not implemented"); if constexpr(is_same::value) @@ -614,6 +615,114 @@ __device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_w static_cast(coherence)); return bit_cast(tmp); +#endif + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + } + else if constexpr(N == 2) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + return llvm_amdgcn_raw_buffer_load_i8x2(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); +#else + int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + + return bit_cast(tmp); +#endif + } + else if constexpr(N == 4) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + return llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); +#else + int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + + return bit_cast(tmp); +#endif + } + else if constexpr(N == 8) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + vector_type tmp; + + tmp.AsType()(Number<0>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int8_t), + static_cast(coherence)); + + return tmp.AsType()(Number<0>{}); +#else + int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + + return bit_cast(tmp); 
+#endif + } + else if constexpr(N == 16) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + vector_type tmp; + + tmp.AsType()(Number<0>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int8_t), + static_cast(coherence)); + + tmp.AsType()(Number<2>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 8 * sizeof(int8_t), + static_cast(coherence)); + + tmp.AsType()(Number<3>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 12 * sizeof(int8_t), + static_cast(coherence)); + + return tmp.AsType()(Number<0>{}); +#else + int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset, + static_cast(coherence)); + + return bit_cast(tmp); #endif } } diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index 0e07c20ae55..0c09d74428e 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -133,6 +133,13 @@ struct scalar_type static constexpr index_t vector_size = 1; }; +template <> +struct scalar_type +{ + using type = uint8_t; + static constexpr index_t vector_size = 1; +}; + #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 template <> struct scalar_type @@ -944,6 +951,15 @@ using int8x16_t = typename vector_type::type; using int8x32_t = typename vector_type::type; using int8x64_t = typename vector_type::type; +// u8 +// i8 +using uint8x2_t = typename vector_type::type; +using uint8x4_t = typename vector_type::type; +using uint8x8_t = typename vector_type::type; +using uint8x16_t = typename vector_type::type; +using uint8x32_t = typename vector_type::type; +using 
uint8x64_t = typename vector_type::type; + // Convert X to Y template __host__ __device__ constexpr Y type_convert(X x) diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index 2ddbb6440d8..3a09d6038a4 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,2 @@ -# find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From b5083bfef4a1f7600c8c30030e677ad79b07d2fb Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 3 Aug 2023 02:01:44 +0000 Subject: [PATCH 101/118] Fp16AInt8B_GEMM sanity --- .../49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 8 +- example/49_fpAintB_gemm/run_gemm_example.inc | 62 ---------------- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 74 +------------------ .../element/unary_element_wise_operation.hpp | 16 +--- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 11 +-- .../threadwise_tensor_slice_transfer.hpp | 4 +- .../threadwise_tensor_slice_transfer_v3r1.hpp | 12 +-- script/clang-format-overwrite.sh | 4 +- 8 files changed, 14 insertions(+), 177 deletions(-) diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp 
b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp index e8776a94bcd..138c8f1f86a 100644 --- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -14,6 +14,8 @@ // The DeviceOp is CDataType = ADataType * Dequant(BDataType) * ScaleDataType // The HostRef is CDataType = ADataType * Dequant(QuantDataType) * ScaleDataType +//TODO: Current implementation consume more VGPR than expected. + using ADataType = ck::half_t; using QuantDataType = int8_t; using BDataType = uint8_t; @@ -49,13 +51,13 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceFpAintBGemm_Wmma_ GemmDefault, 1, // Prefetch stage 128, // BlockSize - 128, // MPerBlock - 128, // NPerBlock + 64, // MPerBlock + 128, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave S<4, 32, 1>, S<1, 0, 2>, diff --git a/example/49_fpAintB_gemm/run_gemm_example.inc b/example/49_fpAintB_gemm/run_gemm_example.inc index 5aca18fd5cc..87c8d6a70a1 100644 --- a/example/49_fpAintB_gemm/run_gemm_example.inc +++ b/example/49_fpAintB_gemm/run_gemm_example.inc @@ -67,68 +67,6 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) UnsignedWeightPreprocessor preprocessor; Tensor b_k_n = preprocessor(quant_b_k_n); -#if 0 - printf("Matrix A:\n"); - for (int im = 0; im < M; im++) - { - for (int ik = 0; ik < K; ik++) - { - if(ik % 16 == 0){ - printf("|"); - } - - printf(" %04x", *(reinterpret_cast(&a_m_k(im,ik)))); - } - printf("\n"); - } -#endif -#if 0 - printf("Matrix QuantB:\n"); - for (int in = 0; in < N; in++) - { - for (int ik = 0; ik < K; ik++) - { - if(ik % 16 == 0){ - printf("|"); - } - - printf(" %02x", *(reinterpret_cast(&quant_b_k_n(ik,in)))); - } - printf("\n"); - } -#endif -#if 0 - printf("Matrix Scale:\n"); - for(int in = 0; in < N; in++) - { - for(int ik = 0; ik < 1; ik++) - { 
- if(ik % 16 == 0) - { - printf("|"); - } - - printf(" %04x", *(reinterpret_cast(&scale_k_n(ik, in)))); - } - printf("\n"); - } -#endif -#if 0 - printf("Matrix B:\n"); - for (int in = 0; in < N; in++) - { - for (int ik = 0; ik < K; ik++) - { - if(ik % 16 == 0){ - printf("|"); - } - - printf(" %02x", b_k_n(ik,in)); - } - printf("\n"); - } -#endif - Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp index 981fa70a69b..7aab2c77c2b 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -408,12 +408,7 @@ struct Blockwise_fpAintB_GemmWMMA scale_thread_desc_, make_tuple(I0, n0, I0, I0, I0, I0), scale_thread_buf); -#if 0 - printf("Tid: %03d, n: %02d, scale_thread_buf: %04x\n", - get_thread_local_1d_id(), n0.value, - *(reinterpret_cast(&scale_thread_buf[n0])) - ); -#endif + // read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, @@ -448,72 +443,7 @@ struct Blockwise_fpAintB_GemmWMMA a_thread_desc_, make_tuple(I0, m0, I0, I0, I0, I0), a_thread_buf); - if(true) - { -#if 0 - printf("Tid: %03d, m, n, k: %02d, %02d, %02d, a_thread_buf: %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x|\n", - get_thread_local_1d_id(), m0.value, n0.value, k.value, - *(reinterpret_cast(&a_thread_buf[Number<0>{}])), - *(reinterpret_cast(&a_thread_buf[Number<1>{}])), - *(reinterpret_cast(&a_thread_buf[Number<2>{}])), - *(reinterpret_cast(&a_thread_buf[Number<3>{}])), - *(reinterpret_cast(&a_thread_buf[Number<4>{}])), - *(reinterpret_cast(&a_thread_buf[Number<5>{}])), - *(reinterpret_cast(&a_thread_buf[Number<6>{}])), - *(reinterpret_cast(&a_thread_buf[Number<7>{}])), - 
*(reinterpret_cast(&a_thread_buf[Number<8>{}])), - *(reinterpret_cast(&a_thread_buf[Number<9>{}])), - *(reinterpret_cast(&a_thread_buf[Number<10>{}])), - *(reinterpret_cast(&a_thread_buf[Number<11>{}])), - *(reinterpret_cast(&a_thread_buf[Number<12>{}])), - *(reinterpret_cast(&a_thread_buf[Number<13>{}])), - *(reinterpret_cast(&a_thread_buf[Number<14>{}])), - *(reinterpret_cast(&a_thread_buf[Number<15>{}])) - ); -#endif -#if 0 - printf("Tid: %03d, m, n, k: %02d, %02d, %02d, b_thread_buf: %02x %02x %02x %02x| %02x %02x %02x %02x| %02x %02x %02x %02x| %02x %02x %02x %02x|\n", - get_thread_local_1d_id(), m0.value, n0.value, k.value, - b_thread_buf[Number<0>{}], - b_thread_buf[Number<1>{}], - b_thread_buf[Number<2>{}], - b_thread_buf[Number<3>{}], - b_thread_buf[Number<4>{}], - b_thread_buf[Number<5>{}], - b_thread_buf[Number<6>{}], - b_thread_buf[Number<7>{}], - b_thread_buf[Number<8>{}], - b_thread_buf[Number<9>{}], - b_thread_buf[Number<10>{}], - b_thread_buf[Number<11>{}], - b_thread_buf[Number<12>{}], - b_thread_buf[Number<13>{}], - b_thread_buf[Number<14>{}], - b_thread_buf[Number<15>{}] - ); -#endif -#if 0 - printf("Tid: %03d, m, n, k: %02d, %02d, %02d, converted_b_thread_buf: %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x| %04x %04x %04x %04x|\n", - get_thread_local_1d_id(), m0.value, n0.value, k.value, - *(reinterpret_cast(&converted_b_thread_buf[Number<0>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<1>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<2>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<3>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<4>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<5>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<6>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<7>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<8>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<9>{}])), - 
*(reinterpret_cast(&converted_b_thread_buf[Number<10>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<11>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<12>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<13>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<14>{}])), - *(reinterpret_cast(&converted_b_thread_buf[Number<15>{}])) - ); -#endif - } + vector_type a_thread_vec; static_for<0, WmmaK, 1>{}([&](auto i) { diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 57aa8638a31..28d60e3ca90 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -398,23 +398,12 @@ struct FastNumericArrayConverter uint32_t* half_2 = reinterpret_cast(&Output); uint32_t const uint8_4 = reinterpret_cast(Input); - // printf("Tid: %03d, uint8_4: %08x\n", - // get_thread_local_1d_id(), - // uint8_4); - static constexpr uint32_t byte_selector_01 = 0x05010500; static constexpr uint32_t byte_selector_23 = 0x05030502; static constexpr uint32_t fp16_adder = 0x64646464; half_2[0] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01); half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23); - // printf("Tid: %03d, Part1 converted: %08x | %08x\n", - // get_thread_local_1d_id(), - // half_2[Number<0>{}], - // half_2[Number<1>{}]); - - // Lastly, we subtract 1152 from our constructed number using fp16 math to get our signed - // integer as fp16. 
static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" : "=v"(half_2[0]) @@ -422,10 +411,7 @@ struct FastNumericArrayConverter asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" : "=v"(half_2[1]) : "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM)); - // printf("Tid: %03d, Part2 converted: %08x | %08x\n", - // get_thread_local_1d_id(), - // half_2[Number<0>{}], - // half_2[Number<1>{}]); + return Output; } diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index 8010550e040..b44f8d0e0eb 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -52,14 +52,6 @@ __global__ void #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ defined(__gfx1102__)) __shared__ char p_shared[GridwiseGemm::SharedMemTrait::lds_size]; - if(false && get_thread_local_1d_id() == 0) - { - printf("lds_size: %lu\n", GridwiseGemm::SharedMemTrait::lds_size); - printf("lds_a_size: %d\n", GridwiseGemm::SharedMemTrait::a_block_space_size_aligned); - printf("lds_b_size: %d\n", GridwiseGemm::SharedMemTrait::b_block_space_size_aligned); - printf("lds_scale_size: %d\n", - GridwiseGemm::SharedMemTrait::scale_block_space_size_aligned); - } GridwiseGemm::template Run(p_a_grid, p_b_grid, @@ -805,7 +797,7 @@ struct GridwiseFpAintBGemm_Wmma auto b_block_buf = make_dynamic_buffer( static_cast(p_shared) + SharedMemTrait::b_block_space_offset, SharedMemTrait::b_block_space_size_aligned); - // printf("b_lds_offset: %lu\n", SharedMemTrait::b_block_space_offset); + auto b_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1( static_cast(p_shared) + SharedMemTrait::scale_block_space_offset, SharedMemTrait::scale_block_space_size_aligned); - // printf("scale_lds_offset: %lu\n", 
SharedMemTrait::scale_block_space_offset); auto scale_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1(src_coord_.GetOffset(), is_src_valid)}; - if(false) - { - printf("Tid: %03d, a_grid_buf: %04x\n", - get_thread_local_1d_id(), - *(reinterpret_cast( - &src_vector_container.template AsType()[Number<0>{}]))); - } + // copy data from src_vector_container into src_thread_scratch_ src_thread_scratch_tuple_(thread_scratch_id) .template SetAsType( @@ -448,9 +442,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 const bool is_dst_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); -#if 0 - printf("Tid: %03d, LDS write offset: %d\n", get_thread_local_1d_id(), dst_coord_.GetOffset()); -#endif + using dst_vector_type = vector_type_maker_t; using dst_vector_t = typename dst_vector_type::type; diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index 3a09d6038a4..2ddbb6440d8 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,2 +1,2 @@ -find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -# git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +# find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' From 73e475d8ca49347e202bd7593f619ea0242d5926 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 7 Aug 2023 10:59:22 +0000 Subject: [PATCH 102/118] MQA implementation --- .../CMakeLists.txt | 1 + ...ulti_query_attention_forward_wmma_fp16.cpp | 288 ++++ ...run_multi_query_attention_forward_wmma.inc | 339 +++++ .../49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 2 +- ...ice_multi_query_attention_forward_wmma.hpp | 1247 +++++++++++++++++ .../cpu/reference_batched_gemm.hpp | 122 ++ 6 files changed, 1998 insertions(+), 1 deletion(-) create mode 100644 example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index c6c6fc3209e..a88a3503144 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -10,6 +10,7 @@ if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp) add_example_executable(example_cross_attention_forward_wmma_fp16 
cross_attention_forward_wmma_fp16.cpp) + add_example_executable(example_multi_query_attention_forward_wmma_fp16 multi_query_attention_forward_wmma_fp16.cpp) endif() add_custom_target(example_gemm_scale_softmax_gemm) diff --git a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp new file mode 100644 index 00000000000..43feea12fb4 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g_k_l) * B1_g_l_n + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using Acc0DataType = F32; +using Acc1DataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using 
Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +// #define CK_MHA_USE_WAVE_1 +// #define CK_MHA_USE_WAVE_2 +// #define CK_MHA_USE_WAVE_4 +#define CK_MHA_USE_WAVE_8 +using DeviceMHAFactory = + std::tuple< +#ifdef CK_MHA_USE_WAVE_1 + // 1 wave, mrepeat = 1, nrepeat = 2, k/o repeat = 1~5 + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + 
S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_2 + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + 
ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_4 + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, 
+ AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 128, + // Gemm 0 + 64, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_8 + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceMultiQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m 
= wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec> +#endif + >; +// clang-format on +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm_MQA; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm_MQA; + +#include "run_multi_query_attention_forward_wmma.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc new file mode 100644 index 00000000000..5a069d79576 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape for A/B0/B1/C + // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o + ck::index_t M = 120; + ck::index_t N = 1000; + ck::index_t K = 64; + ck::index_t O = 128; + + // Output shape C[G0, M, G1, O]. 
Batch dim, outer dim, inner dim must match GEMM shape + // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) + // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t G0 = 7; + ck::index_t G1 = 13; + ck::index_t KV_head = 1; + + float alpha = 1; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + G0 = std::stoi(argv[8]); + G1 = std::stoi(argv[9]); + + alpha = std::stof(argv[10]); + + input_permute = std::stoi(argv[11]); + output_permute = std::stoi(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 11: M, N, K, O, G0, G1\n"); + printf("arg10: scale (alpha)\n"); + printf("arg11 to 12: input / output permute\n"); + exit(0); + } + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, KV_head, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? std::vector{N * KV_head * K, K, KV_head * K, 1} + // B0 layout [G0, N, G1, K] + : std::vector{KV_head * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, KV_head, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ? 
std::vector{N * KV_head * O, O, 1, KV_head * O} + // B1 layout [G0, N, G1, O] + : std::vector{KV_head * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + case 4: // A, B0, B1 1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 5: // Rand: b1 b0; unit: a + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + 
b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 6: // Rand: a b0 ; unit: B1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 7: // Rand: a b1 ; unit: b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 8: // Rand: a ; unit: b0 b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 9: // Rand: b0 ; unit: a b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 10: // Rand: b1 ; unit: a b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data()); + b1_device_buf.ToDevice(b1_gs_os_ns.mData.data()); + + auto a_element_op = AElementOp{}; + auto 
b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + float best_perf = .0; + float best_time = .0; + int not_pass = 0; + std::string best_kernel = ""; + printf("Verification: %s\n", do_verification ? "ON" : "OFF"); + // TODO ANT: replace array with vector? + ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { + const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + + using DeviceMHAInstance = ck::remove_cvref_t; + auto gemm = DeviceMHAInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + G0, + G1, + alpha, + input_permute, + output_permute); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + // return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(CDataType) * M * O) * G0 * G1 + + (sizeof(B0DataType) * K * N + sizeof(B1DataType) * N * O) * G0; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + if(tflops > best_perf) + { + best_perf = tflops; + best_time = ave_time * 1000; + best_kernel = gemm.GetTypeString(); + } + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g0_g1_m_k({G0, G1, M, K}); + Tensor b0_g0_1_k_n({G0, 1, K, N}); + Tensor b1_g0_1_n_o({G0, 1, N, O}); + Tensor 
acc0_g0_g1_m_n({G0, G1, M, N}); // scratch object after gemm0 + Tensor a1_g0_g1_m_n({G0, G1, M, N}); // scratch object after softmax + Tensor c_g0_g1_m_o_host_result({G0, G1, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g0_g1_m_k(idx[0], idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g0_1_k_n(idx[0], idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g0_1_n_o(idx[0], idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument(a_g0_g1_m_k, + b0_g0_1_k_n, + acc0_g0_g1_m_n, + a_element_op, + b0_element_op, + acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + acc0_g0_g1_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[2], idx[3])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = + ref_softmax.MakeArgument(acc0_g0_g1_m_n, a1_g0_g1_m_n, 1, 0, {3}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g0_g1_m_n, + b1_g0_1_n_o, + c_g0_g1_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach( + [&](auto& self, auto idx) { self(idx) = c_g0_g1_m_o_host_result(idx); }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + 
if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol); + printf("Verification: %s, Pass: %s\n", + do_verification ? "ON" : "OFF", + this_run_verification ? "YES" : "NO"); + + if(!this_run_verification) + { + not_pass = 1; + printf("%d th MQA instance verification Failed \n", i.value); + } + } + }); + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time + << " us" << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + return not_pass; +} diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp index 138c8f1f86a..a3b7554e620 100644 --- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -14,7 +14,7 @@ // The DeviceOp is CDataType = ADataType * Dequant(BDataType) * ScaleDataType // The HostRef is CDataType = ADataType * Dequant(QuantDataType) * ScaleDataType -//TODO: Current implementation consume more VGPR than expected. +// TODO: Current implementation consume more VGPR than expected. 
using ADataType = ck::half_t; using QuantDataType = int8_t; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp new file mode 100644 index 00000000000..2fd7147c58f --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp @@ -0,0 +1,1247 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp" +#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Multi-Query Attention (MQA) kernel implementation +// Assume number of head of K,V is 1. 
+// Q [G0, G1, M, K] * K [G0, 1, K, N] = P [G0, G1, M, N] +// P [G0, G1, M, N] * V [G0, 1, N, O] = Out [G0, G1, M, O] +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_multi_query_attention_wmma(const ADataType* __restrict__ p_a_grid, + const B0DataType* __restrict__ p_b0_grid, + const B1DataType* __restrict__ p_b1_grid, + CDataType* __restrict__ p_c_grid, + index_t M, // SequenceQ + index_t N, // SequenceK + index_t K, // HeadDim + index_t O, // SequenceK + index_t G0, // Batch + index_t G1, // HeadNum + float alpha, + bool input_permute, + bool output_permute) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) + + // clang-format off +// *************************************************** + const auto q_head = G1; + const auto kv_head = 1; +// Make Tensor Descriptors + constexpr index_t array_size = 4; + std::array a_gs_ms_ks_lengths{G0, q_head, M, K}; + std::array a_gs_ms_ks_strides = + input_permute + ? std::array{M * q_head * K, K, q_head * K, 1} // A layout [G0, M, G1, K] + : std::array{q_head * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::array b0_gs_ns_ks_lengths{G0, kv_head, N, K}; + std::array b0_gs_ns_ks_strides = + input_permute + ? std::array{N * kv_head * K, K, kv_head * K, 1} // B0 layout [G0, N, 1, K] + : std::array{kv_head * N * K, N * K, K, 1}; // B0 layout [G0, 1, N, K] + + std::array b1_gs_os_ns_lengths{G0, kv_head, O, N}; + std::array b1_gs_os_ns_strides = + input_permute + ? std::array{N * kv_head * O, O, 1, kv_head * O} // B1 layout [G0, N, 1, O] + : std::array{kv_head * N * O, N * O, 1, O}; // B1 layout [G0, 1, N, O] + + std::array c_gs_ms_os_lengths{G0, q_head, M, O}; + std::array c_gs_ms_os_strides = + output_permute + ? 
std::array{M * q_head * O, O, q_head * O, 1} // C layout [G0, M, G1, O] + : std::array{q_head * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + const auto a_element_op = AElementwiseOperation{}; + const auto b0_element_op = B0ElementwiseOperation{}; + const auto acc0_element_op = AccElementwiseOperation{alpha}; + const auto b1_element_op = B1ElementwiseOperation{}; + const auto c_element_op = CElementwiseOperation{}; + // fail to reuse DeviceOp::MakeArgument() because of the __device__ function required. + + const auto a_grid_desc = DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n); + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto a_grid_desc_g_m_k = + DeviceOp::Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc_g_l_k = + DeviceOp::Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc_g_n_l = + DeviceOp::Transform::MakeB1GridDescriptor_G_N_K(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto compute_base_ptr_of_batch = + typename DeviceOp::ComputeBasePtrOfStridedBatch{a_grid_desc_g_m_k, b0_grid_desc_g_l_k, b1_grid_desc_g_n_l, c_grid_desc_g_m_n}; + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + const auto c0_matrix_mask = typename 
DeviceOp::C0MatrixMask{b0_grid_desc_g_l_k.GetLength(Number<1>{})}; + + // clang-format on + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB0BasePtr(g_idx / G1))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx / G1))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseOp::template Run(p_a_grid + a_batch_offset, + p_b0_grid + b0_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc, + b0_grid_desc, + b1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op, + c0_matrix_mask, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b0_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = M; + ignore = N; + ignore = K; + ignore = O; + ignore = G0; + ignore = G1; + ignore = input_permute; + ignore = output_permute; +#endif // end of if (defined(__gfx1100__)) +} + +// Computes C = A * B0 * B1 +// MN = MK * KL * LN +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceMultiQueryAttentionForward_Wmma + : public DeviceBatchedGemmSoftmaxGemmPermute +{ + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimL > 0 && NumDimK > 0 && NumDimN > 0, + "Number of dimension must be greater than 0"); + + static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); + static constexpr index_t 
NumAcc1Bias = Acc1BiasDataType::Size(); + + // TODO ANT: implement bias combination + static_assert(NumAcc0Bias == 0 && NumAcc0Bias == 0, "Bias addition is unimplemented"); + + static constexpr index_t NumDimGemm0M = NumDimM; + static constexpr index_t NumDimGemm0N = NumDimL; + static constexpr index_t NumDimGemm0K = NumDimK; + static constexpr index_t NumDimGemm1M = NumDimM; + static constexpr index_t NumDimGemm1N = NumDimN; + static constexpr index_t NumDimGemm1K = NumDimL; + + using DeviceOp = DeviceMultiQueryAttentionForward_Wmma; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + + static constexpr auto WmmaK = 16; + + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + + static constexpr auto AEnableLds_auto = LWaves == 1 ? false : true; + static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true; + static constexpr auto B1EnableLds_auto = MWaves == 1 ? 
false : true; + + static constexpr auto AEnableLds_manu = false; + static constexpr auto B0EnableLds_manu = true; + static constexpr auto B1EnableLds_manu = true; + + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); + static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1); + static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1); + + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma< + Sequence, + Sequence, + GemmSpec, + ASpec, + B0Spec, + B1Spec, + CSpec>; + + __host__ __device__ static auto MakeAGridDescriptor( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) + { + if constexpr(AEnableLds) + { + return Transform::MakeAGridDescriptor_AK0_M_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}); + } + else + { + return Transform:: + MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AK0PerWmma_AKRow_MPerWmma_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, + a_gs_ms_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } + } + + __host__ __device__ static auto MakeB0GridDescriptor( + const std::array& b0_gs_ls_ks_lengths_vec, + const std::array& b0_gs_ls_ks_strides_vec) + { + if constexpr(B0EnableLds) + { + return Transform::MakeB0GridDescriptor_BK0_N_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}); + } + else + { + return Transform:: + MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BK0PerWmma_BKRow_LPerWmma_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } + } + + __host__ __device__ static auto MakeB1GridDescriptor( + const std::array& b1_gs_ns_ls_lengths_vec, + const std::array& b1_gs_ns_ls_strides_vec) + { + if 
constexpr(B1EnableLds) + { + return Transform::MakeB1GridDescriptor_BK0_N_BK1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}); + } + else + { + return Transform:: + MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves__BL0PerWmma_BLRow_NPerWmma_BL1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } + } + + using AGridDesc = decltype(MakeAGridDescriptor({}, {})); + using B0GridDesc = decltype(MakeB0GridDescriptor({}, {})); + using B1GridDesc = decltype(MakeB1GridDescriptor({}, {})); + using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); + using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); + using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); + + __host__ __device__ constexpr static auto make_MaskOutPredicate() + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + return MaskDisabledPredicate{}; + } + else if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + return MaskOutUpperTrianglePredicate{}; + } + } + using C0MatrixMask = C0MatrixMask_impl; + + struct ComputeBasePtrOfStridedBatch + { + __host__ __device__ ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, + const B0GridDesc_G_L_K& b0_grid_desc_g_l_k, + const B1GridDesc_G_N_L& b1_grid_desc_g_n_l, + const CGridDesc_G_M_N& c_grid_desc_g_m_n) + : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), + b0_grid_desc_g_l_k_(b0_grid_desc_g_l_k), + b1_grid_desc_g_n_l_(b1_grid_desc_g_n_l), + c_grid_desc_g_m_n_(c_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return 
a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const + { + return b0_grid_desc_g_l_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return b1_grid_desc_g_n_l_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + private: + AGridDesc_G_M_K a_grid_desc_g_m_k_; + B0GridDesc_G_L_K b0_grid_desc_g_l_k_; + B1GridDesc_G_N_L b1_grid_desc_g_n_l_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + }; + + // GridwiseOp + using GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma< + // DataType Family + ADataType, + B0DataType, + Acc0DataType, + B1DataType, + Acc1DataType, + CShuffleDataType, + CDataType, + // ElementwiseOp Family + AElementwiseOperation, + B0ElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + // InMemory Data Descriptor + AGridDesc, + B0GridDesc, + B1GridDesc, + CGridDesc_M_N, + // Tiling Family + MPerBlock, + LPerBlock, + KPerBlock, + AK1, + BK1, + NPerBlock, + LTilePerBlock, + L1, + MPerWmma, + LPerWmma, + NPerWmma, + MRepeat, + LRepeat, + NRepeat, + // ThreadCluster Family + BlockSize, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + true, + AEnableLds, + ABlockLdsAddExtraM, + B0BlockTransferThreadClusterLengths_K0_L_K1, + B0BlockTransferThreadClusterArrangeOrder, + B0BlockTransferSrcAccessOrder, + B0BlockTransferSrcVectorDim, + B0BlockTransferSrcScalarPerVector, + B0BlockTransferDstScalarPerVector_K1, + true, + B0EnableLds, + B0BlockLdsAddExtraL, + 
B1BlockTransferThreadClusterLengths_L0_N_L1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_L1, + false, + B1EnableLds, + B1BlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + Transform::matrix_padder.PadN, + MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle, + NumPrefetch, + LoopSched, + PipelineVer>; + + struct RawArg : public BaseArgument + { + RawArg(const ADataType* p_a_grid, + const B0DataType* p_b0_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) + : p_a_grid_{p_a_grid}, + p_b0_grid_{p_b0_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + M_{M}, + N_{N}, + K_{K}, + O_{O}, + G0_{G0}, + G1_{G1}, + alpha_{alpha}, + input_permute_{input_permute}, + output_permute_{output_permute} + { + } + // Pointers + const ADataType* p_a_grid_; + const B0DataType* p_b0_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // Raw Problem Size + index_t M_; + index_t N_; + index_t K_; + index_t O_; + index_t G0_; + index_t G1_; + float alpha_; + bool input_permute_; + bool output_permute_; + }; + + static auto MakeArgument(const ADataType* p_a, + const B0DataType* p_b0, + const B1DataType* p_b1, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) + { + return RawArg{ + p_a, p_b0, p_b1, p_c, M, N, K, O, G0, G1, alpha, input_permute, output_permute}; + } + + static bool IsSupportedArgument(const RawArg& arg) + { + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == 
"gfx1102") + { + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc0 Type err"); + return false; + } + + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc1 Type err"); + return false; + } + } + else + { + printf("DeviceOp: Arch err"); + return false; + } + + constexpr index_t array_size = 4; + ck::index_t G0 = arg.G0_; + ck::index_t G1 = arg.G1_; + ck::index_t M = arg.M_; + ck::index_t N = arg.N_; + ck::index_t K = arg.K_; + ck::index_t O = arg.O_; + bool input_permute = arg.input_permute_; + bool output_permute = arg.output_permute_; + + std::array a_gs_ms_ks_lengths{G0, G1, M, K}; + std::array a_gs_ms_ks_strides = + input_permute ? std::array{M * G1 * K, K, G1 * K, 1} + // A layout [G0, M, G1, K] + : std::array{ + G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::array b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::array b0_gs_ns_ks_strides = + input_permute ? std::array{N * G1 * K, K, G1 * K, 1} + // B0 layout [G0, N, G1, K] + : std::array{ + G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::array b1_gs_os_ns_lengths{G0, G1, O, N}; + std::array b1_gs_os_ns_strides = + input_permute ? std::array{N * G1 * O, O, 1, G1 * O} + // B1 layout [G0, N, G1, O] + : std::array{ + G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::array c_gs_ms_os_lengths{G0, G1, M, O}; + std::array c_gs_ms_os_strides = + output_permute ? 
std::array{M * G1 * O, O, G1 * O, 1} + // C layout [G0, M, G1, O] + : std::array{ + G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + const auto a_grid_desc = + DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + + if(!GridwiseOp::CheckValidity( + a_grid_desc, b0_grid_desc, b1_grid_desc, c_grid_desc_m_n, block_2_ctile_map)) + { + return false; + } + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = c_grid_desc_g_m_n.GetLength(I0); // unpadded + + if(!(c_g == batch_count)) + { + printf("DeviceOp: BatchCount err"); + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = M; + const auto LzRaw = N; + const auto KzRaw = K; + const auto NzRaw = O; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? 
LzRaw : NzRaw; + const auto c_extent_lowest = NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + printf("DeviceOp: Data Transfer Vector scalar err"); + return false; + } + + std::array a_mz_kz_strides_{ + a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}; + std::array b0_lz_kz_strides_{ + b0_gs_ns_ks_strides[NumDimG + NumDimL - 1], + b0_gs_ns_ks_strides[NumDimG + NumDimL + NumDimK - 1]}; + std::array b1_nz_lz_strides_{ + b1_gs_os_ns_strides[NumDimG + NumDimN - 1], + b1_gs_os_ns_strides[NumDimG + NumDimN + NumDimL - 1]}; + std::array c_mz_nz_strides_{ + c_gs_ms_os_strides[NumDimG + NumDimM - 1], + c_gs_ms_os_strides[NumDimG + NumDimM + NumDimN - 1]}; + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? a_mz_kz_strides_[1] : a_mz_kz_strides_[0]; + const auto b0_stride_lowest = + B0BlockTransferSrcVectorDim == 2 ? b0_lz_kz_strides_[1] : b0_lz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? 
b1_nz_lz_strides_[1] : b1_nz_lz_strides_[0]; + const auto c_stride_lowest = c_mz_nz_strides_[1]; + + if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + printf("DeviceOp: Data Vectorize transfer err"); + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // Argument + struct Argument : public BaseArgument + { + Argument( + const ADataType* p_a_grid, + const B0DataType* p_b0_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b0_gs_ls_ks_lengths, + const std::array& b0_gs_ls_ks_strides, + const std::array& b1_gs_ns_ls_lengths, + const std::array& b1_gs_ns_ls_strides, + const std::array& c_gs_ms_ns_lengths, + const std::array& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + const index_t M01, + const index_t N01, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b0_grid_{p_b0_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc{DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b0_grid_desc{ + DeviceOp::MakeB0GridDescriptor(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc{ + DeviceOp::MakeB1GridDescriptor(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + c_grid_desc_m_n_{ + Transform::MakeCGridDescriptor_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, + 
a_grid_desc_g_m_k_{ + Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b0_grid_desc_g_l_k_{ + Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc_g_n_l_{ + Transform::MakeB1GridDescriptor_G_N_K(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + c_grid_desc_g_m_n_{ + Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)}, + a_element_op_{a_element_op}, + b0_element_op_{b0_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op}, + c0_matrix_mask_{b0_grid_desc_g_l_k_.GetLength(I1)}, + raw_lengths_mz_lz_kz_nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], + b0_gs_ls_ks_lengths[NumDimG + NumDimL - 1], + b0_gs_ls_ks_lengths[NumDimG + NumDimL + NumDimK - 1], + b1_gs_ns_ls_lengths[NumDimG + NumDimN - 1]}, + a_mz_kz_strides_{a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, + b0_lz_kz_strides_{b0_gs_ls_ks_strides[NumDimG + NumDimL - 1], + b0_gs_ls_ks_strides[NumDimG + NumDimL + NumDimK - 1]}, + b1_nz_lz_strides_{b1_gs_ns_ls_strides[NumDimG + NumDimN - 1], + b1_gs_ns_ls_strides[NumDimG + NumDimN + NumDimL - 1]}, + c_mz_nz_strides_{c_gs_ms_ns_strides[NumDimG + NumDimM - 1], + c_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]}, + batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}, + compute_ptr_offset_of_batch_{ + a_grid_desc_g_m_k_, b0_grid_desc_g_l_k_, b1_grid_desc_g_n_l_, c_grid_desc_g_m_n_} + { + // TODO ANT: implement bias addition + ignore = p_acc0_biases; + ignore = p_acc1_biases; + ignore = acc0_biases_gs_ms_ls_lengths; + ignore = acc0_biases_gs_ms_ls_strides; + ignore = acc1_biases_gs_ms_ns_lengths; + ignore = acc1_biases_gs_ms_ns_strides; + + if(GridwiseOp::CheckValidity( + a_grid_desc, b0_grid_desc, b1_grid_desc, c_grid_desc_m_n_, 
block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + // Pointers + const ADataType* p_a_grid_; + const B0DataType* p_b0_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // Tensor Descriptors + AGridDesc a_grid_desc; + B0GridDesc b0_grid_desc; + B1GridDesc b1_grid_desc; + CGridDesc_M_N c_grid_desc_m_n_; + + AGridDesc_G_M_K a_grid_desc_g_m_k_; + B0GridDesc_G_L_K b0_grid_desc_g_l_k_; + B1GridDesc_G_N_L b1_grid_desc_g_n_l_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + + typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + + // Block to Tile mapping + typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map_; + + // ElementwiseOp + AElementwiseOperation a_element_op_; + B0ElementwiseOperation b0_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + + // check C0 masking and padding + C0MatrixMask c0_matrix_mask_; + + // Strides for the last M/N/K dimensions of A/B0/B1/C + // for sanity check of vector load/store + std::array raw_lengths_mz_lz_kz_nz_; + std::array a_mz_kz_strides_; + std::array b0_lz_kz_strides_; + std::array b1_nz_lz_strides_; + std::array c_mz_nz_strides_; + + index_t batch_count_; + // Batch Offset + ComputeBasePtrOfStridedBatch compute_ptr_offset_of_batch_; + }; + + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::RawArg; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto M0 = math::integer_divide_ceil(arg.M_, MPerBlock); + const auto N0 = math::integer_divide_ceil(arg.O_, NPerBlock); + + const index_t grid_size = arg.G0_ * arg.G1_ * M0 * N0; + const auto K = arg.K_; + // printf("HasKBlockLoop: %d\n", GridwiseOp::CalculateHasMainKBlockLoop(K)); + auto launch_kernel = [&](auto 
has_main_k_block_loop) { + const auto kernel = kernel_multi_query_attention_wmma; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b0_grid_, + arg.p_b1_grid_, + arg.p_c_grid_, + arg.M_, + arg.N_, + arg.K_, + arg.O_, + arg.G0_, + arg.G1_, + arg.alpha_, + arg.input_permute_, + arg.output_permute_); + }; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } +#if 0 + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == "gfx1102") + { + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc0 Type err"); + return false; + } + + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc1 Type err"); + return false; + } + } + else + { + printf("DeviceOp: Arch err"); + return false; + } + + if(!GridwiseOp::CheckValidity(arg.a_grid_desc, + arg.b0_grid_desc, + arg.b1_grid_desc, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + return false; + } + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded + + if(!(c_g == arg.batch_count_)) + { + printf("DeviceOp: BatchCount err"); + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; + const auto LzRaw = 
arg.raw_lengths_mz_lz_kz_nz_[1]; + const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; + const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw; + const auto c_extent_lowest = NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + printf("DeviceOp: Data Transfer Vector scalar err"); + return false; + } + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; + const auto b0_stride_lowest = + B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? 
arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; + const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; + + if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + printf("DeviceOp: Data Vectorize transfer err"); + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const ADataType* p_a, + const B0DataType* p_b0, + const B1DataType* p_b1, + CDataType* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b0_gs_ls_ks_lengths, + const std::array& b0_gs_ls_ks_strides, + const std::array& b1_gs_ns_ls_lengths, + const std::array& b1_gs_ns_ls_strides, + const std::array& c_gs_ms_ns_lengths, + const std::array& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b0, + p_b1, + p_c, + p_acc0_biases, + p_acc1_biases, + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ls_ks_lengths, + b0_gs_ls_ks_strides, + b1_gs_ns_ls_lengths, + b1_gs_ns_ls_strides, + c_gs_ms_ns_lengths, + c_gs_ms_ns_strides, + acc0_biases_gs_ms_ls_lengths, + acc0_biases_gs_ms_ls_strides, + acc1_biases_gs_ms_ns_lengths, + acc1_biases_gs_ms_ns_strides, + 1, + 1, + a_element_op, + b0_element_op, + acc_element_op, + b1_element_op, + c_element_op}; + } +#endif + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + 
const void* p_b0, + const void* p_b1, + void* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b0_gs_ls_ks_lengths, + const std::vector& b0_gs_ls_ks_strides, + const std::vector& b1_gs_ns_ls_lengths, + const std::vector& b1_gs_ns_ls_strides, + const std::vector& c_gs_ms_ns_lengths, + const std::vector& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + std::array a_lengths; + std::array a_strides; + std::array b0_lengths; + std::array b0_strides; + std::array b1_lengths; + std::array b1_strides; + std::array c_lengths; + std::array c_strides; + std::transform(a_gs_ms_ks_lengths.begin(), + a_gs_ms_ks_lengths.end(), + a_lengths.begin(), + [](index_t i) { return i; }); + std::transform(a_gs_ms_ks_strides.begin(), + a_gs_ms_ks_strides.end(), + a_strides.begin(), + [](index_t i) { return i; }); + std::transform(b0_gs_ls_ks_lengths.begin(), + b0_gs_ls_ks_lengths.end(), + b0_lengths.begin(), + [](index_t i) { return i; }); + std::transform(b0_gs_ls_ks_strides.begin(), + b0_gs_ls_ks_strides.end(), + b0_strides.begin(), + [](index_t i) { return i; }); + std::transform(b1_gs_ns_ls_lengths.begin(), + b1_gs_ns_ls_lengths.end(), + b1_lengths.begin(), + [](index_t i) { return i; }); + std::transform(b1_gs_ns_ls_strides.begin(), + b1_gs_ns_ls_strides.end(), + b1_strides.begin(), + [](index_t i) { return i; }); + std::transform(c_gs_ms_ns_lengths.begin(), + c_gs_ms_ns_lengths.end(), + c_lengths.begin(), + [](index_t i) { return i; 
}); + std::transform(c_gs_ms_ns_strides.begin(), + c_gs_ms_ns_strides.end(), + c_strides.begin(), + [](index_t i) { return i; }); + return std::make_unique(static_cast(p_a), + static_cast(p_b0), + static_cast(p_b1), + static_cast(p_c), + p_acc0_biases, + p_acc1_biases, + a_lengths, + a_strides, + b0_lengths, + b0_strides, + b1_lengths, + b1_strides, + c_lengths, + c_strides, + acc0_biases_gs_ms_ls_lengths, + acc0_biases_gs_ms_ls_strides, + acc1_biases_gs_ms_ns_lengths, + acc1_biases_gs_ms_ns_strides, + 1, + 1, + a_element_op, + b0_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceMultiQueryAttentionForward_Wmma" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << LPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << LTilePerBlock << ", " + << L1 << ", " + << getGemmSpecializationString(GemmSpec) << ", " + << "ASpec" << getTensorSpecializationString(ASpec) << ", " + << "B0Spec" << getTensorSpecializationString(B0Spec) << ", " + << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " + << "CSpec" << getTensorSpecializationString(CSpec) << ", " + << getMaskingSpecializationString(MaskingSpec) + << ">" + << " AEnableLds: " + << AEnableLds << ", " + << "B0EnableLds: " + << B0EnableLds << ", " + << "B1EnableLds: " + << B1EnableLds << ", " + << "NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << 
"PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp index a1b1e0d91b4..327cc9e28c6 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp @@ -133,6 +133,128 @@ struct ReferenceBatchedGemm : public device::BaseOperator } }; +template +struct ReferenceBatchedGemm_MQA : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_g0_g1_m_k, + const Tensor& b_g0_1_k_n, + Tensor& c_g0_g1_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_g0_g1_m_k_{a_g0_g1_m_k}, + b_g0_1_k_n_{b_g0_1_k_n}, + c_g0_g1_m_n_{c_g0_g1_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_g0_g1_m_k_; + const Tensor& b_g0_1_k_n_; + Tensor& c_g0_g1_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceBatchedGemm_MQA::Argument; + + float Run(const Argument& arg) + { + auto f_g0g1mk_g01kn_g0g1mn = [&](auto g0, auto g1, auto m, auto n) { + const int K = arg.a_g0_g1_m_k_.mDesc.GetLengths()[3]; + + AccDataType v_acc = 0; + + for(int k = 0; k < K; ++k) + { + ADataType v_a; + BDataType v_b; + + arg.a_element_op_(v_a, arg.a_g0_g1_m_k_(g0, g1, m, k)); + arg.b_element_op_(v_b, arg.b_g0_1_k_n_(g0, 0, k, n)); + + v_acc += + ck::type_convert(v_a) * ck::type_convert(v_b); + } + + 
AccDataType v_c; + + arg.c_element_op_(v_c, v_acc); + + arg.c_g0_g1_m_n_(g0, g1, m, n) = ck::type_convert(v_c); + }; + + make_ParallelTensorFunctor(f_g0g1mk_g01kn_g0g1mn, + arg.c_g0_g1_m_n_.mDesc.GetLengths()[0], + arg.c_g0_g1_m_n_.mDesc.GetLengths()[1], + arg.c_g0_g1_m_n_.mDesc.GetLengths()[2], + arg.c_g0_g1_m_n_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_g0_g1_m_k, + const Tensor& b_g0_1_k_n, + Tensor& c_g0_g1_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{ + a_g0_g1_m_k, b_g0_1_k_n, c_g0_g1_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceBatchedGemm_MQA" + << std::endl; + // clang-format on + + return str.str(); + } +}; + } // namespace host } // namespace tensor_operation } // namespace ck From b2d5cf8a78df291d40354e8a216168ea5461eca1 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 8 Aug 2023 07:40:52 +0000 Subject: [PATCH 103/118] GQA-4 example --- .../CMakeLists.txt | 1 + ...uped_query_attention_forward_wmma_fp16.cpp | 302 ++++ ...ulti_query_attention_forward_wmma_fp16.cpp | 9 +- ...n_grouped_query_attention_forward_wmma.inc | 340 +++++ ...e_grouped_query_attention_forward_wmma.hpp | 1257 +++++++++++++++++ .../cpu/reference_batched_gemm.hpp | 
124 ++ 6 files changed, 2028 insertions(+), 5 deletions(-) create mode 100644 example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index a88a3503144..af6609faad1 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -11,6 +11,7 @@ if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp) add_example_executable(example_cross_attention_forward_wmma_fp16 cross_attention_forward_wmma_fp16.cpp) add_example_executable(example_multi_query_attention_forward_wmma_fp16 multi_query_attention_forward_wmma_fp16.cpp) + add_example_executable(example_grouped_query_attention_forward_wmma_fp16 grouped_query_attention_forward_wmma_fp16.cpp) endif() add_custom_target(example_gemm_scale_softmax_gemm) diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp new file mode 100644 index 00000000000..12dcfcc36d9 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_query_attention_forward_wmma_fp16.cpp @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Grouped Query Attention, +Ainslie, Joshua, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebrón, and Sumit +Sanghai. 
“GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints.” +arXiv, May 22, 2023. https://doi.org/10.48550/arXiv.2305.13245. + +Example is GQA-4 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using Acc0DataType = F32; +using Acc1DataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; +static constexpr ck::index_t QueryGroupNumber = 4; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + 
ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +// #define CK_MHA_USE_WAVE_1 +// #define CK_MHA_USE_WAVE_2 +// #define CK_MHA_USE_WAVE_4 +#define CK_MHA_USE_WAVE_8 +using DeviceMHAFactory = + std::tuple< +#ifdef CK_MHA_USE_WAVE_1 + // 1 wave, mrepeat = 1, nrepeat = 2, k/o repeat = 1~5 + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 32, + // Gemm 0 + 16, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 
QueryGroupNumber, + 32, + // Gemm 0 + 16, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_2 + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 64, + // Gemm 0 + 32, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 64, + // Gemm 0 + 32, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_4 + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 128, + // Gemm 0 + 64, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 128, + // Gemm 0 + 64, 64, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 
1, 2, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 64, 1, 2>, 8, + MaskingSpec>, +#endif +#ifdef CK_MHA_USE_WAVE_8 + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceGroupedQueryAttentionForward_Wmma< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + QueryGroupNumber, + 256, + // Gemm 0 + 128, 128, 64, 8, 8, + // Gemm 1 + 64, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 8, 4, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec> +#endif + >; +// clang-format on +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = + 
ck::tensor_operation::host::ReferenceBatchedGemm_GQA; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = + ck::tensor_operation::host::ReferenceBatchedGemm_GQA; + +#include "run_grouped_query_attention_forward_wmma.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp index 43feea12fb4..694a320a45f 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/multi_query_attention_forward_wmma_fp16.cpp @@ -2,11 +2,10 @@ // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. /* -Gemm + Softmax + Gemm fused operation. Computes C_g_m_n = Softmax(A_g_m_k * B0_g_k_l) * B1_g_l_n - |-----------------| - Gemm0 - |-------------------------------------| - Gemm1 +Multi-Query Attention +Shazeer, Noam. “Fast Transformer Decoding: One Write-Head Is All You Need.” arXiv.org, November 6, +2019. https://arxiv.org/abs/1911.02150v1. + */ #include diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc new file mode 100644 index 00000000000..0d66d837d30 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. 
+ +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape for A/B0/B1/C + // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 64; + ck::index_t O = 64; + + // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape + // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) + // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t G0 = 4; + ck::index_t G1 = 16; + ck::index_t KV_head = QueryGroupNumber; + + float alpha = 1; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + G0 = std::stoi(argv[8]); + G1 = std::stoi(argv[9]); + + alpha = std::stof(argv[10]); + + input_permute = std::stoi(argv[11]); + output_permute = std::stoi(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 11: M, N, K, O, G0, G1\n"); + printf("arg10: scale (alpha)\n"); + printf("arg11 to 12: input / output permute\n"); + exit(0); + } + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, KV_head, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? 
std::vector{N * KV_head * K, K, KV_head * K, 1} + // B0 layout [G0, N, G1, K] + : std::vector{KV_head * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, KV_head, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ? std::vector{N * KV_head * O, O, 1, KV_head * O} + // B1 layout [G0, N, G1, O] + : std::vector{KV_head * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + case 4: // A, B0, B1 1 + 
a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 5: // Rand: b1 b0; unit: a + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 6: // Rand: a b0 ; unit: B1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 7: // Rand: a b1 ; unit: b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 8: // Rand: a ; unit: b0 b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 9: // Rand: b0 ; unit: a b1 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 10: // Rand: b1 ; unit: a b0 + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem 
c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data()); + b1_device_buf.ToDevice(b1_gs_os_ns.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + float best_perf = .0; + float best_time = .0; + int not_pass = 0; + std::string best_kernel = ""; + printf("Verification: %s\n", do_verification ? "ON" : "OFF"); + // TODO ANT: replace array with vector? + ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { + const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + + using DeviceMHAInstance = ck::remove_cvref_t; + auto gemm = DeviceMHAInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + G0, + G1, + alpha, + input_permute, + output_permute); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + // Skip this instance instead of launching it with an argument it rejected. + // The lambda returns void, so a bare return moves on to the next instance. + return; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1; + std::size_t num_btype = + (sizeof(ADataType) * M * K + sizeof(CDataType) * M * O) * G0 * G1 + + (sizeof(B0DataType) * K * N + sizeof(B1DataType) * N * O) * G0 * QueryGroupNumber; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + if(tflops > best_perf) + {
+ best_perf = tflops; + best_time = ave_time * 1000; + best_kernel = gemm.GetTypeString(); + } + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g0_g1_m_k({G0, G1, M, K}); + Tensor b0_g0_gq_k_n({G0, QueryGroupNumber, K, N}); + Tensor b1_g0_gq_n_o({G0, QueryGroupNumber, N, O}); + Tensor acc0_g0_g1_m_n({G0, G1, M, N}); // scratch object after gemm0 + Tensor a1_g0_g1_m_n({G0, G1, M, N}); // scratch object after softmax + Tensor c_g0_g1_m_o_host_result({G0, G1, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g0_g1_m_k(idx[0], idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g0_gq_k_n(idx[0], idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g0_gq_n_o(idx[0], idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument(a_g0_g1_m_k, + b0_g0_gq_k_n, + acc0_g0_g1_m_n, + a_element_op, + b0_element_op, + acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + acc0_g0_g1_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[2], idx[3])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = + ref_softmax.MakeArgument(acc0_g0_g1_m_n, a1_g0_g1_m_n, 1, 0, {3}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g0_g1_m_n, + b1_g0_gq_n_o, + c_g0_g1_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + 
ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach( + [&](auto& self, auto idx) { self(idx) = c_g0_g1_m_o_host_result(idx); }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + bool this_run_verification = ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol); + printf("Verification: %s, Pass: %s\n", + do_verification ? "ON" : "OFF", + this_run_verification ? "YES" : "NO"); + + if(!this_run_verification) + { + not_pass = 1; + printf("%d th MQA instance verification Failed \n", i.value); + } + } + }); + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M + << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + std::cout << "Best kernel: " << best_kernel << " , " << best_perf << " TFlops , " << best_time + << " us" << std::endl; + std::cout << "---------------------------------------------------------------------------------" + "-----------" + << std::endl; + return not_pass; +} diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp new file mode 100644 index 00000000000..2313b256c32 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp @@ -0,0 +1,1257 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp" +#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Grouped-Query Attention (GQA) kernel implementation +// K and V carry QueryGroupNumber heads; the K/V batch index is g_idx * QueryGroupNumber / G1, +// so each K/V head serves G1 / QueryGroupNumber query heads.
+// Q [G0, G1, M, K] * K [G0, QueryGroupNumber, K, N] = P [G0, G1, M, N] +// P [G0, G1, M, N] * V [G0, QueryGroupNumber, N, O] = Out [G0, G1, M, O] +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_query_attention_wmma(const ADataType* __restrict__ p_a_grid, + const B0DataType* __restrict__ p_b0_grid, + const B1DataType* __restrict__ p_b1_grid, + CDataType* __restrict__ p_c_grid, + index_t M, // SequenceQ + index_t N, // SequenceK + index_t K, // HeadDim + index_t O, // last dimension of V and of the output + index_t G0, // Batch + index_t G1, // HeadNum (query heads) + float alpha, + bool input_permute, + bool output_permute) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ + defined(__gfx1102__)) + + // clang-format off +// *************************************************** + const auto q_head = G1; + const auto kv_head = QueryGroupNumber; +// Make Tensor Descriptors + constexpr index_t array_size = 4; + std::array a_gs_ms_ks_lengths{G0, q_head, M, K}; + std::array a_gs_ms_ks_strides = + input_permute + ? std::array{M * q_head * K, K, q_head * K, 1} // A layout [G0, M, G1, K] + : std::array{q_head * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::array b0_gs_ns_ks_lengths{G0, kv_head, N, K}; + std::array b0_gs_ns_ks_strides = + input_permute + ? std::array{N * kv_head * K, K, kv_head * K, 1} // B0 layout [G0, N, kv_head, K] + : std::array{kv_head * N * K, N * K, K, 1}; // B0 layout [G0, kv_head, N, K] + + std::array b1_gs_os_ns_lengths{G0, kv_head, O, N}; + std::array b1_gs_os_ns_strides = + input_permute + ? std::array{N * kv_head * O, O, 1, kv_head * O} // B1 layout [G0, N, kv_head, O] + : std::array{kv_head * N * O, N * O, 1, O}; // B1 layout [G0, kv_head, N, O] + + std::array c_gs_ms_os_lengths{G0, q_head, M, O}; + std::array c_gs_ms_os_strides = + output_permute + ?
std::array{M * q_head * O, O, q_head * O, 1} // C layout [G0, M, G1, O] + : std::array{q_head * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + const auto a_element_op = AElementwiseOperation{}; + const auto b0_element_op = B0ElementwiseOperation{}; + const auto acc0_element_op = AccElementwiseOperation{alpha}; + const auto b1_element_op = B1ElementwiseOperation{}; + const auto c_element_op = CElementwiseOperation{}; + // fail to reuse DeviceOp::MakeArgument() because of the __device__ function required. + + const auto a_grid_desc = DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n); + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto a_grid_desc_g_m_k = + DeviceOp::Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc_g_l_k = + DeviceOp::Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc_g_n_l = + DeviceOp::Transform::MakeB1GridDescriptor_G_N_K(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + const auto compute_base_ptr_of_batch = + typename DeviceOp::ComputeBasePtrOfStridedBatch{a_grid_desc_g_m_k, b0_grid_desc_g_l_k, b1_grid_desc_g_n_l, c_grid_desc_g_m_n}; + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + const auto c0_matrix_mask = typename 
DeviceOp::C0MatrixMask{b0_grid_desc_g_l_k.GetLength(Number<1>{})}; + + // clang-format on + __shared__ char p_shared[GridwiseOp::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b0_batch_offset = __builtin_amdgcn_readfirstlane(static_cast( + compute_base_ptr_of_batch.GetB0BasePtr(g_idx * QueryGroupNumber / G1))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(static_cast( + compute_base_ptr_of_batch.GetB1BasePtr(g_idx * QueryGroupNumber / G1))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseOp::template Run(p_a_grid + a_batch_offset, + p_b0_grid + b0_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc, + b0_grid_desc, + b1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op, + c0_matrix_mask, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b0_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = M; + ignore = N; + ignore = K; + ignore = O; + ignore = G0; + ignore = G1; + ignore = input_permute; + ignore = output_permute; +#endif // end of if (defined(__gfx1100__)) +} + +// Computes C = A * B0 * B1 +// MN = MK * KL * LN +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceGroupedQueryAttentionForward_Wmma + : public DeviceBatchedGemmSoftmaxGemmPermute +{ + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimL > 0 && NumDimK > 0 && NumDimN > 0, + "Number of dimension must be greater than 0"); + + static constexpr index_t NumAcc0Bias = 
Acc0BiasDataType::Size(); + static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size(); + + // TODO ANT: implement bias combination + // Both bias lists must be empty: the Argument constructor ignores every bias input. + static_assert(NumAcc0Bias == 0 && NumAcc1Bias == 0, "Bias addition is unimplemented"); + + static constexpr index_t NumDimGemm0M = NumDimM; + static constexpr index_t NumDimGemm0N = NumDimL; + static constexpr index_t NumDimGemm0K = NumDimK; + static constexpr index_t NumDimGemm1M = NumDimM; + static constexpr index_t NumDimGemm1N = NumDimN; + static constexpr index_t NumDimGemm1K = NumDimL; + + using DeviceOp = DeviceGroupedQueryAttentionForward_Wmma; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + + static constexpr auto WmmaK = 16; + + static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); + static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma); + static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); + + static constexpr auto AEnableLds_auto = LWaves == 1 ? false : true; + static constexpr auto B0EnableLds_auto = MWaves == 1 ? false : true; + static constexpr auto B1EnableLds_auto = MWaves == 1 ?
false : true; + + static constexpr auto AEnableLds_manu = false; + static constexpr auto B0EnableLds_manu = true; + static constexpr auto B1EnableLds_manu = true; + + static constexpr auto AEnableLds = AEnableLds_auto || AEnableLds_manu || (NumPrefetch > 1); + static constexpr auto B0EnableLds = B0EnableLds_auto || B0EnableLds_manu || (NumPrefetch > 1); + static constexpr auto B1EnableLds = B1EnableLds_auto || B1EnableLds_manu || (NumPrefetch > 1); + + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm_Wmma< + Sequence, + Sequence, + GemmSpec, + ASpec, + B0Spec, + B1Spec, + CSpec>; + + __host__ __device__ static auto MakeAGridDescriptor( + const std::array& a_gs_ms_ks_lengths_vec, + const std::array& a_gs_ms_ks_strides_vec) + { + if constexpr(AEnableLds) + { + return Transform::MakeAGridDescriptor_AK0_M_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}); + } + else + { + return Transform:: + MakeAGridDescriptor_AKWmma_MBlockRepeat_MWaves_AK0PerWmma_AKRow_MPerWmma_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, + a_gs_ms_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } + } + + __host__ __device__ static auto MakeB0GridDescriptor( + const std::array& b0_gs_ls_ks_lengths_vec, + const std::array& b0_gs_ls_ks_strides_vec) + { + if constexpr(B0EnableLds) + { + return Transform::MakeB0GridDescriptor_BK0_N_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}); + } + else + { + return Transform:: + MakeB0GridDescriptor_BKWmma_LBlockRepeat_LWaves_BK0PerWmma_BKRow_LPerWmma_BK1( + Transform::MakeB0GridDescriptor_N_K(b0_gs_ls_ks_lengths_vec, + b0_gs_ls_ks_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } + } + + __host__ __device__ static auto MakeB1GridDescriptor( + const std::array& b1_gs_ns_ls_lengths_vec, + const std::array& b1_gs_ns_ls_strides_vec) + { + if 
constexpr(B1EnableLds) + { + return Transform::MakeB1GridDescriptor_BK0_N_BK1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}); + } + else + { + return Transform:: + MakeB1GridDescriptor_BLWmma_NBlockRepeat_NWaves__BL0PerWmma_BLRow_NPerWmma_BL1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_ns_ls_lengths_vec, + b1_gs_ns_ls_strides_vec), + Number{}, + Number{}, + Number{}, + Number{}, + Number{}); + } + } + + using AGridDesc = decltype(MakeAGridDescriptor({}, {})); + using B0GridDesc = decltype(MakeB0GridDescriptor({}, {})); + using B1GridDesc = decltype(MakeB1GridDescriptor({}, {})); + using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); + using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); + using B0GridDesc_G_L_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_L = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); + + __host__ __device__ constexpr static auto make_MaskOutPredicate() + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + return MaskDisabledPredicate{}; + } + else if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + return MaskOutUpperTrianglePredicate{}; + } + } + using C0MatrixMask = C0MatrixMask_impl; + + struct ComputeBasePtrOfStridedBatch + { + __host__ __device__ ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, + const B0GridDesc_G_L_K& b0_grid_desc_g_l_k, + const B1GridDesc_G_N_L& b1_grid_desc_g_n_l, + const CGridDesc_G_M_N& c_grid_desc_g_m_n) + : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), + b0_grid_desc_g_l_k_(b0_grid_desc_g_l_k), + b1_grid_desc_g_n_l_(b1_grid_desc_g_n_l), + c_grid_desc_g_m_n_(c_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return 
a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB0BasePtr(index_t g_idx) const + { + return b0_grid_desc_g_l_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return b1_grid_desc_g_n_l_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + private: + AGridDesc_G_M_K a_grid_desc_g_m_k_; + B0GridDesc_G_L_K b0_grid_desc_g_l_k_; + B1GridDesc_G_N_L b1_grid_desc_g_n_l_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + }; + + // GridwiseOp + using GridwiseOp = GridwiseBatchedGemmSoftmaxGemm_Wmma< + // DataType Family + ADataType, + B0DataType, + Acc0DataType, + B1DataType, + Acc1DataType, + CShuffleDataType, + CDataType, + // ElementwiseOp Family + AElementwiseOperation, + B0ElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + // InMemory Data Descriptor + AGridDesc, + B0GridDesc, + B1GridDesc, + CGridDesc_M_N, + // Tiling Family + MPerBlock, + LPerBlock, + KPerBlock, + AK1, + BK1, + NPerBlock, + LTilePerBlock, + L1, + MPerWmma, + LPerWmma, + NPerWmma, + MRepeat, + LRepeat, + NRepeat, + // ThreadCluster Family + BlockSize, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + true, + AEnableLds, + ABlockLdsAddExtraM, + B0BlockTransferThreadClusterLengths_K0_L_K1, + B0BlockTransferThreadClusterArrangeOrder, + B0BlockTransferSrcAccessOrder, + B0BlockTransferSrcVectorDim, + B0BlockTransferSrcScalarPerVector, + B0BlockTransferDstScalarPerVector_K1, + true, + B0EnableLds, + B0BlockLdsAddExtraL, + 
B1BlockTransferThreadClusterLengths_L0_N_L1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_L1, + false, + B1EnableLds, + B1BlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + Transform::matrix_padder.PadN, + MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle, + NumPrefetch, + LoopSched, + PipelineVer>; + + struct RawArg : public BaseArgument + { + RawArg(const ADataType* p_a_grid, + const B0DataType* p_b0_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) + : p_a_grid_{p_a_grid}, + p_b0_grid_{p_b0_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + M_{M}, + N_{N}, + K_{K}, + O_{O}, + G0_{G0}, + G1_{G1}, + alpha_{alpha}, + input_permute_{input_permute}, + output_permute_{output_permute} + { + } + // Pointers + const ADataType* p_a_grid_; + const B0DataType* p_b0_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // Raw Problem Size + index_t M_; + index_t N_; + index_t K_; + index_t O_; + index_t G0_; + index_t G1_; + float alpha_; + bool input_permute_; + bool output_permute_; + }; + + static auto MakeArgument(const ADataType* p_a, + const B0DataType* p_b0, + const B1DataType* p_b1, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t O, + index_t G0, + index_t G1, + float alpha, + bool input_permute, + bool output_permute) + { + return RawArg{ + p_a, p_b0, p_b1, p_c, M, N, K, O, G0, G1, alpha, input_permute, output_permute}; + } + + static bool IsSupportedArgument(const RawArg& arg) + { + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == 
"gfx1102") + { + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc0 Type err"); + return false; + } + + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc1 Type err"); + return false; + } + } + else + { + printf("DeviceOp: Arch err"); + return false; + } + + if(arg.G1_ % QueryGroupNumber != 0) + { + return false; + } + + constexpr index_t array_size = 4; + ck::index_t G0 = arg.G0_; + ck::index_t G1 = arg.G1_; + ck::index_t M = arg.M_; + ck::index_t N = arg.N_; + ck::index_t K = arg.K_; + ck::index_t O = arg.O_; + bool input_permute = arg.input_permute_; + bool output_permute = arg.output_permute_; + + std::array a_gs_ms_ks_lengths{G0, G1, M, K}; + std::array a_gs_ms_ks_strides = + input_permute ? std::array{M * G1 * K, K, G1 * K, 1} + // A layout [G0, M, G1, K] + : std::array{ + G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::array b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::array b0_gs_ns_ks_strides = + input_permute ? std::array{N * G1 * K, K, G1 * K, 1} + // B0 layout [G0, N, G1, K] + : std::array{ + G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::array b1_gs_os_ns_lengths{G0, G1, O, N}; + std::array b1_gs_os_ns_strides = + input_permute ? std::array{N * G1 * O, O, 1, G1 * O} + // B1 layout [G0, N, G1, O] + : std::array{ + G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::array c_gs_ms_os_lengths{G0, G1, M, O}; + std::array c_gs_ms_os_strides = + output_permute ? 
std::array{M * G1 * O, O, G1 * O, 1} + // C layout [G0, M, G1, O] + : std::array{ + G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + const auto a_grid_desc = + DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + const auto b0_grid_desc = + DeviceOp::MakeB0GridDescriptor(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + const auto b1_grid_desc = + DeviceOp::MakeB1GridDescriptor(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = + DeviceOp::Transform::MakeCGridDescriptor_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + const auto block_2_ctile_map = GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n, 1, 1); + + const auto c_grid_desc_g_m_n = + DeviceOp::Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_os_lengths, c_gs_ms_os_strides); + index_t batch_count = c_grid_desc_g_m_n.GetLength(Number<0>{}); + + if(!GridwiseOp::CheckValidity( + a_grid_desc, b0_grid_desc, b1_grid_desc, c_grid_desc_m_n, block_2_ctile_map)) + { + return false; + } + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = c_grid_desc_g_m_n.GetLength(I0); // unpadded + + if(!(c_g == batch_count)) + { + printf("DeviceOp: BatchCount err"); + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = M; + const auto LzRaw = N; + const auto KzRaw = K; + const auto NzRaw = O; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? 
LzRaw : NzRaw; + const auto c_extent_lowest = NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + printf("DeviceOp: Data Transfer Vector scalar err"); + return false; + } + + std::array a_mz_kz_strides_{ + a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}; + std::array b0_lz_kz_strides_{ + b0_gs_ns_ks_strides[NumDimG + NumDimL - 1], + b0_gs_ns_ks_strides[NumDimG + NumDimL + NumDimK - 1]}; + std::array b1_nz_lz_strides_{ + b1_gs_os_ns_strides[NumDimG + NumDimN - 1], + b1_gs_os_ns_strides[NumDimG + NumDimN + NumDimL - 1]}; + std::array c_mz_nz_strides_{ + c_gs_ms_os_strides[NumDimG + NumDimM - 1], + c_gs_ms_os_strides[NumDimG + NumDimM + NumDimN - 1]}; + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? a_mz_kz_strides_[1] : a_mz_kz_strides_[0]; + const auto b0_stride_lowest = + B0BlockTransferSrcVectorDim == 2 ? b0_lz_kz_strides_[1] : b0_lz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? 
b1_nz_lz_strides_[1] : b1_nz_lz_strides_[0]; + const auto c_stride_lowest = c_mz_nz_strides_[1]; + + if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + printf("DeviceOp: Data Vectorize transfer err"); + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + // Argument + struct Argument : public BaseArgument + { + Argument( + const ADataType* p_a_grid, + const B0DataType* p_b0_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b0_gs_ls_ks_lengths, + const std::array& b0_gs_ls_ks_strides, + const std::array& b1_gs_ns_ls_lengths, + const std::array& b1_gs_ns_ls_strides, + const std::array& c_gs_ms_ns_lengths, + const std::array& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + const index_t M01, + const index_t N01, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b0_grid_{p_b0_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc{DeviceOp::MakeAGridDescriptor(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b0_grid_desc{ + DeviceOp::MakeB0GridDescriptor(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc{ + DeviceOp::MakeB1GridDescriptor(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + c_grid_desc_m_n_{ + Transform::MakeCGridDescriptor_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, + 
a_grid_desc_g_m_k_{ + Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b0_grid_desc_g_l_k_{ + Transform::MakeB0GridDescriptor_G_N_K(b0_gs_ls_ks_lengths, b0_gs_ls_ks_strides)}, + b1_grid_desc_g_n_l_{ + Transform::MakeB1GridDescriptor_G_N_K(b1_gs_ns_ls_lengths, b1_gs_ns_ls_strides)}, + c_grid_desc_g_m_n_{ + Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_ns_lengths, c_gs_ms_ns_strides)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseOp::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)}, + a_element_op_{a_element_op}, + b0_element_op_{b0_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op}, + c0_matrix_mask_{b0_grid_desc_g_l_k_.GetLength(I1)}, + raw_lengths_mz_lz_kz_nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], + b0_gs_ls_ks_lengths[NumDimG + NumDimL - 1], + b0_gs_ls_ks_lengths[NumDimG + NumDimL + NumDimK - 1], + b1_gs_ns_ls_lengths[NumDimG + NumDimN - 1]}, + a_mz_kz_strides_{a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, + b0_lz_kz_strides_{b0_gs_ls_ks_strides[NumDimG + NumDimL - 1], + b0_gs_ls_ks_strides[NumDimG + NumDimL + NumDimK - 1]}, + b1_nz_lz_strides_{b1_gs_ns_ls_strides[NumDimG + NumDimN - 1], + b1_gs_ns_ls_strides[NumDimG + NumDimN + NumDimL - 1]}, + c_mz_nz_strides_{c_gs_ms_ns_strides[NumDimG + NumDimM - 1], + c_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]}, + batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}, + compute_ptr_offset_of_batch_{ + a_grid_desc_g_m_k_, b0_grid_desc_g_l_k_, b1_grid_desc_g_n_l_, c_grid_desc_g_m_n_} + { + // TODO ANT: implement bias addition + ignore = p_acc0_biases; + ignore = p_acc1_biases; + ignore = acc0_biases_gs_ms_ls_lengths; + ignore = acc0_biases_gs_ms_ls_strides; + ignore = acc1_biases_gs_ms_ns_lengths; + ignore = acc1_biases_gs_ms_ns_strides; + + if(GridwiseOp::CheckValidity( + a_grid_desc, b0_grid_desc, b1_grid_desc, c_grid_desc_m_n_, 
block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseOp::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + // Pointers + const ADataType* p_a_grid_; + const B0DataType* p_b0_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // Tensor Descriptors + AGridDesc a_grid_desc; + B0GridDesc b0_grid_desc; + B1GridDesc b1_grid_desc; + CGridDesc_M_N c_grid_desc_m_n_; + + AGridDesc_G_M_K a_grid_desc_g_m_k_; + B0GridDesc_G_L_K b0_grid_desc_g_l_k_; + B1GridDesc_G_N_L b1_grid_desc_g_n_l_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + + typename GridwiseOp::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + + // Block to Tile mapping + typename GridwiseOp::DefaultBlock2CTileMap block_2_ctile_map_; + + // ElementwiseOp + AElementwiseOperation a_element_op_; + B0ElementwiseOperation b0_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + + // check C0 masking and padding + C0MatrixMask c0_matrix_mask_; + + // Strides for the last M/N/K dimensions of A/B0/B1/C + // for sanity check of vector load/store + std::array raw_lengths_mz_lz_kz_nz_; + std::array a_mz_kz_strides_; + std::array b0_lz_kz_strides_; + std::array b1_nz_lz_strides_; + std::array c_mz_nz_strides_; + + index_t batch_count_; + // Batch Offset + ComputeBasePtrOfStridedBatch compute_ptr_offset_of_batch_; + }; + + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::RawArg; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto M0 = math::integer_divide_ceil(arg.M_, MPerBlock); + const auto N0 = math::integer_divide_ceil(arg.O_, NPerBlock); + + const index_t grid_size = arg.G0_ * arg.G1_ * M0 * N0; + const auto K = arg.K_; + // printf("HasKBlockLoop: %d\n", GridwiseOp::CalculateHasMainKBlockLoop(K)); + auto launch_kernel = [&](auto 
has_main_k_block_loop) { + const auto kernel = kernel_grouped_query_attention_wmma; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b0_grid_, + arg.p_b1_grid_, + arg.p_c_grid_, + arg.M_, + arg.N_, + arg.K_, + arg.O_, + arg.G0_, + arg.G1_, + arg.alpha_, + arg.input_permute_, + arg.output_permute_); + }; + + if(GridwiseOp::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } +#if 0 + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || + ck::get_device_name() == "gfx1102") + { + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc0 Type err"); + return false; + } + + if constexpr(!(is_same_v || is_same_v)) + { + printf("DeviceOp: Acc1 Type err"); + return false; + } + } + else + { + printf("DeviceOp: Arch err"); + return false; + } + + if(!GridwiseOp::CheckValidity(arg.a_grid_desc, + arg.b0_grid_desc, + arg.b1_grid_desc, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + return false; + } + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded + + if(!(c_g == arg.batch_count_)) + { + printf("DeviceOp: BatchCount err"); + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0]; + const auto LzRaw = 
arg.raw_lengths_mz_lz_kz_nz_[1]; + const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2]; + const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw; + const auto c_extent_lowest = NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + printf("DeviceOp: Data Transfer Vector scalar err"); + return false; + } + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; + const auto b0_stride_lowest = + B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? 
arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0]; + const auto c_stride_lowest = arg.c_mz_nz_strides_[1]; + + if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + printf("DeviceOp: Data Vectorize transfer err"); + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const ADataType* p_a, + const B0DataType* p_b0, + const B1DataType* p_b1, + CDataType* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::array& a_gs_ms_ks_lengths, + const std::array& a_gs_ms_ks_strides, + const std::array& b0_gs_ls_ks_lengths, + const std::array& b0_gs_ls_ks_strides, + const std::array& b1_gs_ns_ls_lengths, + const std::array& b1_gs_ns_ls_strides, + const std::array& c_gs_ms_ns_lengths, + const std::array& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b0, + p_b1, + p_c, + p_acc0_biases, + p_acc1_biases, + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ls_ks_lengths, + b0_gs_ls_ks_strides, + b1_gs_ns_ls_lengths, + b1_gs_ns_ls_strides, + c_gs_ms_ns_lengths, + c_gs_ms_ns_strides, + acc0_biases_gs_ms_ls_lengths, + acc0_biases_gs_ms_ls_strides, + acc1_biases_gs_ms_ns_lengths, + acc1_biases_gs_ms_ns_strides, + 1, + 1, + a_element_op, + b0_element_op, + acc_element_op, + b1_element_op, + c_element_op}; + } +#endif + + // polymorphic + std::unique_ptr MakeArgumentPointer( + const void* p_a, + 
const void* p_b0, + const void* p_b1, + void* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b0_gs_ls_ks_lengths, + const std::vector& b0_gs_ls_ks_strides, + const std::vector& b1_gs_ns_ls_lengths, + const std::vector& b1_gs_ns_ls_strides, + const std::vector& c_gs_ms_ns_lengths, + const std::vector& c_gs_ms_ns_strides, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ls_strides, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths, + const std::array, NumAcc1Bias> acc1_biases_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + std::array a_lengths; + std::array a_strides; + std::array b0_lengths; + std::array b0_strides; + std::array b1_lengths; + std::array b1_strides; + std::array c_lengths; + std::array c_strides; + std::transform(a_gs_ms_ks_lengths.begin(), + a_gs_ms_ks_lengths.end(), + a_lengths.begin(), + [](index_t i) { return i; }); + std::transform(a_gs_ms_ks_strides.begin(), + a_gs_ms_ks_strides.end(), + a_strides.begin(), + [](index_t i) { return i; }); + std::transform(b0_gs_ls_ks_lengths.begin(), + b0_gs_ls_ks_lengths.end(), + b0_lengths.begin(), + [](index_t i) { return i; }); + std::transform(b0_gs_ls_ks_strides.begin(), + b0_gs_ls_ks_strides.end(), + b0_strides.begin(), + [](index_t i) { return i; }); + std::transform(b1_gs_ns_ls_lengths.begin(), + b1_gs_ns_ls_lengths.end(), + b1_lengths.begin(), + [](index_t i) { return i; }); + std::transform(b1_gs_ns_ls_strides.begin(), + b1_gs_ns_ls_strides.end(), + b1_strides.begin(), + [](index_t i) { return i; }); + std::transform(c_gs_ms_ns_lengths.begin(), + c_gs_ms_ns_lengths.end(), + c_lengths.begin(), + [](index_t i) { return i; 
}); + std::transform(c_gs_ms_ns_strides.begin(), + c_gs_ms_ns_strides.end(), + c_strides.begin(), + [](index_t i) { return i; }); + return std::make_unique(static_cast(p_a), + static_cast(p_b0), + static_cast(p_b1), + static_cast(p_c), + p_acc0_biases, + p_acc1_biases, + a_lengths, + a_strides, + b0_lengths, + b0_strides, + b1_lengths, + b1_strides, + c_lengths, + c_strides, + acc0_biases_gs_ms_ls_lengths, + acc0_biases_gs_ms_ls_strides, + acc1_biases_gs_ms_ns_lengths, + acc1_biases_gs_ms_ns_strides, + 1, + 1, + a_element_op, + b0_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceGroupedQueryAttentionForward_Wmma, " + << "QueryGroupNumber: " + << QueryGroupNumber << ", " + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << LPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << LTilePerBlock << ", " + << L1 << ", " + << getGemmSpecializationString(GemmSpec) << ", " + << "ASpec" << getTensorSpecializationString(ASpec) << ", " + << "B0Spec" << getTensorSpecializationString(B0Spec) << ", " + << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " + << "CSpec" << getTensorSpecializationString(CSpec) << ", " + << getMaskingSpecializationString(MaskingSpec) + << ">" + << " AEnableLds: " + << AEnableLds << ", " + << "B0EnableLds: " + << B0EnableLds << ", " + << "B1EnableLds: " + << B1EnableLds << ", " + << "NumPrefetch: " + << NumPrefetch << ", " + << 
"LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp index 327cc9e28c6..7a8e1d9a37d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp @@ -255,6 +255,130 @@ struct ReferenceBatchedGemm_MQA : public device::BaseOperator } }; +template +struct ReferenceBatchedGemm_GQA : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_g0_g1_m_k, + const Tensor& b_g0_gq_k_n, + Tensor& c_g0_g1_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_g0_g1_m_k_{a_g0_g1_m_k}, + b_g0_gq_k_n_{b_g0_gq_k_n}, + c_g0_g1_m_n_{c_g0_g1_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_g0_g1_m_k_; + const Tensor& b_g0_gq_k_n_; + Tensor& c_g0_g1_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceBatchedGemm_GQA::Argument; + + float Run(const Argument& arg) + { + auto f_g0g1mk_g0gqkn_g0g1mn = [&](auto g0, auto g1, auto m, auto n) { + const int G1 = arg.a_g0_g1_m_k_.mDesc.GetLengths()[1]; + const int K = arg.a_g0_g1_m_k_.mDesc.GetLengths()[3]; + + AccDataType v_acc = 0; + + for(int k = 0; k < K; ++k) + { + ADataType v_a; + BDataType v_b; + + arg.a_element_op_(v_a, arg.a_g0_g1_m_k_(g0, g1, m, k)); + 
arg.b_element_op_(v_b, arg.b_g0_gq_k_n_(g0, g1 * QueryGroupNumber / G1, k, n)); + + v_acc += + ck::type_convert(v_a) * ck::type_convert(v_b); + } + + AccDataType v_c; + + arg.c_element_op_(v_c, v_acc); + + arg.c_g0_g1_m_n_(g0, g1, m, n) = ck::type_convert(v_c); + }; + + make_ParallelTensorFunctor(f_g0g1mk_g0gqkn_g0g1mn, + arg.c_g0_g1_m_n_.mDesc.GetLengths()[0], + arg.c_g0_g1_m_n_.mDesc.GetLengths()[1], + arg.c_g0_g1_m_n_.mDesc.GetLengths()[2], + arg.c_g0_g1_m_n_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_g0_g1_m_k, + const Tensor& b_g0_gq_k_n, + Tensor& c_g0_g1_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{ + a_g0_g1_m_k, b_g0_gq_k_n, c_g0_g1_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceBatchedGemm_GQA" + << std::endl; + // clang-format on + + return str.str(); + } +}; + } // namespace host } // namespace tensor_operation } // namespace ck From d1894bdbbb993bc4ada02a9b21eb7f60f64966cd Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 9 Aug 2023 23:56:04 +0000 Subject: [PATCH 104/118] tempsave --- example/01_gemm/gemm_wmma_fp16.cpp | 12 ++++++------ example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp | 2 +- .../gpu/element/unary_element_wise_operation.hpp 
| 7 +++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index eeeabff9511..8c52e4f7d70 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -27,7 +27,7 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle BLayout, CLayout, ADataType, - BDataType, + BDataType, CDataType, AccDataType, CShuffleDataType, @@ -35,16 +35,16 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle BElementOp, CElementOp, GemmDefault, - 2, // Prefetch stage + 1, // Prefetch stage 128, // BlockSize - 128, // MPerBlock - 64, // NPerBlock + 64, // MPerBlock + 128, // NPerBlock 64, // KPerBlock 8, // K1 16, // MPerWmma 16, // NPerWmma - 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave - 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + 2, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 4, // N-Repeat // N-PerWmma / N-Repeat = N-Wave S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp index a3b7554e620..9dc97fecd86 100644 --- a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp +++ b/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp @@ -21,7 +21,7 @@ using QuantDataType = int8_t; using BDataType = uint8_t; using ScaleDataType = ck::half_t; using AccDataType = float; -using CShuffleDataType = ck::half_t; +using CShuffleDataType = float; using CDataType = ck::half_t; using ALayout = Row; diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 28d60e3ca90..b59b30849fe 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -404,6 +404,13 @@ struct FastNumericArrayConverter half_2[0] = 
__builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01); half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23); + // static constexpr ck::half_t fp16_subtract = -1152; + // Output.template AsType()(Number<0>{}) += fp16_subtract; + // Output.template AsType()(Number<1>{}) += fp16_subtract; + // Output.template AsType()(Number<2>{}) += fp16_subtract; + // Output.template AsType()(Number<3>{}) += fp16_subtract; + + // inline assembly get very poor performance as no chance to global scheduling static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" : "=v"(half_2[0]) From 061009a3bc682068fcd664dce2678ae9dc1f6065 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 15 Aug 2023 10:06:05 +0000 Subject: [PATCH 105/118] Compile pass --- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 2 +- ...oup_tensor_slice_transfer_v4r1_dequant.hpp | 222 ++++ .../device/impl/device_fpAintB_gemm_wmma.hpp | 7 +- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 253 +--- .../grid/gridwise_gemm_pipeline_selector.hpp | 5 + .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 103 ++ ...ise_tensor_slice_transfer_v3r1_dequant.hpp | 1034 +++++++++++++++++ 7 files changed, 1418 insertions(+), 208 deletions(-) create mode 100644 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp index 7aab2c77c2b..84251a7506c 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp @@ -315,7 +315,7 @@ struct Blockwise_fpAintB_GemmWMMA fast_numeric_converter; // basic intrinsic to determine loopover direction - if constexpr(MRepeat < 
NRepeat) + if constexpr( 0 ) { static_for<0, KPerBlock / WmmaK, 1>{}( [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp new file mode 100644 index 00000000000..338b3a88923 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp" + +namespace ck { + +/** + * @brief Blockwise data transfer with dequantization + * + * RunRead would load low-precision data and scale data. + * RunWrite would process dequantization process. + * Assume Scale is identical along K-dimension + * + * This version does following things to avoid scratch memory issue + * 1. Use StaticallyIndexedArray instead of C array for thread buffer + * 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor + * 3. 
ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate + * + */ +template +struct ThreadGroupTensorSliceTransfer_v4r1_dequant +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{}; + static constexpr auto scale_thread_slice_lengths = BlockScaleSliceLengths{} / ThreadClusterLengths{}; + + using Index = MultiIndex; + + __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1_dequant( + const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const SrcElementwiseOperation& src_element_op, + const ScaleDesc& scale_desc, + const Index& scale_block_slice_origin, + const ScaleElementwiseOperation& scale_element_op, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin, + const DstElementwiseOperation& dst_element_op) + : threadwise_transfer_(src_desc, + make_zero_multi_index(), + src_element_op, + scale_desc, + make_zero_multi_index(), + scale_element_op, + dst_desc, + make_zero_multi_index(), + dst_element_op) + + { + static_assert(nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{} && + is_same{} , + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), + "wrong! 
ThreadGroup::GetNumOfThread() too small"); + + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(ThreadGroup::GetThreadId())); + + const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths; + + threadwise_transfer_.SetSrcSliceOrigin(src_desc, + src_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetScaleSliceOrigin(scale_desc, + scale_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + Number thread_scratch_id = Number{}) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunRead(src_desc, src_buf, thread_scratch_id); + } + } + + // With the assumption, scale scratch is always one + template + __device__ void RunScaleRead(const ScaleDesc& scale_desc, + const ScaleBuffer& scale_buf) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunScaleRead(scale_desc, scale_buf); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + Number thread_scratch_id = Number{}) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunWrite(dst_desc, dst_buf, thread_scratch_id); + } + } + + // We don't prefer use this API directly + /* + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc& 
dst_desc, + DstBuffer& dst_buf, + Number thread_scratch_id) + { + RunRead(src_desc, src_buf, thread_scratch_id); + RunWrite(dst_desc, dst_buf, thread_scratch_id); + } + */ + + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + // With the assumption, scale buffer don't need move slice window method + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v3r1_dequant; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 64aaaf034c2..31c39ed0187 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -66,7 +66,7 @@ template + ck::PipelineVersion PipelineVer = ck::PipelineVersion::weight_only> struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB PipelineVersionToString{ {PipelineVersion::v1, "v1"}, {PipelineVersion::v2, "v2"}, - {PipelineVersion::dequant_v1, "dequant_v1"}}; + {PipelineVersion::dequant_v1, "dequant_v1"}, + {PipelineVersion::weight_only, "weight_only"}}; // clang-format off str << "DeviceFpAintBGemm_Wmma_CShuffle" diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index b44f8d0e0eb..da618c0c230 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -9,8 +9,9 @@ #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" -#include "ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp" #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp" #include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" #include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" @@ -82,6 +83,7 @@ __global__ void #endif // end of if (defined(__gfx1100__)) } +// Assume B is Col-Major template + PipelineVersion PipelineVer = PipelineVersion::weight_only> struct GridwiseFpAintBGemm_Wmma { static constexpr auto I0 = Number<0>{}; @@ -252,38 +254,6 @@ struct GridwiseFpAintBGemm_Wmma return b_block_desc; } - __host__ __device__ static constexpr auto MakeScaleBlockDescriptor() - { - // Scale [1, N], all K related dimension reduce to 1 - constexpr auto scale_block_desc = [&]() { - if constexpr(BEnableLds) - { - // K0->N->K1 Per Block - constexpr auto K0PerBlock = KPerBlock / K1; - - return make_naive_tensor_descriptor( - make_tuple(Number{}, Number{}, I1), - make_tuple(I0, I1, I0)); - } - else - { - constexpr auto KWmmaPerblock = KPerBlock / WmmaK; - constexpr auto K0PerWmma = WmmaK / 2 / K1; - // KWmma->NRepeat->MWave->K0PerWmma->KRow->MPerWmma->K1 Per Thread - return 
make_naive_tensor_descriptor(make_tuple(Number{}, - Number{}, - I1, - Number{}, - I1, - I1, - I1), - make_tuple(I0, I1, I0, I0, I0, I0, I0)); - } - }(); - - return scale_block_desc; - } - __host__ __device__ static constexpr auto MakeABlockSliceCopyStep() { constexpr auto a_block_copy_step = [&]() { @@ -424,47 +394,6 @@ struct GridwiseFpAintBGemm_Wmma return b_wave_desc; } - template - __host__ __device__ static constexpr auto MakeScaleWaveDescriptor(const ScaleBlockDesc_&) - { - constexpr auto scale_wave_desc = [&]() { - if constexpr(BEnableLds) - { - // BK0_N_BK1 -> BK0_NRepeat_Nwaves_NPerWmma_BK1 - constexpr auto B_K0 = ScaleBlockDesc_{}.GetLength(I0); - constexpr auto B_K1 = ScaleBlockDesc_{}.GetLength(I2); - constexpr auto B_KRow = I1; - return transform_tensor_descriptor( - ScaleBlockDesc_{}, - make_tuple(make_unmerge_transform(make_tuple(Number{}, B_KRow)), - make_unmerge_transform(make_tuple( - Number{}, Number{}, Number{})), - make_pass_through_transform(Number{})), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{})); - } - else - { - // KWmma_MRepeat_MWave_K0PerWmma_KRow_MPerWmma_K1 -> K0_MRepeat_Mwaves_MPerWmma_K1 - constexpr auto KWmma = ScaleBlockDesc_{}.GetLength(I0); - constexpr auto K0PerWmma = ScaleBlockDesc_{}.GetLength(I3); - constexpr auto B_KRow = ScaleBlockDesc_{}.GetLength(I4); - constexpr auto B_K1 = ScaleBlockDesc_{}.GetLength(I6); - - // Workaround, Freeze transform - return make_naive_tensor_descriptor(make_tuple(Number{}, - Number{}, - I1, - Number{}, - I1, - Number{}), - make_tuple(I0, I1, I0, I0, I0, I0)); - } - }(); - - return scale_wave_desc; - } - __host__ __device__ static constexpr auto // *Caution Here repeat is shuffle repeat GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat() @@ -613,8 +542,11 @@ struct GridwiseFpAintBGemm_Wmma struct SharedMemTrait { - // LDS allocation for A and B: be careful of alignment - + // LDS allocation 
for A and Dequantized B: be careful of DataType + // scale would not put into LDS. + using LDS_ADataType = ADataType; + using LDS_BDataType = ADataType; + using LDS_CDataType = CShuffleDataType; static constexpr auto max_lds_align = K1; static constexpr auto a_block_space_size_aligned = @@ -625,18 +557,13 @@ struct GridwiseFpAintBGemm_Wmma BEnableLds ? math::integer_least_multiple(MakeBBlockDescriptor().GetElementSpaceSize(), max_lds_align) : 0; - static constexpr auto scale_block_space_size_aligned = - BEnableLds ? math::integer_least_multiple( - MakeScaleBlockDescriptor().GetElementSpaceSize(), max_lds_align) - : 0; static constexpr auto a_block_space_offset = 0; + // B would be dequantize to ADataType before enter LDS + // b_lds_offset = LDS size allocated for a in byte / LDS_BDataType static constexpr auto b_block_space_offset = - (a_block_space_offset + a_block_space_size_aligned) * sizeof(ADataType) / - sizeof(BDataType); - static constexpr auto scale_block_space_offset = - (b_block_space_offset + b_block_space_size_aligned) * sizeof(BDataType) / - sizeof(ScaleDataType); + (a_block_space_offset + a_block_space_size_aligned) * sizeof(LDS_ADataType) / + sizeof(LDS_BDataType); // LDS allocation for C shuffle in LDS static constexpr auto c_shuffle_block_space_size = @@ -646,10 +573,9 @@ struct GridwiseFpAintBGemm_Wmma static constexpr auto c_shuffle_block_space_offset = 0; static constexpr auto lds_size = - math::max(c_shuffle_block_space_size * sizeof(CShuffleDataType), - a_block_space_size_aligned * sizeof(ADataType) + - b_block_space_size_aligned * sizeof(BDataType) + - scale_block_space_size_aligned * sizeof(ScaleDataType)); + math::max(c_shuffle_block_space_size * sizeof(LDS_CDataType), + a_block_space_size_aligned * sizeof(LDS_ADataType) + + b_block_space_size_aligned * sizeof(LDS_BDataType)); }; template @@ -707,7 +633,6 @@ struct GridwiseFpAintBGemm_Wmma constexpr auto a_block_desc = MakeABlockDescriptor(); constexpr auto b_block_desc = 
MakeBBlockDescriptor(); - constexpr auto scale_block_desc = MakeScaleBlockDescriptor(); auto a_block_trait = [&](){ // A matrix blockwise copy @@ -795,35 +720,44 @@ struct GridwiseFpAintBGemm_Wmma { constexpr auto K0PerBlock = KPerBlock/ K1; auto b_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, SharedMemTrait::b_block_space_size_aligned); auto b_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - BDataType, - BDataType, - decltype(b_grid_desc), - decltype(b_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - BBlockTransferSrcVectorDim, - 2, - BBlockTransferSrcScalarPerVector, - BBlockTransferDstScalarPerVector_K1, - 1, - 1, - BThreadTransferSrcResetCoordinateAfterRun, - true, + ThreadGroupTensorSliceTransfer_v4r1_dequant, +/* typename BlockScaleSliceLengths, */ Sequence, +/* typename ThreadClusterLengths, */ BBlockTransferThreadClusterLengths_K0_N_K1, +/* typename ThreadClusterArrangeOrder, */ BBlockTransferThreadClusterArrangeOrder, +/* typename SrcData, */ BDataType, +/* typename ScaleData, */ ScaleDataType, +/* typename DstData, */ ADataType, +/* typename SrcDesc, */ decltype(b_grid_desc), +/* typename ScaleDesc, */ decltype(scale_grid_desc), +/* typename DstDesc, */ decltype(b_block_desc), +/* typename SrcDimAccessOrder, */ BBlockTransferSrcAccessOrder, +/* typename DstDimAccessOrder, */ Sequence<0, 1, 2>, +/* index_t SrcVectorDim, */ BBlockTransferSrcVectorDim, +/* index_t DstVectorDim, */ 2, +/* index_t SrcScalarPerVector, */ BBlockTransferSrcScalarPerVector, +/* index_t ScaleScalarPerVector, */ 1, +/* index_t DstScalarPerVector, */ BBlockTransferDstScalarPerVector_K1, +/* index_t SrcScalarStrideInVector, */ 1, +/* index_t ScaleScalarStrideInVector, */ 1, +/* index_t DstScalarStrideInVector, */ 1, +/* bool 
ThreadTransferSrcResetCoordinateAfterRun, */ BThreadTransferSrcResetCoordinateAfterRun, +/* bool ThreadTransferDstResetCoordinateAfterRun, */ true, NumGemmKPrefetchStage>( b_grid_desc, make_multi_index(0, n_block_data_idx_on_grid, 0), b_element_op, + scale_grid_desc, + make_multi_index(0, n_block_data_idx_on_grid, 0), + ck::tensor_operation::element_wise::PassThrough{}, b_block_desc, make_multi_index(0, 0, 0), ck::tensor_operation::element_wise::PassThrough{}); @@ -870,108 +804,22 @@ struct GridwiseFpAintBGemm_Wmma } }; - auto scale_block_trait = [&](){ - if constexpr(BEnableLds) - { - constexpr auto K0PerBlock = KPerBlock/ K1; - - auto scale_block_buf = make_dynamic_buffer( - static_cast(p_shared) + SharedMemTrait::scale_block_space_offset, - SharedMemTrait::scale_block_space_size_aligned); - - auto scale_blockwise_copy = - ThreadGroupTensorSliceTransfer_v4r1, - BBlockTransferThreadClusterLengths_K0_N_K1, - BBlockTransferThreadClusterArrangeOrder, - ScaleDataType, - ScaleDataType, - decltype(scale_grid_desc), - decltype(scale_block_desc), - BBlockTransferSrcAccessOrder, - Sequence<0, 1, 2>, - BBlockTransferSrcVectorDim, - 2, - 1, - 1, - 1, // no effect - 1, // no effect - BThreadTransferSrcResetCoordinateAfterRun, - true, - NumGemmKPrefetchStage>( - scale_grid_desc, - make_multi_index(0, n_block_data_idx_on_grid, 0), - b_element_op, - scale_block_desc, - make_multi_index(0, 0, 0), - ck::tensor_operation::element_wise::PassThrough{}); - - return make_tuple(scale_block_buf, scale_blockwise_copy); - } - else - { - // Thread-wise copy - constexpr auto KWmmaPerBlock = KPerBlock / WmmaK; - constexpr auto K0PerWmma = WmmaK/2/K1Value; - // KPerBlock/WmmaK -> NRepeat -> NWaves -> WmmaK/K1 -> NPerWmma -> K1 - auto scale_block_buf = make_static_buffer( - scale_block_desc.GetElementSpaceSize()); - - auto scale_blockwise_copy = - ThreadwiseTensorSliceTransfer_v2{}, - Number{}, - I1, - Number{}, - I1, - I1, - Number{}>, - Sequence<0, 1, 2, 3, 4, 5, 6>, - 6, - 
BBlockTransferSrcScalarPerVector, - BThreadTransferSrcResetCoordinateAfterRun, - true>( - scale_grid_desc, - make_multi_index(0, - n_block_data_idx_on_grid/(NWaves * NPerWmma), - get_thread_local_1d_id() / 32, - 0, - (get_thread_local_1d_id() % 32 )/ 16, - get_thread_local_1d_id() % 16, - 0)); - - return make_tuple(scale_block_buf, scale_blockwise_copy); - } - }; - auto a_block_buf = a_block_trait()[I0]; auto a_blockwise_copy = a_block_trait()[I1]; auto b_block_buf = b_block_trait()[I0]; auto b_blockwise_copy = b_block_trait()[I1]; - - auto scale_block_buf = scale_block_trait()[I0]; - auto scale_blockwise_copy = scale_block_trait()[I1]; /*******************************************************************************/ // GEMM constexpr auto KPack = math::integer_least_multiple(K1, WmmaK); auto blockwise_gemm = - Blockwise_fpAintB_GemmWMMA{}; } + else if constexpr(PipelineVer == PipelineVersion::weight_only) + { + return GridwiseGemmPipeline_v1_WeightOnly{}; + } else { std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 0ff11a531f8..1b281674591 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -769,6 +769,109 @@ struct GridwiseGemmPipeline_v1_dequant<1, true, false> } }; +template +struct GridwiseGemmPipeline_v1_WeightOnly; + +template <> +struct GridwiseGemmPipeline_v1_WeightOnly<1, true, true> +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& 
a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const ScaleGridDesc& scale_grid_desc, + const ScaleGridBuffer& scale_grid_buf, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // Global Prefetch Stage 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + // Scale read once + b_blockwise_copy.RunScaleRead(scale_grid_desc, scale_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + // Dequantization fused in blockwise_copy + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + template struct GridwiseGemmPipelineInterwave_v1; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp 
b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp new file mode 100644 index 00000000000..2bb96845829 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp @@ -0,0 +1,1034 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor/static_tensor.hpp" + +namespace ck { + +namespace detail { +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access_for_src_and_dst_idle +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + if(i == SrcVectorDim && i == DstVectorDim) + { + return math::lcm(SrcScalarPerVector, DstScalarPerVector); + } + else if(i == SrcVectorDim) + { + return SrcScalarPerVector; + } + else if(i == DstVectorDim) + { + return DstScalarPerVector; + } + else + { + return 1; + } + } +}; + +} // namespace detail + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +// 5. Dequantization happened between read and write. 
+template +struct ThreadwiseTensorSliceTransfer_v3r1_dequant +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using ScaleCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + static constexpr auto I0 = Number<0>{}; + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1_dequant( + const SrcDesc& src_desc, + const Index& src_slice_origin, + const SrcElementwiseOperation& src_element_op, + const ScaleDesc& scale_desc, + const Index& scale_slice_origin, + const ScaleElementwiseOperation& scale_element_op, + const DstDesc& dst_desc, + const Index& dst_slice_origin, + const DstElementwiseOperation& dst_element_op) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + scale_coord_(make_tensor_coordinate(scale_desc, scale_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), + src_element_op_(src_element_op), + scale_element_op_(scale_element_op), + dst_element_op_(dst_element_op) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetScaleSliceOrigin(const ScaleDesc& scale_desc, const Index& scale_slice_origin_idx) + { + scale_coord_ = make_tensor_coordinate(scale_desc, scale_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + Number thread_scratch_id = Number{}) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, 
remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + constexpr auto src_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + using src_vector_type = vector_type_maker_t; + using src_vector_t = typename src_vector_type::type; + + // copy data from src_buf into src_vector_container + auto src_vector_container = src_vector_type{ + src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; + + // copy data from src_vector_container into src_thread_scratch_ + src_thread_scratch_tuple_(thread_scratch_id) + .template SetAsType( + src_data_idx_seq, src_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move src coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void RunScaleRead(const ScaleDesc& 
scale_desc, + const ScaleBuffer& scale_buf) + { + static_assert(ScaleBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + ScaleBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! ScaleBuffer and ScaleData data type are inconsistent"); + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto scale_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto scale_access_lengths = SliceLengths{} / scale_scalar_per_access; + + constexpr auto scale_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_scale_access_lengths = + container_reorder_given_new2old(scale_access_lengths, scale_dim_access_order); + + // make forward steps + const auto scale_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? scale_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(scale_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto scale_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-scale_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(scale_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_scale_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_scale_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_scale_access_lengths[j] + ordered_scale_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate scale data index + constexpr auto scale_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_scale_access_idx[i] + : ordered_scale_access_lengths[i] - 1 - + ordered_scale_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, scale_dim_access_order) * + scale_scalar_per_access; + }(); + + constexpr auto scale_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_scale_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(scale_desc, scale_coord_); + + using scale_vector_type = vector_type_maker_t; + using scale_vector_t = typename scale_vector_type::type; + + // copy data from scale_buf into scale_vector_container + auto scale_vector_container = scale_vector_type{ + scale_buf.template Get(scale_coord_.GetOffset(), is_scale_valid)}; + + // copy data from scale_vector_container into scale_thread_scratch_ + scale_thread_scratch_ + .template SetAsType( + scale_data_idx_seq, scale_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_scale_access_idx[i] < ordered_scale_access_lengths[i] - 1; + + 
static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_scale_access_idx[j] == ordered_scale_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move scale coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + scale_desc, scale_coord_, scale_forward_steps[scale_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + scale_desc, scale_coord_, scale_backward_steps[scale_dim_access_order[i]]); + } + } + }); + }); + + // don't need to move scale coordinate back to slice origin + /* + if constexpr(SrcResetCoordinateAfterRun) + { + const auto scale_reset_step = + make_tensor_coordinate_step(scale_desc, GetScaleCoordinateResetStep()); + + move_tensor_coordinate(scale_desc, scale_coord_, scale_reset_step); + } + */ + } + + template + __device__ void + TransferDataFromSrcThreadScratchToDstThreadScratch(Number thread_scratch_id) + { +#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = + type_convert(src_thread_scratch_tuple_[thread_scratch_id][idx]); + }); +#else + // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_ + // TODO make this logic more generic for more sub-dword datatype + if constexpr(SrcVectorDim != DstVectorDim && + ((is_same>::value && + is_same>::value && + SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) || + (is_same>::value && + is_same>::value && + SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0))) + { + // each transpose does + // DstScalarPerVector # of src vectors in src_thread_scratch_ + // SrcScalarPerVector # of dst vectors in dst_thread_scratch_ + constexpr index_t num_src_vector = Number{}; + constexpr index_t num_dst_vector = Number{}; + + // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose + // TODO: make this logic generic for all scenario + 
static_assert(SrcVectorDim != DstVectorDim, "wrong"); + + constexpr auto src_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access_for_src_and_dst_idle{}, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + + static_ford{}([&](auto access_idx) { + constexpr auto data_idx = access_idx * scalar_per_access; + + constexpr auto data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + using src_vector_t = vector_type_maker_t; + using dst_vector_t = vector_type_maker_t; + + // get DstScalarPerVector # of read-only references to src vectors from + // src_thread_scratch_ + const auto src_vector_refs = generate_tie( + [&](auto i) -> const src_vector_t& { + // i increment corresponds to movement in DstVectorDim + return src_thread_scratch_tuple_[thread_scratch_id].GetVectorTypeReference( + data_idx_seq + i * dst_scalar_step_in_vector); + }, + Number{}); + + // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_ + auto dst_vector_refs = generate_tie( + [&](auto i) -> dst_vector_t& { + // i increment corresponds to movement in SrcVectorDim + return dst_thread_scratch_.GetVectorTypeReference( + data_idx_seq + i * src_scalar_step_in_vector); + }, + Number{}); + + // do data transpose + transpose_vectors{}( + src_vector_refs, dst_vector_refs); + + // do fast numeric convert + src_converted_thread_scratch_.template SetAsType(access_idx, + fast_numeric_converter( + src_thread_scratch_tuple_[thread_scratch_id].template GetAsType(access_idx))); + }); + } + + static_ford{}([&](auto idx) { + // apply the src elementwise op and convert to DstData under the hood if needed + // Scale is dynamic, could not implement through element_op. 
+ DstData dst_v; + constexpr auto scale_idx = Sequence{}; + src_element_op_(dst_v, src_converted_thread_scratch_[idx] * scale_thread_scratch_[scale_idx]); + dst_thread_scratch_(idx) = dst_v; + }); +#endif + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + Number thread_scratch_id = Number{}) + { + // if there is transpose, it's done here + // TODO move this elsewhere + TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id); + + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + constexpr auto dst_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + using dst_vector_type = vector_type_maker_t; + using dst_vector_t = typename dst_vector_type::type; + + // copy data from dst_thread_scratch_ into dst_vector_container + auto dst_vector_container = dst_vector_type{ + dst_thread_scratch_.template GetAsType(dst_data_idx_seq)}; + + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + DstData dst_v; + + // apply DstElementwiseOperation + dst_element_op_(dst_v, dst_vector_container.template AsType()[i]); + + dst_vector_container.template AsType()(i) = dst_v; + }); + + // copy data from dst_vector_container to dst_buf + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + 
StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move dst coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src 
data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? 
+ const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + __device__ static constexpr auto GetSrcThreadScratchDescriptor() + { + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(src_access_lengths), Number{}); + + // 1st stage of transforms + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[i], + src_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(src_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); + } + + __device__ static constexpr auto GetScaleThreadScratchDescriptor() + { + + constexpr auto scale_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto scale_access_lengths = SliceLengths{} / scale_scalar_per_access; + + constexpr auto scale_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(scale_access_lengths), Number{}); + + // 1st stage of transforms + constexpr auto desc0 = + 
make_naive_tensor_descriptor_packed(scale_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(scale_access_lengths_and_vector_length[i], + scale_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(scale_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); + } + + __device__ static constexpr auto GetDstThreadScratchDescriptor() + { + // 1st stage of transforms + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(dst_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[i], + dst_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(dst_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto 
up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); + } + + private: + static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; + static constexpr auto scale_thread_scratch_desc_ = decltype(GetScaleThreadScratchDescriptor()){}; + static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; + +/* + template + struct ScaleThreadScratchDesc{}; +*/ + + // Registers, contain raw data loaded from global buffer + using SrcThreadScratch = StaticTensorTupleOfVectorBuffer; + + // Registers, contain fast converted data + using SrcThreadConvertedScratch = StaticTensorTupleOfVectorBuffer; + + // Registers, contain scale data + using ScaleThreadScratch = StaticTensorTupleOfVectorBuffer; + + // Registers, contain dequantized data + using DstThreadScratch = StaticTensorTupleOfVectorBuffer; + + using FastTypeConverter = tensor_operation::element_wise::FastNumericArrayConverter; + + StaticallyIndexedArray src_thread_scratch_tuple_; + SrcThreadConvertedScratch src_converted_thread_scratch_; + ScaleThreadScratch scale_thread_scratch_; + + DstThreadScratch dst_thread_scratch_; + FastTypeConverter fast_numeric_converter; + + SrcCoord src_coord_; + ScaleCoord scale_coord_; + DstCoord dst_coord_; + const SrcElementwiseOperation src_element_op_; + const ScaleElementwiseOperation scale_element_op_; + const DstElementwiseOperation dst_element_op_; +}; + +} // namespace ck From bf75259f5383f4bd42213746b83fbd59d21852ae Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 16 Aug 2023 07:19:34 +0000 Subject: [PATCH 106/118] New implementation of fp16Aint8B Gemm, Acheieve similar math throughput with native fp16 Gemm --- .../gpu/block/blockwise_fpAintB_gemm_wmma.hpp | 624 ------------------ ...oup_tensor_slice_transfer_v4r1_dequant.hpp | 67 +- .../device/impl/device_fpAintB_gemm_wmma.hpp | 1 - 
.../element/unary_element_wise_operation.hpp | 11 +- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 6 +- .../grid/gridwise_gemm_pipeline_selector.hpp | 5 - .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 219 ------ ...ise_tensor_slice_transfer_v3r1_dequant.hpp | 122 ++-- 8 files changed, 116 insertions(+), 939 deletions(-) delete mode 100644 include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp diff --git a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp deleted file mode 100644 index 84251a7506c..00000000000 --- a/include/ck/tensor_operation/gpu/block/blockwise_fpAintB_gemm_wmma.hpp +++ /dev/null @@ -1,624 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/common_header.hpp" -#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" -#include "ck/tensor_operation/gpu/warp/wmma_gemm.hpp" -#include "ck/tensor_description/tensor_adaptor.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" - -#define CK_MNK_LOOP - -namespace ck { - -template -/* Option: Read from LDS, big buffer hold all threads required data - * Source - * A: K0PerBlock x MPerBlock x K1 - * B: K0PerBlock x NPerBlock x K1 - * Destination - * C, non-transpose - * thread level: MRepeat x NRepeat x MAccVgprs - * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs - * KPACK == WMMA_K = 16 - * - * Option: Read from VMEM, small buffer hold each thread own required data (Skip LDS) - * Source: - * A(if skip LDS): MRepeat x KPack - * B(if skip LDS): NRepeat x KPack - * Destination - * C, non-transpose - * block level: MRepeat x MWave x MSubGroup x NRepeat x NWave x NThreadPerSubGroup x MAccVgprs - */ -struct Blockwise_fpAintB_GemmWMMA -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = 
Number<1>{}; - static constexpr auto I2 = Number<2>{}; - static constexpr auto I3 = Number<3>{}; - static constexpr auto I4 = Number<4>{}; - static constexpr auto I5 = Number<5>{}; - static constexpr auto WmmaK = Number<16>{}; - - using ThisThreadBlock = ThisThreadBlock; - - // Hardcode of WaveSize, since current HIP Runtime(5.4.0-10984) could not return correct one. - static constexpr index_t WaveSize = 32; - - // When use LDS, each Row(16 consecutive lanes) read whole data from source buffer - // When not use LDS, each Row read half of whole data from source buffer, exchange the data via - // permutation - static constexpr index_t A_KRow = AEnableLds ? 1 : 2; - static constexpr index_t B_KRow = BEnableLds ? 1 : 2; - static constexpr index_t A_K1 = ABlockDesc{}.GetLength(I5); - static constexpr index_t B_K1 = BBlockDesc{}.GetLength(I5); - - // As Float DataType - static constexpr auto wmma_gemm = - WmmaGemm{}; - - static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerWMMA); - static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerWMMA); - - StaticBufferTupleOfVector - c_thread_buf_; - - __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } - - __device__ static auto GetWaveIdx() - { - const index_t thread_id = ThisThreadBlock::GetThreadId(); - - constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), - make_tuple(Sequence<0, 1, 2>{}), - make_tuple(Sequence<0>{})); - - return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); - } - - // Default, Block buffer in LDS, thread level offset enabled - __device__ static auto CalculateAThreadOriginDataIndex() - { - if constexpr(AEnableLds) - { - const auto wave_idx = GetWaveIdx(); - const auto waveId_m = wave_idx[I0]; - const auto WMMA_a_idx = wmma_gemm.CalculateAThreadOriginDataIndex(); - - // |KRepeat |MRepeat|MWave |KRow |MLane |KPack - return 
make_tuple(0, 0, waveId_m, 0, WMMA_a_idx, 0); - } - else - { - return make_tuple(0, 0, 0, 0, 0, 0); - } - } - - __device__ static auto CalculateBThreadOriginDataIndex() - { - if constexpr(BEnableLds) - { - const auto wave_idx = GetWaveIdx(); - const auto waveId_n = wave_idx[I1]; - const auto WMMA_b_idx = wmma_gemm.CalculateBThreadOriginDataIndex(); - - // |KRepeat |NRepeat|Nwave |KRow |NLane |KPack - return make_tuple(0, 0, waveId_n, 0, WMMA_b_idx, 0); - } - else - { - return make_tuple(0, 0, 0, 0, 0, 0); - } - } - - template - __device__ static auto CalculateCThreadOriginDataIndex(Number, Number) - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk(); - - constexpr auto mrepeat_mwave_mperWMMA_to_m_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerWMMA))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - constexpr auto nrepeat_nwave_nperWMMA_to_n_adaptor = make_single_stage_tensor_adaptor( - make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}), - make_tuple(Sequence<0, 1, 2>{})); - - const index_t c_thread_m = mrepeat_mwave_mperWMMA_to_m_adaptor.CalculateBottomIndex( - make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; - const index_t c_thread_n = nrepeat_nwave_nperWMMA_to_n_adaptor.CalculateBottomIndex( - make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; - - return make_tuple(c_thread_m, c_thread_n); - } - - template - __device__ static auto CalculateCThreadOriginDataIndex7D(Number, Number) - { - const auto wave_idx = GetWaveIdx(); - - const auto waveId_m = wave_idx[I0]; - const auto waveId_n = wave_idx[I1]; - - const auto blk_idx = wmma_gemm.GetBeginOfThreadBlk3D(); - - return make_tuple( - Number{}, waveId_m, blk_idx[I0], Number{}, waveId_n, blk_idx[I1], blk_idx[I2]); - } - - using Tuple6 = 
decltype(CalculateAThreadOriginDataIndex()); - __host__ __device__ - Blockwise_fpAintB_GemmWMMA(Tuple6 a_origin = CalculateAThreadOriginDataIndex(), - Tuple6 b_origin = CalculateBThreadOriginDataIndex()) - : a_thread_copy_(a_origin), b_thread_copy_(b_origin), scale_thread_copy_(b_origin) - { - static_assert(ABlockDesc::IsKnownAtCompileTime() && BBlockDesc::IsKnownAtCompileTime(), - "wrong! Desc should be known at compile-time"); - - static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, - "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); - - static_assert(MPerBlock % (MPerWMMA * MRepeat) == 0 && - NPerBlock % (NPerWMMA * NRepeat) == 0, - "wrong!"); - } - - // transposed WMMA output C' = B' * A' - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto NAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - - return make_naive_tensor_descriptor_packed( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave - // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, I1, I1, Number{}, I1, I1, NAccVgprs)); - } - - // Thread level, register decriptor. 
Vector-write - __host__ __device__ static constexpr auto - GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens = - wmma_gemm.GetCMSubGroupNThreadPerSubGroupMAccVgprsThreadBlkLengths(); - - constexpr auto MAccVgprs = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I2]; - constexpr auto AccStride = c_msubgroup_nthreadpersubgroup_maccvgprs_tblk_lens[I3]; - return make_naive_tensor_descriptor( - // |MRepeat |MWave |MSubGroup |NRepeat |NWave - // |NThreadPerSubGroup |MAccVgprs - make_tuple(Number{}, I1, I1, Number{}, I1, I1, MAccVgprs), - make_tuple(Number{} * MAccVgprs * AccStride, - Number{} * MAccVgprs * AccStride, - Number{} * MAccVgprs * AccStride, - MAccVgprs * AccStride, - MAccVgprs * AccStride, - MAccVgprs * AccStride, - AccStride)); - } - - template - __host__ __device__ static constexpr auto - MakeCGridDescriptor_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - const CGridDesc_M_N& c_grid_desc_m_n) - { - const auto M = c_grid_desc_m_n.GetLength(I0); - const auto N = c_grid_desc_m_n.GetLength(I1); - - const auto c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma = - transform_tensor_descriptor( - c_grid_desc_m_n, - make_tuple( - make_unmerge_transform(make_tuple(M / (MWaves * MPerWMMA), MWaves, MPerWMMA)), - make_unmerge_transform(make_tuple(N / (NWaves * NPerWMMA), NWaves, NPerWMMA))), - make_tuple(Sequence<0>{}, Sequence<1>{}), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_grid_desc_mblockxrepeat_mwave_mperwmma_nblockxrepeat_nwave_nperwmma); - } - - // transposed WMMA output C' = B' * A' - __host__ __device__ static constexpr auto - GetCBlockDescriptor_MRepeat_MWave_MThreadPerSubGroup_NRepeat_NWave_NSubGroup_NAccVgprs() - { - constexpr auto 
c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MThreadPerSubGroup_NBlockxRepeat_NWave_NSubGroup_NAccVgprs( - c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); - } - - // Provide dimension size - __host__ __device__ static constexpr auto - GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs() - { - constexpr auto c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma = - make_naive_tensor_descriptor_packed(make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number{})); - - return wmma_gemm - .MakeCDesc_MBlockxRepeat_MWave_MSubGroup_NBlockxRepeat_NWave_NThreadPerSubGroup_MAccVgprs( - c_block_desc_mrepeat_mwave_mperwmma_nrepeat_nwave_nperwmma); - } - - // Describe how data allocated in thread copy src buffer - // M0_M1_M2 = MRepeat_MWave_MPerWmma, N0_N1_N2 = NRepeat_NWave_NPerWmma - static constexpr ABlockDesc a_block_desc_k0_m0_m1_m2_k1; - static constexpr BBlockDesc b_block_desc_k0_n0_n1_n2_k1; - static constexpr ScaleBlockDesc scale_block_desc_1_n0_n1_n2_1; - - template - __device__ void Run(const ABlockBuffer& a_block_buf, - const BBlockBuffer& b_block_buf, - const ScaleBlockBuffer& scale_block_buf, - CThreadBuffer& c_thread_buf) const - { - auto a_thread_buf = make_static_buffer( - a_thread_desc_.GetElementSpaceSize()); - auto b_thread_buf = make_static_buffer( - b_thread_desc_.GetElementSpaceSize()); - auto scale_thread_buf = make_static_buffer( - scale_thread_desc_.GetElementSpaceSize()); - // auto converted_b_thread_buf = make_static_buffer( - // b_thread_desc_.GetElementSpaceSize()); - tensor_operation::element_wise::FastNumericArrayConverter - fast_numeric_converter; - - // basic intrinsic to determine loopover direction - if constexpr( 0 ) - { - static_for<0, KPerBlock / WmmaK, 1>{}( - [&](auto k) { // 
k=0,1,2 instead of k=0,kpack*1, ... - static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run( - a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0, I0), - a_thread_buf); - - static_for<0, NRepeat, 1>{}([&](auto n0) { - // read B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - b_thread_buf); - // read weight scale - scale_thread_copy_.Run( - scale_block_desc_1_n0_n1_n2_1, - make_tuple(Number{}, n0, I0, I0, I0, I0), - scale_block_buf, - scale_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - scale_thread_buf); - - vector_type b_int_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - b_int_vec.template AsType()(i) = - b_thread_buf[Number{}]; - }); - - // convert B from uint8 to fp16, multiply scale - b_thread_vec = fast_numeric_converter(b_int_vec); - static_for<0, WmmaK, 1>{}([&](auto i) { - b_thread_vec.template AsType()(i) = - scale_thread_buf[n0] * - b_thread_vec.template AsType()(i); - }); - - vector_type a_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = - a_thread_buf[Number{}]; - }); - - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - }); - }); - }); - } - else - { - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of - // k=0,kpack*1, .. 
- static_for<0, NRepeat, 1>{}([&](auto n0) { - // read weight scale - scale_thread_copy_.Run(scale_block_desc_1_n0_n1_n2_1, - make_tuple(I0, n0, I0, I0, I0, I0), - scale_block_buf, - scale_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - scale_thread_buf); - - // read B - b_thread_copy_.Run( - b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0, I0), - b_block_buf, - b_thread_desc_, - make_tuple(I0, n0, I0, I0, I0, I0), - b_thread_buf); - - vector_type b_int_vec; - vector_type b_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - b_int_vec.template AsType()(i) = - b_thread_buf[Number{}]; - }); - - // convert B from uint8 to fp16, multiply scale - b_thread_vec = fast_numeric_converter(b_int_vec); - static_for<0, WmmaK, 1>{}([&](auto i) { - b_thread_vec.template AsType()(i) = - scale_thread_buf[n0] * b_thread_vec.template AsType()(i); - }); - - static_for<0, MRepeat, 1>{}([&](auto m0) { - // read A - a_thread_copy_.Run( - a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0, I0), - a_block_buf, - a_thread_desc_, - make_tuple(I0, m0, I0, I0, I0, I0), - a_thread_buf); - - vector_type a_thread_vec; - - static_for<0, WmmaK, 1>{}([&](auto i) { - a_thread_vec.template AsType()(i) = - a_thread_buf[Number{}]; - }); - - using wmma_input_type_a = typename vector_type::type; - using wmma_input_type_b = typename vector_type::type; - - constexpr index_t c_offset = - c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); - - wmma_gemm.template Run( - a_thread_vec.template AsType()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), - c_thread_buf.GetVectorTypeReference(Number{})); - }); - }); - }); - } - } - - protected: - static constexpr auto a_thread_desc_ = - make_naive_tensor_descriptor(make_tuple(Number{}, - Number{}, - I1, - Number{}, - I1, - Number{}), - make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number<1>{})); - - static constexpr auto b_thread_desc_ = - 
make_naive_tensor_descriptor(make_tuple(Number{}, - Number{}, - I1, - Number{}, - I1, - Number{}), - make_tuple(Number{}, - Number{}, - Number{}, - Number{}, - Number{}, - Number<1>{})); - - static constexpr auto scale_thread_desc_ = make_naive_tensor_descriptor( - make_tuple( - Number{}, Number{}, I1, Number{}, I1, I1), - make_tuple(I0, I1, I0, I0, I0, I0)); - - // C[M, N, NumRegWMMA] - static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( - make_tuple(Number{}, Number{}, wmma_gemm.GetRegSizePerWmma())); - - template - struct AThreadCopySelector; - - template <> - struct AThreadCopySelector - { - using type = - ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4, 5>, - 5, - A_K1, - A_K1>; - }; - - template <> - struct AThreadCopySelector - { - using type = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< - ADataType, - ADataType, - decltype(a_block_desc_k0_m0_m1_m2_k1), - decltype(a_thread_desc_), - tensor_operation::element_wise::PassThrough, - Sequence, - Sequence<0, 1, 2, 3, 4, 5>, - 5, - A_K1, - 0x76543210, - 0xfedcba98, - TransposeC ? false : true>; - }; - - template - struct BThreadCopySelector; - - template <> - struct BThreadCopySelector - { - using type = - ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4, 5>, - 5, - B_K1, - B_K1>; - }; - - template <> - struct BThreadCopySelector - { - using type = ThreadwiseTensorSliceTransfer_StaticToStatic_InterRow< - BDataType, - BDataType, - decltype(b_block_desc_k0_n0_n1_n2_k1), - decltype(b_thread_desc_), - tensor_operation::element_wise::PassThrough, - Sequence, - Sequence<0, 1, 2, 3, 4, 5>, - 5, - B_K1, - 0x76543210, - 0xfedcba98, - TransposeC ? 
true : false>; - }; - - template - struct ScaleThreadCopySelector; - - template <> - struct ScaleThreadCopySelector - { - using type = - ThreadwiseTensorSliceTransfer_v4, - Sequence<0, 1, 2, 3, 4, 5>, - 5, - 1, - 1>; - }; - - template <> - struct ScaleThreadCopySelector - { - using type = ThreadwiseTensorSliceTransfer_StaticToStatic< - ScaleDataType, - ScaleDataType, - decltype(scale_block_desc_1_n0_n1_n2_1), - decltype(scale_thread_desc_), - tensor_operation::element_wise::PassThrough, - Sequence, - Sequence<0, 1, 2, 3, 4, 5>, - 5, - 1>; - }; - - typename AThreadCopySelector::type a_thread_copy_; - typename BThreadCopySelector::type b_thread_copy_; - typename ScaleThreadCopySelector::type scale_thread_copy_; -}; - -} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp index 338b3a88923..ab826bb0416 100644 --- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_dequant.hpp @@ -17,7 +17,7 @@ namespace ck { * RunRead would load low-precision data and scale data. * RunWrite would process dequantization process. * Assume Scale is identical along K-dimension - * + * * This version does following things to avoid scratch memory issue * 1. Use StaticallyIndexedArray instead of C array for thread buffer * 2. 
ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor @@ -57,7 +57,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1_dequant static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{}; - static constexpr auto scale_thread_slice_lengths = BlockScaleSliceLengths{} / ThreadClusterLengths{}; + static constexpr auto scale_thread_slice_lengths = + BlockScaleSliceLengths{} / ThreadClusterLengths{}; using Index = MultiIndex; @@ -83,7 +84,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1_dequant { static_assert(nDim == remove_cvref_t::GetNumOfDimension() && - nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && nDim == remove_cvref_t::GetNumOfDimension() && nDim == ThreadClusterLengths::Size() && nDim == ThreadClusterArrangeOrder::Size() && @@ -91,8 +92,9 @@ struct ThreadGroupTensorSliceTransfer_v4r1_dequant "wrong! nDim not consistent"); static_assert( - is_same{} && - is_same{} , + is_same{} && + is_same{}, "wrong! 
threads should be mapped to cover entire slicing window"); static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), @@ -108,8 +110,8 @@ struct ThreadGroupTensorSliceTransfer_v4r1_dequant threadwise_transfer_.SetSrcSliceOrigin(src_desc, src_block_slice_origin + thread_data_idx_begin); - threadwise_transfer_.SetScaleSliceOrigin(scale_desc, - scale_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetScaleSliceOrigin( + scale_desc, scale_block_slice_origin + thread_data_idx_begin); threadwise_transfer_.SetDstSliceOrigin(dst_desc, dst_block_slice_origin + thread_data_idx_begin); } @@ -129,8 +131,7 @@ struct ThreadGroupTensorSliceTransfer_v4r1_dequant // With the assumption, scale scratch is always one template - __device__ void RunScaleRead(const ScaleDesc& scale_desc, - const ScaleBuffer& scale_buf) + __device__ void RunScaleRead(const ScaleDesc& scale_desc, const ScaleBuffer& scale_buf) { if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) @@ -191,30 +192,30 @@ struct ThreadGroupTensorSliceTransfer_v4r1_dequant using ThreadwiseTransfer = ThreadwiseTensorSliceTransfer_v3r1_dequant; + decltype(scale_thread_slice_lengths), + SrcElementwiseOperation, + ScaleElementwiseOperation, + DstElementwiseOperation, + DstInMemOp, + SrcData, + ScaleData, + DstData, + SrcDesc, + ScaleDesc, + DstDesc, + SrcDimAccessOrder, + DstDimAccessOrder, + SrcVectorDim, + DstVectorDim, + SrcScalarPerVector, + ScaleScalarPerVector, + DstScalarPerVector, + SrcScalarStrideInVector, + ScaleScalarStrideInVector, + DstScalarStrideInVector, + ThreadTransferSrcResetCoordinateAfterRun, + ThreadTransferDstResetCoordinateAfterRun, + NumThreadScratch>; ThreadwiseTransfer threadwise_transfer_; }; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 
31c39ed0187..494993f0685 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -677,7 +677,6 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB PipelineVersionToString{ {PipelineVersion::v1, "v1"}, {PipelineVersion::v2, "v2"}, - {PipelineVersion::dequant_v1, "dequant_v1"}, {PipelineVersion::weight_only, "weight_only"}}; // clang-format off diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index b59b30849fe..c33911325ae 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -404,18 +404,11 @@ struct FastNumericArrayConverter half_2[0] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_01); half_2[1] = __builtin_amdgcn_perm(fp16_adder, uint8_4, byte_selector_23); - // static constexpr ck::half_t fp16_subtract = -1152; - // Output.template AsType()(Number<0>{}) += fp16_subtract; - // Output.template AsType()(Number<1>{}) += fp16_subtract; - // Output.template AsType()(Number<2>{}) += fp16_subtract; - // Output.template AsType()(Number<3>{}) += fp16_subtract; - - // inline assembly get very poor performance as no chance to global scheduling static constexpr uint32_t I8s_TO_F16s_MAGIC_NUM = 0x64806480; - asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" + asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]" : "=v"(half_2[0]) : "v"(half_2[0]), "s"(I8s_TO_F16s_MAGIC_NUM)); - asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]\n" + asm volatile("v_pk_add_f16 %0, %1, %2 neg_lo:[0,1] neg_hi:[0,1]" : "=v"(half_2[1]) : "v"(half_2[1]), "s"(I8s_TO_F16s_MAGIC_NUM)); diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp 
b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index da618c0c230..69351682e2a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -544,9 +544,9 @@ struct GridwiseFpAintBGemm_Wmma { // LDS allocation for A and Dequantized B: be careful of DataType // scale would not put into LDS. - using LDS_ADataType = ADataType; - using LDS_BDataType = ADataType; - using LDS_CDataType = CShuffleDataType; + using LDS_ADataType = ADataType; + using LDS_BDataType = ADataType; + using LDS_CDataType = CShuffleDataType; static constexpr auto max_lds_align = K1; static constexpr auto a_block_space_size_aligned = diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index d85d4597c51..4c11fc0f317 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -12,7 +12,6 @@ enum struct PipelineVersion { v1, v2, - dequant_v1, weight_only, }; @@ -38,10 +37,6 @@ constexpr auto GridwiseGemmPipeline_Selector() { return GridwiseGemmPipeline_v2{}; } - else if constexpr(PipelineVer == PipelineVersion::dequant_v1) - { - return GridwiseGemmPipeline_v1_dequant{}; - } else if constexpr(PipelineVer == PipelineVersion::weight_only) { return GridwiseGemmPipeline_v1_WeightOnly{}; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp index 1b281674591..e48dac49c80 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -550,225 +550,6 @@ struct GridwiseGemmPipeline_v1<1, false, false> } }; -template -struct GridwiseGemmPipeline_v1_dequant; - -template <> -struct 
GridwiseGemmPipeline_v1_dequant<1, true, true> -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } - - __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) - { - return num_loop > 1; - } - - template - __device__ static void Run(const AGridDesc& a_grid_desc, - const ABlockDesc& a_block_desc, - ABlockTransfer& a_blockwise_copy, - const AGridBuffer& a_grid_buf, - ABlockBuffer& a_block_buf, - const ABlockTransferStep& a_block_copy_step, - const BGridDesc& b_grid_desc, - const BBlockDesc& b_block_desc, - BBlockTransfer& b_blockwise_copy, - const BGridBuffer& b_grid_buf, - BBlockBuffer& b_block_buf, - const BBlockTransferStep& b_block_copy_step, - const ScaleGridDesc& scale_grid_desc, - const ScaleBlockDesc& scale_block_desc, - ScaleBlockTransfer& scale_blockwise_copy, - const ScaleGridBuffer& scale_grid_buf, - ScaleBlockBuffer& scale_block_buf, - const BlockwiseGemm& blockwise_gemm, - CThreadBuffer& c_thread_buf, - index_t num_loop) - { - // preload data into LDS - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); - b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); - scale_blockwise_copy.RunRead(scale_grid_desc, scale_grid_buf); - - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - - // Initialize C - c_thread_buf.Clear(); - - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - scale_blockwise_copy.RunWrite(scale_block_desc, scale_block_buf); - - // main body - if constexpr(HasMainLoop) - { - index_t i = 0; - - do - { - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); - - block_sync_lds(); - - b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); - - blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); - - block_sync_lds(); - - 
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); - b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); - - ++i; - } while(i < (num_loop - 1)); - } - - // tail - { - block_sync_lds(); - - blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); - } - } -}; - -template <> -struct GridwiseGemmPipeline_v1_dequant<1, true, false> -{ - static constexpr auto I0 = Number<0>{}; - static constexpr auto I1 = Number<1>{}; - - __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } - - __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) - { - return num_loop > 1; - } - - template - __device__ static void Run(const AGridDesc& a_grid_desc, - const ABlockDesc& a_block_desc, - ABlockTransfer& a_blockwise_copy, - const AGridBuffer& a_grid_buf, - ABlockBuffer& a_block_buf, - const ABlockTransferStep& a_block_copy_step, - const BGridDesc& b_grid_desc, - const BBlockDesc& b_block_desc, - BBlockTransfer& b_blockwise_copy, - const BGridBuffer& b_grid_buf, - BBlockBuffer& b_block_buf, - const BBlockTransferStep& b_block_copy_step, - const ScaleGridDesc& scale_grid_desc, - const ScaleBlockDesc& scale_block_desc, - ScaleBlockTransfer& scale_blockwise_copy, - const ScaleGridBuffer& scale_grid_buf, - ScaleBlockBuffer& scale_block_buf, - const BlockwiseGemm& blockwise_gemm, - CThreadBuffer& c_thread_buf, - index_t num_loop) - { - constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0, I0, I0); - auto b_block_buf_switch = b_block_buf; - - // preload data into LDS - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); - b_blockwise_copy.Run( - b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf); - scale_blockwise_copy.Run( - scale_grid_desc, scale_grid_buf, scale_block_desc, b_block_origin_idx, scale_block_buf); - - 
a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - - // Initialize C - c_thread_buf.Clear(); - - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); - - // main body - if constexpr(HasMainLoop) - { - index_t i = 0; - - do - { - b_blockwise_copy.Run( - b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_block_buf_switch); - - block_sync_lds(); - - a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); - - blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); - - block_sync_lds(); - - a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); - b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); - - a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); - - b_block_buf = b_block_buf_switch; - ++i; - } while(i < (num_loop - 1)); - } - - // tail - { - block_sync_lds(); - - blockwise_gemm.Run(a_block_buf, b_block_buf, scale_block_buf, c_thread_buf); - - block_sync_lds(); - } - } -}; - template struct GridwiseGemmPipeline_v1_WeightOnly; diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp index 2bb96845829..174b82f8700 100644 --- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_dequant.hpp @@ -84,9 +84,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant static constexpr index_t nDim = SliceLengths::Size(); using Index = MultiIndex; - using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); using ScaleCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); - using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + using DstCoord = 
decltype(make_tensor_coordinate(DstDesc{}, Index{})); static constexpr auto I0 = Number<0>{}; @@ -114,7 +114,8 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); } - __device__ void SetScaleSliceOrigin(const ScaleDesc& scale_desc, const Index& scale_slice_origin_idx) + __device__ void SetScaleSliceOrigin(const ScaleDesc& scale_desc, + const Index& scale_slice_origin_idx) { scale_coord_ = make_tensor_coordinate(scale_desc, scale_slice_origin_idx); } @@ -274,8 +275,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant } template - __device__ void RunScaleRead(const ScaleDesc& scale_desc, - const ScaleBuffer& scale_buf) + __device__ void RunScaleRead(const ScaleDesc& scale_desc, const ScaleBuffer& scale_buf) { static_assert(ScaleBuffer::GetAddressSpace() == AddressSpaceEnum::Global or ScaleBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, @@ -358,11 +358,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant scale_scalar_per_access; }(); - constexpr auto scale_data_idx_seq = generate_sequence_v2( - [&](auto i) { return Number{}; }, Number{}); + constexpr auto scale_data_idx_seq = + generate_sequence_v2([&](auto i) { return Number{}; }, + Number{}); - const bool is_scale_valid = - coordinate_has_valid_offset_assuming_visible_index_is_valid(scale_desc, scale_coord_); + const bool is_scale_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( + scale_desc, scale_coord_); using scale_vector_type = vector_type_maker_t; using scale_vector_t = typename scale_vector_type::type; @@ -372,16 +373,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant scale_buf.template Get(scale_coord_.GetOffset(), is_scale_valid)}; // copy data from scale_vector_container into scale_thread_scratch_ - scale_thread_scratch_ - .template SetAsType( - scale_data_idx_seq, scale_vector_container.template AsType()[I0]); + scale_thread_scratch_.template SetAsType( + scale_data_idx_seq, 
scale_vector_container.template AsType()[I0]); constexpr auto move_on_dim = [&]() constexpr { StaticallyIndexedArray move_on_dim_; static_for<0, nDim, 1>{}([&](auto i) { - move_on_dim_(i) = ordered_scale_access_idx[i] < ordered_scale_access_lengths[i] - 1; + move_on_dim_(i) = + ordered_scale_access_idx[i] < ordered_scale_access_lengths[i] - 1; static_for{}([&](auto j) { move_on_dim_(i) &= @@ -399,13 +400,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant { if constexpr(forward_sweep[i]) { - move_tensor_coordinate( - scale_desc, scale_coord_, scale_forward_steps[scale_dim_access_order[i]]); + move_tensor_coordinate(scale_desc, + scale_coord_, + scale_forward_steps[scale_dim_access_order[i]]); } else { - move_tensor_coordinate( - scale_desc, scale_coord_, scale_backward_steps[scale_dim_access_order[i]]); + move_tensor_coordinate(scale_desc, + scale_coord_, + scale_backward_steps[scale_dim_access_order[i]]); } } }); @@ -462,9 +465,9 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant constexpr auto scalar_per_access = generate_sequence( detail::lambda_scalar_per_access_for_src_and_dst_idle{}, + SrcScalarPerVector, + DstVectorDim, + DstScalarPerVector>{}, Number{}); constexpr auto access_lengths = SliceLengths{} / scalar_per_access; @@ -500,20 +503,46 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant // do data transpose transpose_vectors{}( src_vector_refs, dst_vector_refs); - - // do fast numeric convert - src_converted_thread_scratch_.template SetAsType(access_idx, - fast_numeric_converter( - src_thread_scratch_tuple_[thread_scratch_id].template GetAsType(access_idx))); }); } + // Do fast numeric convert + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access_for_src_and_dst_idle{}, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + + using src_vector_type = vector_type_maker_t; + using src_vector_t = typename src_vector_type::type; + + using src_converted_vector_type = 
vector_type_maker_t; + using src_converted_vector_t = typename src_converted_vector_type::type; + // Vector-wise type convert + static_ford{}([&](auto access_idx) { + auto src_vector_container = src_vector_type{ + src_thread_scratch_tuple_[thread_scratch_id].template GetAsType( + access_idx)}; + + auto src_converted_vector_container = + src_converted_vector_type{fast_numeric_converter(src_vector_container)}; + + src_converted_thread_scratch_.template SetAsType( + access_idx, + src_converted_vector_container.template AsType()[I0]); + }); + + // Element-scale operation, expect packed multiplication static_ford{}([&](auto idx) { - // apply the src elementwise op and convert to DstData under the hood if needed - // Scale is dynamic, could not implement through element_op. DstData dst_v; constexpr auto scale_idx = Sequence{}; - src_element_op_(dst_v, src_converted_thread_scratch_[idx] * scale_thread_scratch_[scale_idx]); + // printf("Tid: %03d, scale: %04x\n", get_thread_local_1d_id(), + // *(reinterpret_cast(&scale_thread_scratch_[scale_idx]))); + src_element_op_(dst_v, + src_converted_thread_scratch_[idx] * scale_thread_scratch_[scale_idx]); dst_thread_scratch_(idx) = dst_v; }); #endif @@ -978,13 +1007,14 @@ struct ThreadwiseTensorSliceTransfer_v3r1_dequant private: static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; - static constexpr auto scale_thread_scratch_desc_ = decltype(GetScaleThreadScratchDescriptor()){}; + static constexpr auto scale_thread_scratch_desc_ = + decltype(GetScaleThreadScratchDescriptor()){}; static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; -/* - template - struct ScaleThreadScratchDesc{}; -*/ + /* + template + struct ScaleThreadScratchDesc{}; + */ // Registers, contain raw data loaded from global buffer using SrcThreadScratch = StaticTensorTupleOfVectorBuffer; - + // Registers, contain fast converted data - using SrcThreadConvertedScratch = 
StaticTensorTupleOfVectorBuffer; + using SrcThreadConvertedScratch = + StaticTensorTupleOfVectorBuffer; // Registers, contain scale data using ScaleThreadScratch = StaticTensorTupleOfVectorBuffer; + ScaleData, + ScaleScalarPerVector, + decltype(scale_thread_scratch_desc_), + true>; // Registers, contain dequantized data using DstThreadScratch = StaticTensorTupleOfVectorBuffer; - - using FastTypeConverter = tensor_operation::element_wise::FastNumericArrayConverter; + + using FastTypeConverter = tensor_operation::element_wise:: + FastNumericArrayConverter; StaticallyIndexedArray src_thread_scratch_tuple_; SrcThreadConvertedScratch src_converted_thread_scratch_; From 809d7dfb66d1beeae0e37bd4a89eb7c2c28c5148 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Sat, 24 Feb 2024 13:58:10 +0000 Subject: [PATCH 107/118] format --- .../gemm_bilinear_wmma_fp16.cpp | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp index 36680ba0ba8..d1b820da7bb 100644 --- a/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp +++ b/example/02_gemm_bilinear/gemm_bilinear_wmma_fp16.cpp @@ -65,49 +65,49 @@ using CDEElementOp = AlphaBetaAdd; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; -using DeviceOpInstance = - ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffle, - ELayout, - ADataType, - BDataType, - AccDataType, - CShuffleDataType, - ck::Tuple, - EDataType, - AElementOp, - BElementOp, - CDEElementOp, - GemmSpec, - 1, - 128, - 64, - 64, - 64, - 4, - 16, - 16, - 1, - 4, - S<4, 32, 1>, - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 4, - 4, - true, - S<4, 32, 1>, - S<1, 0, 2>, - S<1, 0, 2>, - 2, - 4, - 4, - true, - 1, - 1, - S<1, 64, 1, 2>, - 8>; +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffle< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + 
AccDataType, + CShuffleDataType, + ck::Tuple, + EDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmSpec, + 2, // Prefetch stage + 128, // BlockSize + 128, // MPerBlock + 64, // NPerBlock + 64, // KPerBlock + 8, // K1 + 16, // MPerWmma + 16, // NPerWmma + 4, // M-Repeat // M-PerWmma / M-Repeat = M-Wave + 2, // N-Repeat // N-PerWmma / N-Repeat = N-Wave + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 32, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + 1, // C shuffle (M Repeat) Per store + 1, // C shuffle (N Repeat) Per store + S<1, 32, 1, 4>, + 8>; int main(int argc, char* argv[]) { From 18d5297b1173174e2a3d2e05af5bd819068b3e32 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Mon, 26 Feb 2024 13:16:51 +0000 Subject: [PATCH 108/118] Todo: fix gemm_bilinear_wmma instances compilation bug --- .../CMakeLists.txt | 7 - .../CMakeLists.txt | 0 .../common.hpp | 0 .../fp16int8_gemm_wmma.cpp | 0 .../run_gemm_example.inc | 21 +- ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 6 +- ..._grouped_conv_bwd_weight_wmma_cshuffle.hpp | 6 +- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 10 +- .../element/unary_element_wise_operation.hpp | 6 +- ...atched_gemm_softmax_gemm_wmma_cshuffle.hpp | 5 +- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 5 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 87 +++++++ .../grid/gridwise_gemm_pipeline_selector.hpp | 1 + include/ck/utility/amd_buffer_addressing.hpp | 111 +-------- include/ck/utility/data_type.hpp | 213 ------------------ .../device_grouped_conv_fwd_wmma_instance.hpp | 100 ++++---- .../gpu/gemm/CMakeLists.txt | 6 + .../gpu/gemm_bilinear/CMakeLists.txt | 8 +- ...uffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp | 48 ++-- ...uffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp | 48 ++-- ...uffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp | 48 ++-- ...uffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp | 80 +++---- .../grouped_conv2d_bwd_data/CMakeLists.txt | 16 +- .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 32 +-- 
.../grouped_conv3d_bwd_weight/CMakeLists.txt | 18 +- .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 32 +-- script/clang-format-overwrite.sh | 5 - 27 files changed, 333 insertions(+), 586 deletions(-) rename example/{49_fpAintB_gemm => 64_fpAintB_gemm}/CMakeLists.txt (100%) rename example/{49_fpAintB_gemm => 64_fpAintB_gemm}/common.hpp (100%) rename example/{49_fpAintB_gemm => 64_fpAintB_gemm}/fp16int8_gemm_wmma.cpp (100%) rename example/{49_fpAintB_gemm => 64_fpAintB_gemm}/run_gemm_example.inc (87%) diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 3938ddab48d..5e091f56471 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -1,10 +1,3 @@ -add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp) -add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp) -add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) -add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp) -add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) -add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) -add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 
batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) diff --git a/example/49_fpAintB_gemm/CMakeLists.txt b/example/64_fpAintB_gemm/CMakeLists.txt similarity index 100% rename from example/49_fpAintB_gemm/CMakeLists.txt rename to example/64_fpAintB_gemm/CMakeLists.txt diff --git a/example/49_fpAintB_gemm/common.hpp b/example/64_fpAintB_gemm/common.hpp similarity index 100% rename from example/49_fpAintB_gemm/common.hpp rename to example/64_fpAintB_gemm/common.hpp diff --git a/example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp b/example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp similarity index 100% rename from example/49_fpAintB_gemm/fp16int8_gemm_wmma.cpp rename to example/64_fpAintB_gemm/fp16int8_gemm_wmma.cpp diff --git a/example/49_fpAintB_gemm/run_gemm_example.inc b/example/64_fpAintB_gemm/run_gemm_example.inc similarity index 87% rename from example/49_fpAintB_gemm/run_gemm_example.inc rename to example/64_fpAintB_gemm/run_gemm_example.inc index 87c8d6a70a1..dc2bdc18f01 100644 --- a/example/49_fpAintB_gemm/run_gemm_example.inc +++ b/example/64_fpAintB_gemm/run_gemm_example.inc @@ -34,30 +34,15 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) { case 0: break; case 1: - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(quant_b_k_n); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(scale_k_n); + ck::utils::FillUniformDistributionIntegerValue{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-1.f, 1.f}(quant_b_k_n); + ck::utils::FillUniformDistributionIntegerValue{-1.f, 1.f}(scale_k_n); break; case 2: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); ck::utils::FillUniformDistribution{-1.f, 1.f}(quant_b_k_n); ck::utils::FillUniformDistribution{-1.f, 1.f}(scale_k_n); break; - case 3: 
- ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(quant_b_k_n); - ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(scale_k_n); - break; - case 4: - ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{1.f, 1.f}(quant_b_k_n); - ck::utils::FillUniformDistributionIntegerValue{2.f, 2.f}(scale_k_n); - break; - case 5: - ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(a_m_k); - ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(quant_b_k_n); - ck::utils::FillUniformDistributionIntegerValue{-2.f, 2.f}(scale_k_n); - break; default: ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); ck::utils::FillUniformDistribution{-1.f, 1.f}(quant_b_k_n); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index 4381fe9159c..b0e0e6da761 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -217,7 +217,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle // Tiling Family MPerBlock, NPerBlock, - K0PerBlock, + KPerBlock, MPerWMMA, NPerWMMA, K1, @@ -231,8 +231,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, - true, false, + true, ABlockLdsExtraM, BBlockTransferThreadClusterLengths_BK0_N_BK1, BBlockTransferThreadClusterArrangeOrder, @@ -240,8 +240,8 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_BK1, - true, false, + true, BBlockLdsExtraN, CShuffleMRepeatPerShuffle, 
CShuffleNRepeatPerShuffle, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp index d292e29d29c..e440eb82a4c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp @@ -416,7 +416,7 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle // Tiling Family MPerBlock, NPerBlock, - K0PerBlock, + KPerBlock, MPerWMMA, NPerWMMA, K1, @@ -430,8 +430,8 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, - true, false, + true, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, @@ -439,8 +439,8 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, - true, false, + true, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp index d46a1a199f8..d70d462e24e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp @@ -279,15 +279,11 @@ struct DeviceGroupedConvFwdMultipleD_Wmma_CShuffle } // desc for problem definition - using AGridDesc_M_K = remove_cvref_t( - {}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; - using BGridDesc_N_K = remove_cvref_t({}, {}))>; - using DsGridDesc_M_N = remove_cvref_t; - using EGridDesc_M_N = remove_cvref_t({}, {}))>; - using AGridDesc = 
decltype(DeviceOp::MakeAGridDescriptor({}, {}, {}, {}, {}, {}, {}, {}, {}, {})); - using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({}, {})); + using BGridDesc = decltype(DeviceOp::MakeBGridDescriptor({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = remove_cvref_t({}, {}))>; // GridwiseOp using GridwiseOp = GridwiseGemmMultipleD_Wmma< diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp index 34381c7dbaf..c6d933893e3 100644 --- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -121,6 +121,9 @@ struct PassThrough __host__ __device__ void operator()(bhalf_t& y, const int8_t& x) const { y = type_convert(x); + } + + template <> __host__ __device__ void operator()(uint8_t& y, const uint8_t& x) const { y = x; @@ -738,5 +741,4 @@ struct FastNumericArrayConverter } // namespace element_wise } // namespace tensor_operation -} -}// namespace ck +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp index ef7f91ab8d5..16717ff8197 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_wmma_cshuffle.hpp @@ -651,8 +651,9 @@ struct GridwiseBatchedGemmSoftmaxGemm_Wmma c_grid_desc_m_n); } - using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = + remove_cvref_t; using DefaultBlock2CTileMap = remove_cvref_t; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index 
69351682e2a..18c996844f1 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -535,8 +535,9 @@ struct GridwiseFpAintBGemm_Wmma c_grid_desc_m_n); } - using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = + remove_cvref_t; using DefaultBlock2CTileMap = remove_cvref_t; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index b9c85ccbefb..e01d835fb6a 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -596,6 +596,93 @@ struct GridwiseGemmMultipleD_Wmma Number{}); } + // CheckValidity for kernels without multi D + template + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, + const BGridDesc& b_grid_desc, + const EGridDesc_M_N& e_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! 
K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerWmma)) == 0, + "Invalid tuning param!"); + + const auto GetAProblemsizeMK = [&]() { + if constexpr(AEnableLds) + { + return make_tuple(a_grid_desc.GetLength(I1), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(a_grid_desc.GetLength(I1) * a_grid_desc.GetLength(I2) * + a_grid_desc.GetLength(I5), + a_grid_desc.GetLength(I0) * a_grid_desc.GetLength(I3) * + a_grid_desc.GetLength(I4) * a_grid_desc.GetLength(I6)); + } + }; + + const auto GetBProblemsizeNK = [&]() { + if constexpr(BEnableLds) + { + return make_tuple(b_grid_desc.GetLength(I1), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I2)); + } + else + { + return make_tuple(b_grid_desc.GetLength(I1) * b_grid_desc.GetLength(I2) * + b_grid_desc.GetLength(I5), + b_grid_desc.GetLength(I0) * b_grid_desc.GetLength(I3) * + b_grid_desc.GetLength(I4) * b_grid_desc.GetLength(I6)); + } + }; + + const auto M = GetAProblemsizeMK()[I0]; + const auto N = GetBProblemsizeNK()[I0]; + const auto K = GetAProblemsizeMK()[I1]; + + if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1) && + K == GetBProblemsizeNK()[I1])) + { + printf("GridwiseOp: ABE descriptor dimension cross check failure\n"); + return false; + } + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + { + printf("GridwiseOp: Problemsize descriptor dimension check failure\n"); + return false; + } + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(e_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + if(!(a_grid_desc.GetElementSpaceSize() * sizeof(ADataType) <= TwoGB && + 
b_grid_desc.GetElementSpaceSize() * sizeof(BDataType) <= TwoGB && + e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB)) + { + return false; + } + + return true; + } + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} template __host__ __device__ static constexpr bool CheckValidity(const AGridDesc& a_grid_desc, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp index b38ca80d319..567c42362c9 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -45,6 +45,7 @@ constexpr auto GridwiseGemmPipeline_Selector() else if constexpr(PipelineVer == PipelineVersion::v4) { return GridwiseGemmPipeline_v4{}; + } else if constexpr(PipelineVer == PipelineVersion::weight_only) { return GridwiseGemmPipeline_v1_WeightOnly{}; diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp index f9072a6e729..678c55b95f4 100644 --- a/include/ck/utility/amd_buffer_addressing.hpp +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -417,7 +417,8 @@ __device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_w (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || - (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), "wrong! 
not implemented"); using r_t = typename vector_type::type; @@ -521,114 +522,6 @@ amd_buffer_store_impl_raw(const typename vector_type::type src_thread dst_wave_addr_offset + sizeof(int32_t) * 12, static_cast(coherence)); } - else if constexpr(is_same::value) - { - if constexpr(N == 1) - { - return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - } - else if constexpr(N == 2) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - return llvm_amdgcn_raw_buffer_load_i8x2(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); -#else - int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); -#endif - } - else if constexpr(N == 4) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - return llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); -#else - int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); -#endif - } - else if constexpr(N == 8) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - vector_type tmp; - - tmp.AsType()(Number<0>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int8_t), - static_cast(coherence)); - - return tmp.AsType()(Number<0>{}); -#else - int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return 
bit_cast(tmp); -#endif - } - else if constexpr(N == 16) - { -#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE - vector_type tmp; - - tmp.AsType()(Number<0>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - tmp.AsType()(Number<1>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 4 * sizeof(int8_t), - static_cast(coherence)); - - tmp.AsType()(Number<2>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 8 * sizeof(int8_t), - static_cast(coherence)); - - tmp.AsType()(Number<3>{}) = - llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset + 12 * sizeof(int8_t), - static_cast(coherence)); - - return tmp.AsType()(Number<0>{}); -#else - int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, - src_thread_addr_offset, - src_wave_addr_offset, - static_cast(coherence)); - - return bit_cast(tmp); -#endif - } - } } template ::type; using uint8x32_t = typename vector_type::type; using uint8x64_t = typename vector_type::type; -// Convert X to Y -template -__host__ __device__ constexpr Y type_convert(X x) -{ - static_assert(!std::is_reference_v && !std::is_reference_v); - - return static_cast(x); -} - -// convert bfp16 to fp32 -template <> -inline __host__ __device__ constexpr float type_convert(bhalf_t x) -{ - union - { - uint32_t int32; - float fp32; - } u = {uint32_t(x) << 16}; - - return u.fp32; -} - -// Convert X to Y -template -__host__ __device__ constexpr Y type_convert_sp(X x) -{ - static_assert(!std::is_reference_v && !std::is_reference_v); - - return static_cast(x); -} - -template <> -inline __host__ __device__ constexpr int type_convert_sp(float x) -{ - union - { - float fp32; - int int32; - } u = {x}; - - return u.int32; -} - -template <> -inline __host__ 
__device__ constexpr float type_convert_sp(int x) -{ - union - { - int int32; - float fp32; - } u = {x}; - - return u.fp32; -} - -template <> -inline __host__ __device__ constexpr int type_convert_sp(half_t x) -{ - union - { - half_t fp16; - int int32; - } u = {x}; - - return u.int32; -} - -template <> -inline __host__ __device__ constexpr half_t type_convert_sp(int x) -{ - union - { - int int32; - half_t fp16; - } u = {x}; - - return u.fp16; -} - -// convert fp32 to bfp16 -template <> -inline __host__ __device__ constexpr bhalf_t type_convert(float x) -{ - union - { - float fp32; - uint32_t int32; - } u = {x}; - - return uint16_t(u.int32 >> 16); -} - -// convert bfp16 to fp16 via fp32 -template <> -inline __host__ __device__ constexpr half_t type_convert(bhalf_t x) -{ - float x_fp32 = type_convert(x); - - return static_cast(x_fp32); -} - -// convert fp16 to bfp16 via fp32 -template <> -inline __host__ __device__ constexpr bhalf_t type_convert(half_t x) -{ - float x_fp32 = static_cast(x); - - return type_convert(x_fp32); -} - -// convert bfp16 to int32 via fp32 -template <> -inline __host__ __device__ constexpr int32_t type_convert(bhalf_t x) -{ - float x_fp32 = type_convert(x); - - return static_cast(x_fp32); -} - -// convert int32 to bfp16 via fp32 -template <> -inline __host__ __device__ constexpr bhalf_t type_convert(int32_t x) -{ - float x_fp32 = static_cast(x); - - return type_convert(x_fp32); -} - -// convert bfp16 to int8 via fp32 -template <> -inline __host__ __device__ constexpr int8_t type_convert(bhalf_t x) -{ - float x_fp32 = type_convert(x); - - return static_cast(x_fp32); -} - -// convert int8 to bfp16 via fp32 -template <> -inline __host__ __device__ constexpr bhalf_t type_convert(int8_t x) -{ - float x_fp32 = static_cast(x); - - return type_convert(x_fp32); -} - -// convert int8 to fp16 via fp32 -template <> -inline __host__ __device__ constexpr half_t type_convert(int8_t x) -{ - // TODO: replace it with fast_converter - float x_fp32 = 
static_cast(x); - - return type_convert(x_fp32); -} - -// Declare a template function for bf16 conversion using RTN -template -__host__ __device__ constexpr Y bf16_convert_rtn(X x); - -// Convert fp32 to bf16 with RTN if higher precision is needed -template <> -inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn(float x) -{ - union - { - float fp32; - uint32_t int32; - } u = {x}; - - // When the exponent bits are not all 1s, then the value is zero, normal, - // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus - // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). - // This causes the bfloat16's mantissa to be incremented by 1 if the 16 - // least significant bits of the float mantissa are greater than 0x8000, - // or if they are equal to 0x8000 and the least significant bit of the - // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when - // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already - // has the value 0x7f, then incrementing it causes it to become 0x00 and - // the exponent is incremented by one, which is the next higher FP value - // to the unrounded bfloat16 value. When the bfloat16 value is subnormal - // with an exponent of 0x00 and a mantissa of 0x7f, it may be rounded up - // to a normal value with an exponent of 0x01 and a mantissa of 0x00. - // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, - // incrementing it causes it to become an exponent of 0xFF and a mantissa - // of 0x00, which is Inf, the next higher value to the unrounded value. - bool flag0 = ~u.int32 & 0x7f800000; - - // When all of the exponent bits are 1, the value is Inf or NaN. - // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero - // mantissa bit. Quiet NaN is indicated by the most significant mantissa - // bit being 1. Signaling NaN is indicated by the most significant - // mantissa bit being 0 but some other bit(s) being 1. 
If any of the - // lower 16 bits of the mantissa are 1, we set the least significant bit - // of the bfloat16 mantissa, in order to preserve signaling NaN in case - // the bfloat16's mantissa bits are all 0. - bool flag1 = !flag0 && (u.int32 & 0xffff); - - u.int32 += flag0 ? 0x7fff + ((u.int32 >> 16) & 1) : 0; // Round to nearest, round to even - u.int32 |= flag1 ? 0x10000 : 0x0; // Preserve signaling NaN - - return uint16_t(u.int32 >> 16); -} - -// convert fp16 to bfp16 via fp32 with RTN if higher precision is needed -template <> -inline __host__ __device__ constexpr bhalf_t bf16_convert_rtn(half_t x) -{ - float x_fp32 = static_cast(x); - - return bf16_convert_rtn(x_fp32); -} - template struct NumericLimits { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp index f925397832b..4ea23ea1f96 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_wmma_instance.hpp @@ -54,36 +54,36 @@ template using device_grouped_conv_fwd_wmma_f16_instances = std::tuple< // clang-format off - //########################################| NumDim| A| B| Ds| E| AData| BData| Ds| EData| AccData| CShuffle| A| B| CDE| ConvForward| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| DataType| Type| Type| DataType| Elementwise| Elementwise| 
Elementwise| Specialization| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| Prefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // generic instance - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 4>, 1>, // blocksize=256 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, // blocksize=128 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 
32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, // blocksize=64 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 8>, // blocksize=32 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, - 
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8> + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8> // clang-format on >; @@ -97,36 +97,36 @@ template using device_grouped_conv_fwd_wmma_i8_instances = std::tuple< // clang-format off - //########################################| NumDim| A| B| Ds| E| AData| BData| Ds| EData| AccData| CShuffle| A| B| CDE| ConvForward| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| DataType| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| 
MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| Prefetch| Block| MPer| NPer| KPer| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | //generic instance - 
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 16, 1, 1, 1, S<1, 32, 1, 4>, 1>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 16, 1, 1, 1, S<1, 32, 1, 4>, 1>, // blocksize=256 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 8>, 8>, // blocksize=128 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - 
DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 
2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, // blocksize=64 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, // blocksize=32 - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 
16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8> + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Wmma_CShuffle, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 3d243e3d562..e9cc1e854fe 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -111,6 +111,12 @@ list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_kn_mn_instance.cpp device_gemm_xdl_c_shuffle_fp8_fp8_fp8_km_nk_mn_instance.cpp) +list(APPEND GEMM_INSTANCES + device_gemm_wmma_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_wmma_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_wmma_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_wmma_f16_f16_f16_km_nk_mn_instance.cpp) + add_instance_library(device_gemm_instance ${GEMM_INSTANCES}) set(ENABLE_PIPELINE_V2_OPT) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt index 2d1a3447bdf..426edeed748 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt 
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt @@ -3,8 +3,8 @@ add_instance_library(device_gemm_bilinear_instance device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp - #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp - #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp - #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp - #device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp + device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp + device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp + device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp + device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp index 73ea9cac07e..dd055fabb87 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instance.cpp @@ -36,32 +36,32 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial // e[m, n] = bilinear(a[m, k] * b[k, n], d[m, n]) using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_kn_mn_mn_instances = std::tuple< // clang-format off - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 
16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 
2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, // M/N/K padding - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | 
| | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 
64, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 8, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 4, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 4, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 4, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 4, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 2>, 4> + 
//################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + 
DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 8, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 
256, 128, 128, 32, 4, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 32, 4, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 32, 4, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 32, 4, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 2>, 4> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp index 1f36113e623..f6074843639 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instance.cpp @@ -36,32 +36,32 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial // e[m, n] = bilinear(a[m, k] * b[k, n], d[m, n]) using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_km_nk_mn_mn_instances = std::tuple< // clang-format off - //################################| A| B| Ds| E| 
AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, 
PassThrough, PassThrough, Bilinear, GemmDefault, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, 
GemmDefault, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, // M/N/K padding - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| 
NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 
8>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 8, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 4, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 4, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 4, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 4, 16, 
16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 2>, 4> + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 8, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + 
DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 4, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 32, 4, 16, 16, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 32, 4, 16, 16, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Col, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 32, 4, 16, 16, 1, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 2>, 4> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp index 688c4633690..accb2f80b66 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp @@ -36,32 +36,32 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial // e[m, n] = bilinear(a[m, k] * b[k, n], d[m, n]) using 
device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instances = std::tuple< // clang-format off - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, // M/N/K padding - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 
128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 4, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 4, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 4, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, - 
DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 2>, 4> + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + 
DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, 
GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 4, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 32, 4, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 32, 4, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Row, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 32, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 8, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 2>, 4> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp index 5319bd86056..6a23b703210 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp +++ 
b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp @@ -38,56 +38,56 @@ using device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instances = st // clang-format off // no padding // N % 16 == 0 && K % 16 == 0 - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, 
I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmDefault, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, // M/N/K padding // N % 16 == 0 && K % 16 == 0 - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| 
Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 4, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 4, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 4, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 4, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 
1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 16, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 8>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 16, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 4>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 16, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 2, S<1, 32, 1, 2>, 16>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 16, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 16, 1, 2>, 8>, // M/N/K padding // N % 8 == 0 && K % 8 == 0 - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| 
SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 2>, 8>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 8>, // M/N/K padding // N % 8 == 0 && K % 8 == 0 - //################################| A| B| Ds| E| AData| BData| DsData| EData| 
AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 4, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 4, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, 
GemmMNKPadding, 64, 32, 32, 8, 4, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 2>, 4>, + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 4, 
16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 8>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 32, 4, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 4>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 32, 4, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 2, S<1, 32, 1, 2>, 4>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 32, 4, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 2>, 4>, // M/N/K padding // N % 1 == 0 && K % 8 == 0 - //################################| A| B| Ds| E| AData| BData| DsData| EData| AccData| CShuffle| A| B| CDE| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| - //################################| Layout| Layout| Layout| Layout| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| 
_MBlock_MWaveMPerXdl| ScalarPerVector| - //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| - //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 256, 128, 128, 8, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 1>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 128, 64, 64, 8, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 4>, 1>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 64, 32, 32, 8, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 2>, 1>, - DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I8_Tuple, I8, I32, I32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 32, 16, 16, 8, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 1> + //################################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| Prefetch| Block| MPer| NPer| K0Per| K1| MPer| NPer| MRepeat| NRepeat| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| 
BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Stage| Size| Block| Block| Block| | WMMA| WMMA| | | ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 64, 8, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 1>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 64, 64, 8, 16, 16, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 4>, 1>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 32, 64, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 2>, 1>, + DeviceGemmMultipleD_Wmma_CShuffle< Row, Col, 
Row_Tuple, Row, I8, I8, I32, I32, I8_Tuple, I8, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 32, 16, 16, 64, 8, 16, 16, 1, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 1> // clang-format on >; diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt index 0d09d958d6c..93d5bd7422c 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt @@ -7,12 +7,12 @@ add_instance_library( xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp xdl/device_grouped_conv2d_bwd_data_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp - #wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp + 
wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp + wmma/device_grouped_conv2d_bwd_data_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt index 24f6db2eb7a..2715a8cf21b 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -17,21 +17,21 @@ add_instance_library(device_grouped_conv2d_fwd_instance dl/device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instance.cpp # WMMA # GNHWC, GKYXC, GNHWK - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_oddc_instance.cpp + 
wmma/device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_oddc_instance.cpp ## NHWGC, GKYXC, NHWGK - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp - #wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp + wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt index 6558c4b2c7b..968e8dea2ff 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -16,15 +16,15 @@ if(DL_KERNELS) dl/device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp) endif() -#list(APPEND GROUPED_CONV3D_BWD_WEIGHT -# wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp -# 
wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp -# wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp -# wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp -# wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp -# wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp -# wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp -# wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp) +list(APPEND GROUPED_CONV3D_BWD_WEIGHT + wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp + wmma/device_grouped_conv3d_bwd_weight_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp) if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "bf8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES) list(APPEND GROUPED_CONV3D_BWD_WEIGHT diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt index dc05b2f0ce4..540ce3410bc 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -7,22 +7,22 @@ set(GROUPED_CONV3D_FWD xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp 
xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp - #wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1p0_instance.cpp + 
wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_1x1s1p0_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_f16_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_gndhwc_gkzyxc_gndhwk_i8_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp + wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp ) if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES) diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh index a649c8f13f0..728b8c10923 100755 --- a/script/clang-format-overwrite.sh +++ b/script/clang-format-overwrite.sh @@ -1,7 +1,2 @@ -<<<<<<< HEAD -# find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' -======= find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}' git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-12 -i -style=file {}' ->>>>>>> origin/develop From 4c102fccaed7bea506c82a48ede1f126d9e29534 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 27 Feb 2024 07:52:25 +0000 Subject: [PATCH 109/118] Solve a bug when K1=16 --- .../gpu/block/blockwise_gemm_wmma.hpp | 48 +++++++++---------- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 8 ++-- .../device/impl/device_fpAintB_gemm_wmma.hpp | 8 ++-- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 8 ++-- .../gpu/device/impl/device_gemm_wmma.hpp | 8 ++-- .../gpu/grid/gridwise_fpAintB_gemm_wmma.hpp | 2 +- ...gridwise_gemm_multiple_d_wmma_cshuffle.hpp | 2 +- .../gpu/grid/gridwise_gemm_wmma.hpp | 2 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 20 ++++---- 9 files changed, 58 insertions(+), 48 deletions(-) diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp index 679da465dab..f8ee283c67c 100644 --- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_wmma.hpp @@ -302,13 +302,13 @@ struct BlockwiseGemmWMMA // basic intrinsic to determine loopover direction if constexpr(MRepeat < NRepeat) { - static_for<0, KPerBlock / WmmaK, 1>{}( + static_for<0, KPerBlock / KPack, 1>{}( [&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
static_for<0, MRepeat, 1>{}([&](auto m0) { // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0, I0), + make_tuple(Number{}, m0, I0, I0, I0, I0), a_block_buf, a_thread_desc_, make_tuple(I0, m0, I0, I0, I0, I0), @@ -318,16 +318,16 @@ struct BlockwiseGemmWMMA // read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0, I0), + make_tuple(Number{}, n0, I0, I0, I0, I0), b_block_buf, b_thread_desc_, make_tuple(I0, n0, I0, I0, I0, I0), b_thread_buf); - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; - static_for<0, WmmaK, 1>{}([&](auto i) { + static_for<0, KPack, 1>{}([&](auto i) { a_thread_vec.template AsType()(i) = a_thread_buf[Number()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); }); }); @@ -364,12 +364,12 @@ struct BlockwiseGemmWMMA { static_for<0, NRepeat, 1>{}([&](auto n0) { static_for<0, MRepeat, 1>{}([&](auto m0) { - static_for<0, KPerBlock / WmmaK, 1>{}([&](auto k) { // k=0,1,2 instead of + static_for<0, KPerBlock / KPack, 1>{}([&](auto k) { // k=0,1,2 instead of // k=0,kpack*1, .. 
// read B b_thread_copy_.Run( b_block_desc_k0_n0_n1_n2_k1, - make_tuple(Number{}, n0, I0, I0, I0, I0), + make_tuple(Number{}, n0, I0, I0, I0, I0), b_block_buf, b_thread_desc_, make_tuple(I0, n0, I0, I0, I0, I0), @@ -377,16 +377,16 @@ struct BlockwiseGemmWMMA // read A a_thread_copy_.Run( a_block_desc_k0_m0_m1_m2_k1, - make_tuple(Number{}, m0, I0, I0, I0, I0), + make_tuple(Number{}, m0, I0, I0, I0, I0), a_block_buf, a_thread_desc_, make_tuple(I0, m0, I0, I0, I0, I0), a_thread_buf); - vector_type a_thread_vec; - vector_type b_thread_vec; + vector_type a_thread_vec; + vector_type b_thread_vec; - static_for<0, WmmaK, 1>{}([&](auto i) { + static_for<0, KPack, 1>{}([&](auto i) { b_thread_vec.template AsType()(i) = b_thread_buf[Number()(Number<0>{}), - b_thread_vec.template AsType()(Number<0>{}), + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), c_thread_buf.GetVectorTypeReference(Number{})); }); }); @@ -423,28 +423,28 @@ struct BlockwiseGemmWMMA protected: static constexpr auto a_thread_desc_ = - make_naive_tensor_descriptor(make_tuple(Number{}, + make_naive_tensor_descriptor(make_tuple(Number{}, Number{}, I1, Number{}, I1, Number{}), make_tuple(Number{}, - Number{}, + Number{}, Number{}, Number{}, Number{}, Number<1>{})); static constexpr auto b_thread_desc_ = - make_naive_tensor_descriptor(make_tuple(Number{}, + make_naive_tensor_descriptor(make_tuple(Number{}, Number{}, I1, Number{}, I1, Number{}), make_tuple(Number{}, - Number{}, + Number{}, Number{}, Number{}, Number{}, @@ -465,7 +465,7 @@ struct BlockwiseGemmWMMA FloatA, decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), - Sequence, + Sequence, Sequence<0, 1, 2, 3, 4, 5>, 5, A_K1, @@ -481,7 +481,7 @@ struct BlockwiseGemmWMMA decltype(a_block_desc_k0_m0_m1_m2_k1), decltype(a_thread_desc_), tensor_operation::element_wise::PassThrough, - Sequence, + Sequence, Sequence<0, 1, 2, 3, 4, 5>, 5, A_K1, @@ -501,7 +501,7 @@ struct BlockwiseGemmWMMA FloatB, decltype(b_block_desc_k0_n0_n1_n2_k1), 
decltype(b_thread_desc_), - Sequence, + Sequence, Sequence<0, 1, 2, 3, 4, 5>, 5, B_K1, @@ -517,7 +517,7 @@ struct BlockwiseGemmWMMA decltype(b_block_desc_k0_n0_n1_n2_k1), decltype(b_thread_desc_), tensor_operation::element_wise::PassThrough, - Sequence, + Sequence, Sequence<0, 1, 2, 3, 4, 5>, 5, B_K1, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 9e356fbe1f6..5b4746cbda9 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -131,10 +131,12 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); - static constexpr auto WmmaK = 16; + static constexpr auto WmmaK = K1 == 16 ? 32 : 16; - static constexpr auto AEnableLds_auto = NWaves == 1 ? false : true; - static constexpr auto BEnableLds_auto = MWaves == 1 ? false : true; + static constexpr auto AEnableLds_auto = + (NWaves == 1 && is_same::value) ? false : true; + static constexpr auto BEnableLds_auto = + (MWaves == 1 && is_same::value) ? false : true; // If true, LDS is used unconditionally static constexpr auto AEnableLds_manu = false; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 494993f0685..6021ecaf43c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -89,10 +89,12 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB::value) ? 
false : true; + static constexpr auto BEnableLds_auto = + (MWaves == 1 && is_same::value) ? false : true; // If true, LDS is used unconditionally // LDS bypass feature not implemented for dequantization pipeline. diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp index 2eaeb037dc7..a2af5d6a85c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp @@ -93,10 +93,12 @@ struct DeviceGemmMultipleD_Wmma_CShuffle : public DeviceGemmMultipleD::value) ? false : true; + static constexpr auto BEnableLds_auto = + (MWaves == 1 && is_same::value) ? false : true; // If true, LDS is used unconditionally static constexpr auto AEnableLds_manu = false; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index 1c550895c3c..a7f23052916 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -86,10 +86,12 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm::value) ? false : true; + static constexpr auto BEnableLds_auto = + (MWaves == 1 && is_same::value) ? 
false : true; // If true, LDS is used unconditionally static constexpr auto AEnableLds_manu = false; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp index 18c996844f1..67e211ef8d6 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_fpAintB_gemm_wmma.hpp @@ -148,7 +148,7 @@ struct GridwiseFpAintBGemm_Wmma static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); - static constexpr auto WmmaK = 16; + static constexpr auto WmmaK = K1 == 16 ? 32 : 16; using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp index e01d835fb6a..82d010a99a0 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_wmma_cshuffle.hpp @@ -340,7 +340,7 @@ struct GridwiseGemmMultipleD_Wmma static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); - static constexpr auto WmmaK = 16; + static constexpr auto WmmaK = K1 == 16 ? 
32 : 16; using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp index 73700895479..8e4117593c4 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma.hpp @@ -135,7 +135,7 @@ struct GridwiseGemm_Wmma static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma); static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); - static constexpr auto WmmaK = 16; + static constexpr auto WmmaK = K1 == 16 ? 32 : 16; using ThisThreadBlock = ThisThreadBlock; diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 1d18e9c536f..70fbcec10fa 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -373,7 +373,7 @@ struct WmmaGemm static_assert(NPerWmma == 16 && MPerWmma == 16, "Only support GemmNPerWmma == 16 and GemmMPerWmma == 16 for wmma"); - static_assert(KPack == wmma_instr.k_per_wmma, "KPack should be k_per_wmma"); + static_assert(KPack % wmma_instr.k_per_wmma == 0, "KPack should be multiple of k_per_wmma"); } // WMMA output supporting C = A * B @@ -486,14 +486,16 @@ struct WmmaGemm , "base type couple must be (half, float), (bhalf, float), (half, half), (bhalf, bhalf), " "(int8, int32) or (int4, int32)!"); - if constexpr(!TransposeC) - { - wmma_instr.template run(p_a_wave, p_b_wave, p_c_thread); - } - else - { - wmma_instr.template run(p_b_wave, p_a_wave, p_c_thread); - } + static_for<0, KPack / wmma_instr.k_per_wmma, 1>{}([&](auto k) { + if constexpr(!TransposeC) + { + wmma_instr.template run(p_a_wave[k], p_b_wave[k], p_c_thread); + } + else + { + wmma_instr.template run(p_b_wave[k], p_a_wave[k], p_c_thread); + } + }); } __device__ static auto GetLaneId() { return get_thread_local_1d_id() % wmma_instr.wave_size; } From 
924639f9ecc4f2288fb5895a09dd3d421949157f Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 27 Feb 2024 08:00:56 +0000 Subject: [PATCH 110/118] remove unnecessary changes --- CMakeLists.txt | 1 + example/13_pool2d_fwd/pool2d_fwd_common.hpp | 6 +-- example/13_pool2d_fwd/pool2d_fwd_fp16.cpp | 2 +- ...atched_gemm_scale_softmax_gemm_permute.inc | 2 +- include/ck/ck.hpp | 1 - script/cmake-ck-dev.sh | 2 +- script/unet_mha.sh | 52 ------------------- 7 files changed, 7 insertions(+), 59 deletions(-) delete mode 100644 script/unet_mha.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c1bac8647d..bdeba33eace 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -410,6 +410,7 @@ include_directories(BEFORE SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") if(BUILD_DEV) + add_compile_options(-Werror) add_compile_options(-Weverything) endif() message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp index 6113910e796..3ce08fd2afc 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_common.hpp +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -123,9 +123,9 @@ bool pool_test(bool do_verification, {N, C, Hi, Wi}, {Y, X}, {N, C, Ho, Wo}, - {}, - {}, - {}, + {C * Hi * Wi, 1, Wi * C, C}, + {C * Ho * Wo, 1, Wo * C, C}, + {C * Ho * Wo, 1, Wo * C, C}, window_strides, window_dilations, input_left_pads, diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp index 71f75732bfc..d767e922489 100644 --- a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp +++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp @@ -18,7 +18,7 @@ using IndexDataType = int32_t; using InLayout = ck::tensor_layout::convolution::NHWC; using OutLayout = ck::tensor_layout::convolution::NHWC; -#if 0 +#if 1 static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; #else static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; diff --git 
a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc index 0b876af952f..fa76faea84e 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. int run(int argc, char* argv[]) { diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index bbff9ff9695..c93d1d06394 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -108,7 +108,6 @@ #define CK_USE_AMD_WMMA #endif -// TODO: enable buffer load when found correct 3rd dword // buffer load #define CK_USE_AMD_BUFFER_LOAD 1 diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index da061019fa3..51d6f7a30c1 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -11,7 +11,7 @@ cmake -D CMAKE_CXX_FLAGS="-std=c++17 -O3 -ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ -D CMAKE_BUILD_TYPE=Release \ -D BUILD_DEV=ON \ --D GPU_TARGETS="gfx1100" \ +-D GPU_TARGETS="gfx908;gfx90a;gfx940" \ -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ -D USE_BITINT_EXTENSION_INT4=OFF \ ${MY_PROJECT_SOURCE} diff --git a/script/unet_mha.sh b/script/unet_mha.sh deleted file mode 100644 index ce50aadab51..00000000000 --- a/script/unet_mha.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -while getopts e: flag -do - case "${flag}" in - e) executable=${OPTARG};; - esac -done -echo "CK-NAVI31 Performance Test: MHA for AITemplate" - -VERIFICATION=0 -INITIALIZE=1 -TIMING=1 - -ALL_TEST_CASE=0 -SELF_ATTENTION=1 -CROSS_ATTENTION=0 -CAUSAL_MASK=0 -# self attention with causal mask -if [ $ALL_TEST_CASE -eq 1 ] || { [ $SELF_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 1 ]; }; then 
- echo "Test launched: self attention with causal mask" - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 4096 4096 40 40 2 8 0.158113881945610 1 1 - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 1024 1024 80 80 2 8 0.111803397536277 1 1 - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 256 256 160 160 2 8 0.079056940972805 1 1 - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 64 64 160 160 2 8 0.079056940972805 1 1 -fi - -# cross attention with causal mask -if [ $ALL_TEST_CASE -eq 1 ] || { [ $CROSS_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 1 ]; }; then - echo "Test launched: cross attention with causal mask" - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 4096 64 40 40 2 8 0.158113881945610 1 1 - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 1024 64 80 80 2 8 0.111803397536277 1 1 - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 256 64 160 160 2 8 0.079056940972805 1 1 - ./bin/example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 $VERIFICATION 1 $TIMING 64 64 160 160 2 8 0.079056940972805 1 1 -fi - -# self attention without causal mask -if [ $ALL_TEST_CASE -eq 1 ] || { [ $SELF_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 0 ]; }; then - echo "Test launched: self attention without causal mask" - $executable $VERIFICATION $INITIALIZE $TIMING 4096 4096 64 64 2 5 0.125 1 1 - $executable $VERIFICATION $INITIALIZE $TIMING 1024 1024 64 64 2 10 0.125 1 1 - $executable $VERIFICATION $INITIALIZE $TIMING 256 256 64 64 2 20 0.125 1 1 - $executable $VERIFICATION $INITIALIZE $TIMING 64 64 64 64 2 20 0.125 1 1 -fi - -# cross attention without causal mask -if [ $ALL_TEST_CASE -eq 1 ] 
|| { [ $CROSS_ATTENTION -eq 1 ] && [ $CAUSAL_MASK -eq 0 ]; }; then - echo "Test launched: cross attention without causal mask" - $executable $VERIFICATION 1 $TIMING 4096 64 40 40 2 8 0.158113881945610 1 1 - $executable $VERIFICATION 1 $TIMING 1024 64 80 80 2 8 0.111803397536277 1 1 - $executable $VERIFICATION 1 $TIMING 256 64 160 160 2 8 0.079056940972805 1 1 - $executable $VERIFICATION 1 $TIMING 64 64 160 160 2 8 0.079056940972805 1 1 -fi \ No newline at end of file From b62926dca0e8b59b6bfdea09eea27424360fa40d Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Tue, 27 Feb 2024 08:46:42 +0000 Subject: [PATCH 111/118] Remove tensor layout limitation to LDS usage in tesnor contraction --- .../device_batched_contraction_multiple_d_wmma_cshuffle.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index 5b4746cbda9..d35645c0689 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -133,10 +133,8 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma); static constexpr auto WmmaK = K1 == 16 ? 32 : 16; - static constexpr auto AEnableLds_auto = - (NWaves == 1 && is_same::value) ? false : true; - static constexpr auto BEnableLds_auto = - (MWaves == 1 && is_same::value) ? false : true; + static constexpr auto AEnableLds_auto = NWaves == 1 ? false : true; + static constexpr auto BEnableLds_auto = MWaves == 1 ? 
false : true; // If true, LDS is used unconditionally static constexpr auto AEnableLds_manu = false; From 8a6e65a3dac5764ac62d5d06799c0d38f3e5adab Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 28 Feb 2024 09:19:57 +0000 Subject: [PATCH 112/118] update self-attention and cross-attention --- .../cross_attention_forward_wmma_fp16.cpp | 22 +++ .../run_cross_attention_wmma.inc | 184 +++++++++++------- .../run_self_attention_wmma.inc | 167 +++++++++------- .../self_attention_forward_wmma_fp16.cpp | 124 ++++++++---- 4 files changed, 318 insertions(+), 179 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp index c5b6c7efbe9..4c92c5497fb 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/cross_attention_forward_wmma_fp16.cpp @@ -301,6 +301,28 @@ using DeviceMHAFactory = S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, // CShuffleBlockTransfer MN 1, 1, S<1, 128, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 256, + // Gemm 0 + 128, 64, 48, 8,4, + // Gemm 1 + 48, 64, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 4, 3, + // ABlockTransfer MK -> K0 M K1 + S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 128, 1, 2>, 8, 
MaskingSpec> #endif >; diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc index c5ae4a6b01d..9ff4c56e069 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_cross_attention_wmma.inc @@ -9,20 +9,18 @@ int run(int argc, char* argv[]) // GEMM shape for A/B0/B1/C // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o - ck::index_t M = 256; - ck::index_t N = 64; - ck::index_t K = 80; - ck::index_t O = 80; - - // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape - // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) - // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) - ck::index_t G0 = 2; - ck::index_t G1 = 8; - - float alpha = 1; - - bool input_permute = false; + ck::index_t q_sequence_length = 256; + ck::index_t kv_sequence_length = 64; + ck::index_t head_dim = 80; + + // Output shape C[batch_size, q_sequence_length, head_num, head_dim]. 
Batch dim, outer dim, + // inner dim must match GEMM shape C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) C_g0_m_g1_o = + // permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t batch_size = 2; + ck::index_t head_num = 8; + + float alpha = 1; + bool input_permute = true; bool output_permute = true; if(argc == 1) @@ -35,58 +33,85 @@ int run(int argc, char* argv[]) init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); } - else if(argc == 13) + else if(argc == 10) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); - M = std::stoi(argv[4]); - N = std::stoi(argv[5]); - K = std::stoi(argv[6]); - O = std::stoi(argv[7]); - G0 = std::stoi(argv[8]); - G1 = std::stoi(argv[9]); + q_sequence_length = std::stoi(argv[4]); + kv_sequence_length = std::stoi(argv[5]); + head_dim = std::stoi(argv[6]); + batch_size = std::stoi(argv[7]); + head_num = std::stoi(argv[8]); - alpha = std::stof(argv[10]); - - input_permute = std::stoi(argv[11]); - output_permute = std::stoi(argv[12]); + alpha = std::stof(argv[9]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=no, 1=yes)\n"); - printf("arg4 to 11: M, N, K, O, G0, G1\n"); - printf("arg10: scale (alpha)\n"); - printf("arg11 to 12: input / output permute\n"); + printf( + "arg4 to 8: q_sequence_length, kv_sequence_length, head_dim, batch_size, head_num\n"); + printf("arg9: scale (alpha)\n"); exit(0); } - std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_lengths{batch_size, head_num, q_sequence_length, head_dim}; std::vector a_gs_ms_ks_strides = - input_permute - ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] - : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] - - std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + input_permute ? 
std::vector{q_sequence_length * head_num * head_dim, + head_dim, + head_num * head_dim, + 1} + // A layout [batch_size, q_sequence_length, head_num, head_dim] + : std::vector{ + head_num * q_sequence_length * head_dim, + q_sequence_length * head_dim, + head_dim, + 1}; // A layout [batch_size, head_num, q_sequence_length, head_dim] + + std::vector b0_gs_ns_ks_lengths{ + batch_size, head_num, kv_sequence_length, head_dim}; std::vector b0_gs_ns_ks_strides = - input_permute - ? std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] - : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] - - std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + input_permute ? std::vector{kv_sequence_length * head_num * head_dim, + head_dim, + head_num * head_dim, + 1} + // B0 layout [batch_size, kv_sequence_length, head_num, head_dim] + : std::vector{ + head_num * kv_sequence_length * head_dim, + kv_sequence_length * head_dim, + head_dim, + 1}; // B0 layout [batch_size, head_num, kv_sequence_length, head_dim] + + std::vector b1_gs_os_ns_lengths{ + batch_size, head_num, head_dim, kv_sequence_length}; std::vector b1_gs_os_ns_strides = input_permute - ? std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] - : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] - - std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + ? std::vector{kv_sequence_length * head_num * head_dim, + head_dim, + 1, + head_num * head_dim} + // B1 layout [batch_size, kv_sequence_length, head_num, head_dim] + : std::vector{ + head_num * kv_sequence_length * head_dim, + kv_sequence_length * head_dim, + 1, + head_dim}; // B1 layout [batch_size, head_num, kv_sequence_length, head_dim] + + std::vector c_gs_ms_os_lengths{batch_size, head_num, q_sequence_length, head_dim}; std::vector c_gs_ms_os_strides = - output_permute - ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] - : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + output_permute ? 
std::vector{q_sequence_length * head_num * head_dim, + head_dim, + head_num * head_dim, + 1} + // C layout [batch_size, q_sequence_length, head_num, head_dim] + : std::vector{ + head_num * q_sequence_length * head_dim, + q_sequence_length * head_dim, + head_dim, + 1}; // C layout [batch_size, head_num, q_sequence_length, head_dim] Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); @@ -158,9 +183,14 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } - std::vector kv_gs_ns_ks_lengths{G0, G1, N, 2, K}; + std::vector kv_gs_ns_ks_lengths{ + batch_size, head_num, kv_sequence_length, 2, head_dim}; std::vector kv_gs_ns_ks_strides = std::vector{ - N * G1 * 2 * K, 2 * K, G1 * 2 * K, K, 1}; // kv layout [G0, M, G1, 2, K] + kv_sequence_length * head_num * 2 * head_dim, + 2 * head_dim, + head_num * 2 * head_dim, + head_dim, + 1}; // kv layout [batch_size, q_sequence_length, head_num, 2, head_dim] Tensor kv_gs_ns_ks(kv_gs_ns_ks_lengths, kv_gs_ns_ks_strides); // merge kv into a packed pointer send to device b0_gs_ns_ks.ForEach( @@ -189,20 +219,20 @@ int run(int argc, char* argv[]) printf("Verification: %s\n", do_verification ? "ON" : "OFF"); // TODO ANT: replace array with vector? 
ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { - const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + const auto device_mha_instance = std::get(DeviceMHAFactory{}); - using DeviceMHAInstance = ck::remove_cvref_t; + using DeviceMHAInstance = ck::remove_cvref_t; auto gemm = DeviceMHAInstance{}; auto invoker = gemm.MakeCrossAttnInvoker(); auto argument = gemm.MakeCrossAttnArgument(static_cast(q_device_buf.GetDeviceBuffer()), static_cast(kv_device_buf.GetDeviceBuffer()), static_cast(c_device_buf.GetDeviceBuffer()), - G0, - M, - N, - G1, - K, + batch_size, + q_sequence_length, + kv_sequence_length, + head_num, + head_dim, alpha); // if(!gemm.IsSupportedArgument(argument)) @@ -212,13 +242,17 @@ int run(int argc, char* argv[]) // return 0; // } - ck::index_t BatchCount = G0 * G1; + ck::index_t BatchCount = batch_size * head_num; float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + - sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + std::size_t flop = (size_t(q_sequence_length) * kv_sequence_length * head_dim * 2 + + size_t(q_sequence_length) * kv_sequence_length * head_dim * 2) * + BatchCount; + std::size_t num_btype = (sizeof(ADataType) * q_sequence_length * head_dim + + sizeof(B0DataType) * head_dim * kv_sequence_length + + sizeof(B1DataType) * kv_sequence_length * head_dim + + sizeof(CDataType) * q_sequence_length * head_dim) * BatchCount; float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -237,22 +271,26 @@ int run(int argc, char* argv[]) { c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); - Tensor a_g_m_k({BatchCount, M, K}); - Tensor b0_g_k_n({BatchCount, K, N}); - Tensor b1_g_n_o({BatchCount, N, O}); - Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 - Tensor a1_g_m_n({BatchCount, M, N}); // 
scratch object after softmax - Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + Tensor a_g_m_k({BatchCount, q_sequence_length, head_dim}); + Tensor b0_g_k_n({BatchCount, head_dim, kv_sequence_length}); + Tensor b1_g_n_o({BatchCount, kv_sequence_length, head_dim}); + Tensor acc0_g_m_n( + {BatchCount, q_sequence_length, kv_sequence_length}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, + q_sequence_length, + kv_sequence_length}); // scratch object after softmax + Tensor c_g_m_o_host_result( + {BatchCount, q_sequence_length, head_dim}); // scratch object after gemm1 // permute a_gs_ms_ks.ForEach([&](auto& self, auto idx) { - a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + a_g_m_k(idx[0] * head_num + idx[1], idx[2], idx[3]) = self(idx); }); b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { - b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + b0_g_k_n(idx[0] * head_num + idx[1], idx[3], idx[2]) = self(idx); }); b1_gs_os_ns.ForEach([&](auto& self, auto idx) { - b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + b1_g_n_o(idx[0] * head_num + idx[1], idx[3], idx[2]) = self(idx); }); // gemm 0 @@ -264,7 +302,7 @@ int run(int argc, char* argv[]) ref_gemm0_invoker.Run(ref_gemm0_argument); // masking - const auto mask = typename DeviceMHAInstance::C0MatrixMask(N); + const auto mask = typename DeviceMHAInstance::C0MatrixMask(kv_sequence_length); acc0_g_m_n.ForEach([&](auto& self, auto idx) { if(mask.IsMaskedElement(idx[1], idx[2])) self(idx) = -ck::NumericLimits::Infinity(); @@ -294,7 +332,7 @@ int run(int argc, char* argv[]) const size_t& g0 = idx[0]; const size_t& g1 = idx[1]; - const size_t g = g0 * G1 + g1; + const size_t g = g0 * head_num + g1; self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); }); @@ -330,8 +368,10 @@ int run(int argc, char* argv[]) std::cout << "---------------------------------------------------------------------------------" "-----------" << std::endl; - std::cout << 
"Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M - << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "Problem Size: BatchCount: " << batch_size << ", HeadNum: " << head_num + << ", q_sequence_length: " << q_sequence_length + << ", kv_sequence_length: " << kv_sequence_length << ", head_dim: " << head_dim + << std::endl; std::cout << "---------------------------------------------------------------------------------" "-----------" << std::endl; diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc index b9c474bb0c3..b844512744a 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc @@ -9,20 +9,17 @@ int run(int argc, char* argv[]) // GEMM shape for A/B0/B1/C // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o - ck::index_t M = 256; - ck::index_t N = 256; - ck::index_t K = 80; - ck::index_t O = 80; + ck::index_t sequence_length = 256; + ck::index_t head_dim = 80; - // Output shape C[G0, M, G1, O]. Batch dim, outer dim, inner dim must match GEMM shape - // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) - // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) - ck::index_t G0 = 2; - ck::index_t G1 = 8; + // Output shape C[batch_size, sequence_length, head_num, head_dim]. 
Batch dim, outer dim, inner + // dim must match GEMM shape C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) C_g0_m_g1_o = + // permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t batch_size = 2; + ck::index_t head_num = 8; - float alpha = 1; - - bool input_permute = false; + float alpha = 1; + bool input_permute = true; bool output_permute = true; if(argc == 1) @@ -35,58 +32,81 @@ int run(int argc, char* argv[]) init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); } - else if(argc == 13) + else if(argc == 9) { do_verification = std::stoi(argv[1]); init_method = std::stoi(argv[2]); time_kernel = std::stoi(argv[3]); - M = std::stoi(argv[4]); - N = std::stoi(argv[5]); - K = std::stoi(argv[6]); - O = std::stoi(argv[7]); - G0 = std::stoi(argv[8]); - G1 = std::stoi(argv[9]); - - alpha = std::stof(argv[10]); + sequence_length = std::stoi(argv[4]); + head_dim = std::stoi(argv[5]); + batch_size = std::stoi(argv[6]); + head_num = std::stoi(argv[7]); - input_permute = std::stoi(argv[11]); - output_permute = std::stoi(argv[12]); + alpha = std::stof(argv[8]); } else { printf("arg1: verification (0=no, 1=yes)\n"); printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); printf("arg3: time kernel (0=no, 1=yes)\n"); - printf("arg4 to 11: M, N, K, O, G0, G1\n"); - printf("arg10: scale (alpha)\n"); - printf("arg11 to 12: input / output permute\n"); + printf("arg4 to 7: sequence_length, head_dim, batch_size, head_num\n"); + printf("arg8: scale (alpha)\n"); exit(0); } - std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_lengths{batch_size, head_num, sequence_length, head_dim}; std::vector a_gs_ms_ks_strides = - input_permute - ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] - : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] - - std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + input_permute ? 
std::vector{sequence_length * head_num * head_dim, + head_dim, + head_num * head_dim, + 1} + // A layout [batch_size, sequence_length, head_num, head_dim] + : std::vector{ + head_num * sequence_length * head_dim, + sequence_length * head_dim, + head_dim, + 1}; // A layout [batch_size, head_num, sequence_length, head_dim] + + std::vector b0_gs_ns_ks_lengths{batch_size, head_num, sequence_length, head_dim}; std::vector b0_gs_ns_ks_strides = - input_permute - ? std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] - : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] - - std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + input_permute ? std::vector{sequence_length * head_num * head_dim, + head_dim, + head_num * head_dim, + 1} + // B0 layout [batch_size, sequence_length, head_num, head_dim] + : std::vector{ + head_num * sequence_length * head_dim, + sequence_length * head_dim, + head_dim, + 1}; // B0 layout [batch_size, head_num, sequence_length, head_dim] + + std::vector b1_gs_os_ns_lengths{batch_size, head_num, head_dim, sequence_length}; std::vector b1_gs_os_ns_strides = input_permute - ? std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] - : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] - - std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + ? std::vector{sequence_length * head_num * head_dim, + head_dim, + 1, + head_num * head_dim} + // B1 layout [batch_size, sequence_length, head_num, head_dim] + : std::vector{ + head_num * sequence_length * head_dim, + sequence_length * head_dim, + 1, + head_dim}; // B1 layout [batch_size, head_num, sequence_length, head_dim] + + std::vector c_gs_ms_os_lengths{batch_size, head_num, sequence_length, head_dim}; std::vector c_gs_ms_os_strides = - output_permute - ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] - : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + output_permute ? 
std::vector{sequence_length * head_num * head_dim, + head_dim, + head_num * head_dim, + 1} + // C layout [batch_size, sequence_length, head_num, head_dim] + : std::vector{ + head_num * sequence_length * head_dim, + sequence_length * head_dim, + head_dim, + 1}; // C layout [batch_size, head_num, sequence_length, head_dim] Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); @@ -158,9 +178,14 @@ int run(int argc, char* argv[]) b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); } - std::vector qkv_gs_ms_ks_lengths{G0, G1, M, 3, K}; + std::vector qkv_gs_ms_ks_lengths{ + batch_size, head_num, sequence_length, 3, head_dim}; std::vector qkv_gs_ms_ks_strides = std::vector{ - M * G1 * 3 * K, 3 * K, G1 * 3 * K, K, 1}; // qkv layout [G0, M, G1, 3, K] + sequence_length * head_num * 3 * head_dim, + 3 * head_dim, + head_num * 3 * head_dim, + head_dim, + 1}; // qkv layout [batch_size, sequence_length, head_num, 3, head_dim] Tensor qkv_gs_ms_ks(qkv_gs_ms_ks_lengths, qkv_gs_ms_ks_strides); // merge qkv into a packed pointer send to device a_gs_ms_ks.ForEach( @@ -198,10 +223,10 @@ int run(int argc, char* argv[]) auto argument = gemm.MakeSelfAttnArgument(static_cast(qkv_device_buf.GetDeviceBuffer()), static_cast(c_device_buf.GetDeviceBuffer()), - G0, - M, - G1, - K, + batch_size, + sequence_length, + head_num, + head_dim, alpha); // if(!gemm.IsSupportedArgument(argument)) @@ -211,13 +236,17 @@ int run(int argc, char* argv[]) // return 0; // } - ck::index_t BatchCount = G0 * G1; + ck::index_t BatchCount = batch_size * head_num; float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); - std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; - std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + - sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + std::size_t flop = (size_t(sequence_length) * sequence_length * head_dim * 2 + + 
size_t(sequence_length) * sequence_length * head_dim * 2) * + BatchCount; + std::size_t num_btype = (sizeof(ADataType) * sequence_length * head_dim + + sizeof(B0DataType) * head_dim * sequence_length + + sizeof(B1DataType) * sequence_length * head_dim + + sizeof(CDataType) * sequence_length * head_dim) * BatchCount; float tflops = static_cast(flop) / 1.E9 / ave_time; @@ -236,22 +265,25 @@ int run(int argc, char* argv[]) { c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); - Tensor a_g_m_k({BatchCount, M, K}); - Tensor b0_g_k_n({BatchCount, K, N}); - Tensor b1_g_n_o({BatchCount, N, O}); - Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 - Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax - Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + Tensor a_g_m_k({BatchCount, sequence_length, head_dim}); + Tensor b0_g_k_n({BatchCount, head_dim, sequence_length}); + Tensor b1_g_n_o({BatchCount, sequence_length, head_dim}); + Tensor acc0_g_m_n( + {BatchCount, sequence_length, sequence_length}); // scratch object after gemm0 + Tensor a1_g_m_n( + {BatchCount, sequence_length, sequence_length}); // scratch object after softmax + Tensor c_g_m_o_host_result( + {BatchCount, sequence_length, head_dim}); // scratch object after gemm1 // permute a_gs_ms_ks.ForEach([&](auto& self, auto idx) { - a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + a_g_m_k(idx[0] * head_num + idx[1], idx[2], idx[3]) = self(idx); }); b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { - b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + b0_g_k_n(idx[0] * head_num + idx[1], idx[3], idx[2]) = self(idx); }); b1_gs_os_ns.ForEach([&](auto& self, auto idx) { - b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + b1_g_n_o(idx[0] * head_num + idx[1], idx[3], idx[2]) = self(idx); }); // gemm 0 @@ -263,7 +295,7 @@ int run(int argc, char* argv[]) ref_gemm0_invoker.Run(ref_gemm0_argument); // masking - const auto 
mask = typename DeviceMHAInstance::C0MatrixMask(N); + const auto mask = typename DeviceMHAInstance::C0MatrixMask(sequence_length); acc0_g_m_n.ForEach([&](auto& self, auto idx) { if(mask.IsMaskedElement(idx[1], idx[2])) self(idx) = -ck::NumericLimits::Infinity(); @@ -293,7 +325,7 @@ int run(int argc, char* argv[]) const size_t& g0 = idx[0]; const size_t& g1 = idx[1]; - const size_t g = g0 * G1 + g1; + const size_t g = g0 * head_num + g1; self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); }); @@ -329,8 +361,9 @@ int run(int argc, char* argv[]) std::cout << "---------------------------------------------------------------------------------" "-----------" << std::endl; - std::cout << "Problem Size: BatchCount: " << G0 << ", HeadNum: " << G1 << ", M: " << M - << ", N: " << N << ", K: " << K << ", O: " << O << std::endl; + std::cout << "Problem Size: BatchCount: " << batch_size << ", HeadNum: " << head_num + << ", sequence_length: " << sequence_length << ", head_dim: " << head_dim + << std::endl; std::cout << "---------------------------------------------------------------------------------" "-----------" << std::endl; diff --git a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp index 3f964908ed7..8e037272b83 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp +++ b/example/32_batched_gemm_scale_softmax_gemm/self_attention_forward_wmma_fp16.cpp @@ -83,12 +83,34 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 32, // Gemm 0 - 16, 128, 64, 8, 8, + 16, 32, 160, 8, 8, // Gemm 1 - 64, 64, 8, - 16, 16, 16, + 80, 32, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 2, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // 
B1BlockTransfer NL -> L0 N L1 + S<2, 2, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 16, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 32, + // Gemm 0 + 16, 64, 80, 8, 8, + // Gemm 1 + 80, 64, 8, + 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 8, 4, + 1, 4, 5, // ABlockTransfer MK -> K0 M K1 S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 @@ -105,12 +127,12 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 32, // Gemm 0 - 16, 64, 64, 8, 8, + 16, 64, 48, 8, 8, // Gemm 1 - 64, 64, 8, + 48, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 4, 4, + 1, 4, 3, // ABlockTransfer MK -> K0 M K1 S<2, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 @@ -129,16 +151,16 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 64, // Gemm 0 - 32, 128, 64, 8, 8, + 32, 64, 48, 8, 8, // Gemm 1 - 64, 64, 8, + 48, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 8, 4, + 1, 4, 3, // ABlockTransfer MK -> K0 M K1 S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 - S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B1BlockTransfer NL -> L0 N L1 S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, // CShuffleBlockTransfer MN @@ -151,16 +173,38 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 64, // Gemm 0 - 32, 64, 64, 8, 8, + 32, 64, 
80, 8, 8, // Gemm 1 - 64, 64, 8, + 80, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 4, 4, + 1, 4, 5, // ABlockTransfer MK -> K0 M K1 S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 - S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B1BlockTransfer NL -> L0 N L1 + S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, + // CShuffleBlockTransfer MN + 1, 1, S<1, 32, 1, 2>, 8, + MaskingSpec>, + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< + NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, + ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, + AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, + GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, + 64, + // Gemm 0 + 32, 32, 160, 8, 8, + // Gemm 1 + 80, 32, 8, + 16, 16, 16, + // Per repeat = wave_m = wave_num, wave_n = 1 + 1, 2, 5, + // ABlockTransfer MK -> K0 M K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + // B0BlockTransfer LK -> K0 L K1 + S<2, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B1BlockTransfer NL -> L0 N L1 S<2, 4, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, false, // CShuffleBlockTransfer MN @@ -175,20 +219,20 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 128, // Gemm 0 - 64, 128, 64, 8, 8, + 64, 128, 80, 8, 8, // Gemm 1 - 64, 64, 8, + 80, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 8, 4, + 1, 8, 5, // ABlockTransfer MK -> K0 M K1 S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 - S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B1BlockTransfer NL -> L0 N L1 S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, // CShuffleBlockTransfer MN - 1, 1, S<1, 64, 1, 2>, 8, + 1, 1, 
S<1, 64, 1, 2>, 8, MaskingSpec>, ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, @@ -197,45 +241,45 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 128, // Gemm 0 - 64, 64, 64, 8, 8, + 64, 192, 48, 8, 8, // Gemm 1 - 64, 64, 8, + 48, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 4, 4, + 1, 12, 3, // ABlockTransfer MK -> K0 M K1 S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 - S<8, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B1BlockTransfer NL -> L0 N L1 S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, // CShuffleBlockTransfer MN - 1, 1, S<1, 64, 1, 2>, 8, + 1, 1, S<1, 64, 1, 2>, 8, MaskingSpec>, -#endif -#ifdef CK_MHA_USE_WAVE_8 ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, AElementOp, B0ElementOp, Acc0ElementOp, B1ElementOp, CElementOp, GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, - 256, + 128, // Gemm 0 - 128, 128, 64, 8, 8, + 64, 64, 48, 8, 8, // Gemm 1 - 64, 64, 8, + 48, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 8, 4, + 1, 4, 3, // ABlockTransfer MK -> K0 M K1 - S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 - S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<2, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B1BlockTransfer NL -> L0 N L1 - S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + S<2, 8, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 1, false, // CShuffleBlockTransfer MN - 1, 1, S<1, 128, 1, 2>, 8, + 1, 1, S<1, 64, 1, 2>, 8, MaskingSpec>, +#endif +#ifdef 
CK_MHA_USE_WAVE_8 ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, ADataType, B0DataType, B1DataType, CDataType, Acc0BiasDataType, Acc0DataType, Acc1BiasDataType, Acc1DataType, CShuffleDataType, @@ -243,18 +287,18 @@ using DeviceMHAFactory = GemmSpec, TensorSpecA, TensorSpecB0, TensorSpecB1, TensorSpecC, 1, 256, // Gemm 0 - 128, 128, 64, 8, 8, + 128, 192, 48, 8,4, // Gemm 1 - 64, 64, 8, + 48, 64, 8, 16, 16, 16, // Per repeat = wave_m = wave_num, wave_n = 1 - 1, 8, 4, + 1, 12, 3, // ABlockTransfer MK -> K0 M K1 S<2, 128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, // B0BlockTransfer LK -> K0 L K1 - S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, + S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, // B1BlockTransfer NL -> L0 N L1 - S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, + S<2, 16, 8>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 1, false, // CShuffleBlockTransfer MN 1, 1, S<1, 128, 1, 2>, 8, MaskingSpec> From 08ab9cfa3d075c6109ff78ae28ed621789a12100 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Wed, 28 Feb 2024 09:52:03 +0000 Subject: [PATCH 113/118] fix a typo of name --- .../run_batched_gemm_scale_softmax_gemm_permute_wmma.inc | 4 ++-- .../run_grouped_query_attention_forward_wmma.inc | 4 ++-- .../run_multi_query_attention_forward_wmma.inc | 4 ++-- .../run_self_attention_wmma.inc | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc index 9add86cc1bf..2e77479bcca 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute_wmma.inc @@ -182,9 +182,9 @@ int run(int argc, char* argv[]) printf("Verification: %s\n", do_verification ? 
"ON" : "OFF"); // TODO ANT: replace array with vector? ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { - const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + const auto device_mha_instance = std::get(DeviceMHAFactory{}); - using DeviceMHAInstance = ck::remove_cvref_t; + using DeviceMHAInstance = ck::remove_cvref_t; auto gemm = DeviceMHAInstance{}; auto invoker = gemm.MakeInvoker(); auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc index 0d66d837d30..609d085299e 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_query_attention_forward_wmma.inc @@ -185,9 +185,9 @@ int run(int argc, char* argv[]) printf("Verification: %s\n", do_verification ? "ON" : "OFF"); // TODO ANT: replace array with vector? 
ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { - const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + const auto device_mha_instance = std::get(DeviceMHAFactory{}); - using DeviceMHAInstance = ck::remove_cvref_t; + using DeviceMHAInstance = ck::remove_cvref_t; auto gemm = DeviceMHAInstance{}; auto invoker = gemm.MakeInvoker(); auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc index 5a069d79576..b05915c07fb 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_multi_query_attention_forward_wmma.inc @@ -185,9 +185,9 @@ int run(int argc, char* argv[]) printf("Verification: %s\n", do_verification ? "ON" : "OFF"); // TODO ANT: replace array with vector? ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { - const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + const auto device_mha_instance = std::get(DeviceMHAFactory{}); - using DeviceMHAInstance = ck::remove_cvref_t; + using DeviceMHAInstance = ck::remove_cvref_t; auto gemm = DeviceMHAInstance{}; auto invoker = gemm.MakeInvoker(); auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc index b844512744a..3fdaaebb0f5 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc +++ b/example/32_batched_gemm_scale_softmax_gemm/run_self_attention_wmma.inc @@ -215,9 +215,9 @@ int run(int argc, char* argv[]) printf("Verification: %s\n", do_verification ? "ON" : "OFF"); // TODO ANT: replace array with vector? 
ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) -> void { - const auto device_conv_mha_instance = std::get(DeviceMHAFactory{}); + const auto device_mha_instance = std::get(DeviceMHAFactory{}); - using DeviceMHAInstance = ck::remove_cvref_t; + using DeviceMHAInstance = ck::remove_cvref_t; auto gemm = DeviceMHAInstance{}; auto invoker = gemm.MakeSelfAttnInvoker(); auto argument = From 68459244ba5a0e05008e3edc6083957fd4daed01 Mon Sep 17 00:00:00 2001 From: aska-0096 Date: Thu, 29 Feb 2024 03:30:57 +0000 Subject: [PATCH 114/118] Add arch limiter for fp8 gemm --- example/01_gemm/CMakeLists.txt | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 5b71cd15480..dd6b7b53503 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -53,12 +53,6 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp64) add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp) -add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp) -add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) - -add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) -add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) - list(APPEND gpu_list gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) @@ -72,5 +66,13 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) -add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) +if(GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942") + add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp) + add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) + + add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) + add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) + + add_example_executable(example_gemm_xdl_fp16_fp8 
gemm_xdl_fp16_fp8.cpp) + add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) +endif() From e42f9ecf382f1289779fdbb26ea241168a70fc66 Mon Sep 17 00:00:00 2001 From: illsilin Date: Thu, 7 Mar 2024 19:45:42 -0800 Subject: [PATCH 115/118] enable fp8 gemm_xdl for all gfx9 targets --- example/01_gemm/CMakeLists.txt | 16 +++++++------- .../CMakeLists.txt | 2 +- ...ed_conv_fwd_bias_relu_add_wmma_example.inc | 5 ++--- .../CMakeLists.txt | 2 +- example/64_fpAintB_gemm/CMakeLists.txt | 2 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 21 +++++++------------ .../device/impl/device_fpAintB_gemm_wmma.hpp | 3 +-- ...e_grouped_query_attention_forward_wmma.hpp | 11 ++++------ ...ice_multi_query_attention_forward_wmma.hpp | 11 ++++------ test/grouped_convnd_bwd_data/CMakeLists.txt | 2 +- test/grouped_convnd_bwd_weight/CMakeLists.txt | 2 +- 11 files changed, 31 insertions(+), 46 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index dd6b7b53503..9b44b02113d 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -27,7 +27,7 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16) add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16) -if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") +if(GPU_TARGETS MATCHES "gfx11") add_custom_target(example_gemm_wmma) add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp) add_example_dependencies(example_gemm_wmma example_gemm_wmma_fp16) @@ -66,13 +66,11 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() -if(GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942") - add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp) - add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) +add_example_executable(example_gemm_xdl_fp8 gemm_xdl_fp8.cpp) 
+add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) - add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) - add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) +add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) - add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) - add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) -endif() +add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) diff --git a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt index 32a87dd200f..f343cc19109 100644 --- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt +++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt @@ -1,5 +1,5 @@ add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp) -if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") +if(GPU_TARGETS MATCHES "gfx11") add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp) endif() diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc index 325d42dbe47..ca8746bb970 100644 --- a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_wmma_example.inc @@ -279,9 +279,8 @@ bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[]) switch(conv_param.num_dim_spatial_) { // case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); - case 2: - return 
run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); - // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); + case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); + // case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); } return false; diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 5e091f56471..c6cca7b586b 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") +if(GPU_TARGETS MATCHES "gfx11") add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_wmma_fp16 batched_gemm_scale_softmax_gemm_permute_wmma_fp16.cpp) add_example_executable(example_self_attention_forward_wmma_fp16 self_attention_forward_wmma_fp16.cpp) diff --git a/example/64_fpAintB_gemm/CMakeLists.txt b/example/64_fpAintB_gemm/CMakeLists.txt index 34059c7ff90..89cc2d7f62c 100644 --- a/example/64_fpAintB_gemm/CMakeLists.txt +++ b/example/64_fpAintB_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") +if(GPU_TARGETS MATCHES "gfx11") add_custom_target(example_fpAintB_gemm_wmma) add_example_executable(example_fp16int8_gemm_wmma fp16int8_gemm_wmma.cpp) add_dependencies(example_fpAintB_gemm_wmma example_fp16int8_gemm_wmma) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp index 378232d9f0c..e218ee5c15d 100644 --- 
a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp @@ -56,8 +56,7 @@ __global__ void bool input_permute, bool output_permute) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__)) // clang-format off // *************************************************** @@ -162,7 +161,7 @@ __global__ void ignore = G1; ignore = input_permute; ignore = output_permute; -#endif // end of if (defined(__gfx1100__)) +#endif // end of if (defined(__gfx11__)) } // Self-Attention @@ -188,8 +187,7 @@ __global__ void index_t head_size, float alpha) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__)) // clang-format off // *************************************************** @@ -294,7 +292,7 @@ __global__ void ignore = head_count; ignore = head_size; ignore = alpha; -#endif // end of if (defined(__gfx1100__)) +#endif // end of if (defined(__gfx11__)) } // Cross-Attention // Self-Attention @@ -323,8 +321,7 @@ __global__ void index_t head_size, float alpha) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__)) // clang-format off // *************************************************** @@ -435,7 +432,7 @@ __global__ void ignore = head_count; ignore = head_size; ignore = alpha; -#endif // end of if (defined(__gfx1100__)) +#endif // end of if (defined(__gfx11__)) } // Computes C = A * B0 * B1 // MN = MK * KL * LN @@ -861,8 +858,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle static bool IsSupportedArgument(const RawArg& arg) { - if(ck::get_device_name() == 
"gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102") + if(ck::is_navi3_supported()) { if constexpr(!(is_same_v || is_same_v)) { @@ -1439,8 +1435,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle #if 0 static bool IsSupportedArgument(const Argument& arg) { - if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102") + if(ck::is_navi3_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 6021ecaf43c..4385d64c19c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -509,8 +509,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB || is_same_v || is_same_v)) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp index 2313b256c32..84ad48d4c78 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp @@ -61,8 +61,7 @@ __global__ void bool input_permute, bool output_permute) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__)) // clang-format off // *************************************************** @@ -169,7 +168,7 @@ __global__ void ignore = G1; ignore = input_permute; ignore = output_permute; -#endif // end of if (defined(__gfx1100__)) +#endif // end of if (defined(__gfx11__)) } // Computes C = A * B0 * B1 @@ -597,8 +596,7 @@ struct 
DeviceGroupedQueryAttentionForward_Wmma static bool IsSupportedArgument(const RawArg& arg) { - if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102") + if(ck::is_navi3_supported()) { if constexpr(!(is_same_v || is_same_v)) { @@ -960,8 +958,7 @@ struct DeviceGroupedQueryAttentionForward_Wmma #if 0 static bool IsSupportedArgument(const Argument& arg) { - if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102") + if(ck::is_navi3_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp index 2fd7147c58f..b7551e78a22 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp @@ -60,8 +60,7 @@ __global__ void bool input_permute, bool output_permute) { -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx1100__) || defined(__gfx1101__) || \ - defined(__gfx1102__)) +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__)) // clang-format off // *************************************************** @@ -168,7 +167,7 @@ __global__ void ignore = G1; ignore = input_permute; ignore = output_permute; -#endif // end of if (defined(__gfx1100__)) +#endif // end of if (defined(__gfx11__)) } // Computes C = A * B0 * B1 @@ -595,8 +594,7 @@ struct DeviceMultiQueryAttentionForward_Wmma static bool IsSupportedArgument(const RawArg& arg) { - if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102") + if(ck::is_navi3_supported()) { if constexpr(!(is_same_v || is_same_v)) { @@ -952,8 +950,7 @@ struct DeviceMultiQueryAttentionForward_Wmma #if 0 static bool IsSupportedArgument(const 
Argument& arg) { - if(ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || - ck::get_device_name() == "gfx1102") + if(ck::is_navi3_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/test/grouped_convnd_bwd_data/CMakeLists.txt b/test/grouped_convnd_bwd_data/CMakeLists.txt index 9773e5a9c6c..305c568ee9d 100644 --- a/test/grouped_convnd_bwd_data/CMakeLists.txt +++ b/test/grouped_convnd_bwd_data/CMakeLists.txt @@ -1,5 +1,5 @@ list(APPEND gpu_list_xdl gfx908 gfx90a gfx940) -list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102) +list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list_xdl AND target EQUAL 0) diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/test/grouped_convnd_bwd_weight/CMakeLists.txt index b167943c97d..d7d6f8a3d68 100644 --- a/test/grouped_convnd_bwd_weight/CMakeLists.txt +++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt @@ -1,5 +1,5 @@ list(APPEND gpu_list_xdl gfx908 gfx90a gfx940 gfx941 gfx942) -list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102) +list(APPEND gpu_list_wmma gfx1100 gfx1101 gfx1102 gfx1103) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) From 7b28bcb351a3c85348aeb0fd000258d640d1f0f6 Mon Sep 17 00:00:00 2001 From: illsilin Date: Fri, 8 Mar 2024 07:53:00 -0800 Subject: [PATCH 116/118] temporarily disable gemm_xdl_fp16_fp8 on MI100/200 --- example/01_gemm/CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 9b44b02113d..d09a9ff4ed9 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -72,5 +72,12 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) -add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) 
-add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) +list(APPEND gpu_list gfx940 gfx941 gfx942) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list AND target EQUAL 0) + add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) + add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) + set(target 1) + endif() +endforeach() From 91ee125408c6e1a3a50b8e6db57c56be4f2469a3 Mon Sep 17 00:00:00 2001 From: illsilin Date: Fri, 8 Mar 2024 09:15:52 -0800 Subject: [PATCH 117/118] fix the cmake logic for gemm_xdl_fp16_fp8 --- example/01_gemm/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index d09a9ff4ed9..9ff1040d4d6 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -72,10 +72,10 @@ add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) -list(APPEND gpu_list gfx940 gfx941 gfx942) +list(APPEND gpu_list_mi300 gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) - if(gpu IN_LIST gpu_list AND target EQUAL 0) + if(gpu IN_LIST gpu_list_mi300 AND target EQUAL 0) add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) set(target 1) From 56a6723126bb6f9d0bbb2513b3044f74a60a2f20 Mon Sep 17 00:00:00 2001 From: illsilin Date: Fri, 8 Mar 2024 13:04:03 -0800 Subject: [PATCH 118/118] re-enable the gemm_xdl_fp16_fp8 on MI100/200 --- example/01_gemm/CMakeLists.txt | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 9ff1040d4d6..2fa8e774621 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -72,12 +72,6 @@ 
add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8) add_example_executable(example_gemm_xdl_fp8_bf8 gemm_xdl_fp8_bf8.cpp) add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp8_bf8) -list(APPEND gpu_list_mi300 gfx940 gfx941 gfx942) -set(target 0) -foreach(gpu IN LISTS GPU_TARGETS) - if(gpu IN_LIST gpu_list_mi300 AND target EQUAL 0) - add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) - add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) - set(target 1) - endif() -endforeach() +add_example_executable(example_gemm_xdl_fp16_fp8 gemm_xdl_fp16_fp8.cpp) +add_example_dependencies(example_gemm_xdl example_gemm_xdl_fp16_fp8) +