From 733d2f4a3da53e140bf58262c49d6b4c6416e6a4 Mon Sep 17 00:00:00 2001 From: amd-ruitang3 Date: Sat, 20 Dec 2025 15:44:57 +0800 Subject: [PATCH 1/5] bf16_gemm_clean_in_kl --- aiter/ops/gemm_op_a16w16.py | 13 +- aiter/tuned_gemm.py | 5 +- csrc/include/asm_gemm_a16w16.h | 1 + csrc/include/rocm_ops.hpp | 57 ++-- csrc/py_itfs_cu/asm_gemm_a16w16.cu | 291 +++++++----------- hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv | 34 +- ...p32bf16_tn_128x64_bshuffle_splitk_clean.co | Bin 0 -> 28656 bytes ...p32bf16_tn_160x64_bshuffle_splitk_clean.co | Bin 0 -> 32720 bytes ...fp32bf16_tn_32x64_bshuffle_splitk_clean.co | Bin 0 -> 16432 bytes ...bf16gemm_fp32bf16_tn_32x64_splitk_clean.co | Bin 0 -> 18512 bytes ...fp32bf16_tn_48x64_bshuffle_splitk_clean.co | Bin 0 -> 18464 bytes ...bf16gemm_fp32bf16_tn_48x64_splitk_clean.co | Bin 0 -> 20544 bytes ...fp32bf16_tn_64x64_bshuffle_splitk_clean.co | Bin 0 -> 20496 bytes ...bf16gemm_fp32bf16_tn_64x64_splitk_clean.co | Bin 0 -> 22576 bytes ...fp32bf16_tn_80x64_bshuffle_splitk_clean.co | Bin 0 -> 22560 bytes ...bf16gemm_fp32bf16_tn_80x64_splitk_clean.co | Bin 0 -> 24640 bytes ...fp32bf16_tn_96x64_bshuffle_splitk_clean.co | Bin 0 -> 24592 bytes ...bf16gemm_fp32bf16_tn_96x64_splitk_clean.co | Bin 0 -> 26672 bytes op_tests/test_gemm_a16w16.py | 3 +- 19 files changed, 179 insertions(+), 225 deletions(-) create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co create mode 100755 
hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index e9f86a5cf9..3f36cde219 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -20,6 +20,7 @@ def gen_gemm_a16w16_asm_fake_tensors( A: Tensor, B: Tensor, out: Tensor, + semaphore: Tensor, bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, @@ -37,11 +38,18 @@ def gemm_a16w16_asm( A: Tensor, B: Tensor, out: Tensor, + semaphore: Tensor, bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, bpreshuffle: bool = False, -) -> Tensor: ... +) -> Tensor: + ... 
+ + +@functools.lru_cache(maxsize=1) +def get_semaphore_workspace() -> Tensor: + return torch.zeros((16, 64), dtype=torch.uint32, device="cuda") def gemm_a16w16( @@ -52,4 +60,5 @@ def gemm_a16w16( splitK: Optional[int] = None, kernelName: Optional[str] = None, ): - return gemm_a16w16_asm(A, B, out, bias, splitK, kernelName) + sema = get_semaphore_workspace() + return gemm_a16w16_asm(A, B, out, sema, bias, splitK, kernelName) diff --git a/aiter/tuned_gemm.py b/aiter/tuned_gemm.py index c7c8b5994f..db96a98bd6 100644 --- a/aiter/tuned_gemm.py +++ b/aiter/tuned_gemm.py @@ -24,7 +24,7 @@ import torch.nn.functional as F from torch import Tensor -from aiter import dtypes, gemm_a16w16_asm, hipb_create_extension, hipb_mm, logger +from aiter import dtypes, gemm_a16w16_asm, get_semaphore_workspace, hipb_create_extension, hipb_mm, logger from aiter.jit.core import AITER_CONFIGS, AITER_LOG_TUNED_CONFIG from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.jit.utils.torch_guard import torch_compile_guard @@ -392,7 +392,8 @@ def asm_gemm( out_asm = torch.empty( inp.shape[0], weights.shape[0], dtype=otype, device=inp.device ) - return gemm_a16w16_asm(inp, weights, out_asm, bias, splitK, KernelName, bpreshuffle) + sema = get_semaphore_workspace() + return gemm_a16w16_asm(inp, weights, out_asm, sema, bias, splitK, KernelName, bpreshuffle) def triton_gemm( diff --git a/csrc/include/asm_gemm_a16w16.h b/csrc/include/asm_gemm_a16w16.h index c7788bb3ec..26a207882c 100644 --- a/csrc/include/asm_gemm_a16w16.h +++ b/csrc/include/asm_gemm_a16w16.h @@ -6,6 +6,7 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 torch::Tensor& B, // B:[N, K] bf16 torch::Tensor& out, // Out:[M, N] f32 + torch::Tensor& semaphore, std::optional bias, std::optional splitK, std::optional kernelName, diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 135e8ae03b..5530e1f3a2 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -456,6 +456,7 @@ 
namespace py = pybind11; py::arg("A"), \ py::arg("B"), \ py::arg("out"), \ + py::arg("semaphore"), \ py::arg("bias") = std::nullopt, \ py::arg("splitK") = std::nullopt, \ py::arg("kernelName") = std::nullopt, \ @@ -1537,34 +1538,34 @@ namespace py = pybind11; #define GEMM_COMMON_PYBIND \ m.def("get_padded_m", &getPaddedM, py::arg("M"), py::arg("N"), py::arg("K"), py::arg("gl")); -#define TOP_K_PER_ROW_PYBIND \ - m.def("top_k_per_row_prefill", \ - &top_k_per_row_prefill, \ - py::arg("logits"), \ - py::arg("rowStarts"), \ - py::arg("rowEnds"), \ - py::arg("indices"), \ - py::arg("values"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ - py::arg("stride1")); \ - m.def("top_k_per_row_decode", \ - &top_k_per_row_decode, \ - py::arg("logits"), \ - py::arg("next_n"), \ - py::arg("seqLens"), \ - py::arg("indices"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ - py::arg("stride1")); \ - m.def("top_k_per_row_decode_fast", \ - &top_k_per_row_decode_fast, \ - py::arg("logits"), \ - py::arg("next_n"), \ - py::arg("seqLens"), \ - py::arg("indices"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ +#define TOP_K_PER_ROW_PYBIND \ + m.def("top_k_per_row_prefill", \ + &top_k_per_row_prefill, \ + py::arg("logits"), \ + py::arg("rowStarts"), \ + py::arg("rowEnds"), \ + py::arg("indices"), \ + py::arg("values"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode", \ + &top_k_per_row_decode, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode_fast", \ + &top_k_per_row_decode_fast, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ py::arg("stride1")); #define MLA_METADATA_PYBIND \ diff --git a/csrc/py_itfs_cu/asm_gemm_a16w16.cu b/csrc/py_itfs_cu/asm_gemm_a16w16.cu index 
b627f06af9..2f57864511 100644 --- a/csrc/py_itfs_cu/asm_gemm_a16w16.cu +++ b/csrc/py_itfs_cu/asm_gemm_a16w16.cu @@ -9,16 +9,15 @@ #include #include -// start to prepare the input and output buffer struct __attribute__((packed)) KernelArgs { - void *ptr_D; + void* ptr_D; p2 _p0; - void *ptr_C; + void* ptr_C; p2 _p1; - void *ptr_A; + void* ptr_A; p2 _p2; - void *ptr_B; + void* ptr_B; p2 _p3; float alpha; p3 _p4; @@ -50,10 +49,12 @@ struct __attribute__((packed)) KernelArgs p3 _p17; unsigned int is_out_b16; p3 _p18; - void *ptr_Bias; + void* ptr_Bias; p2 _p19; unsigned int add_bias; p3 _p20; + void* ptr_semaphore; + p2 _p21; }; std::tuple @@ -64,6 +65,7 @@ get_heuristic_kernel(int M, std::string arch_id, bool bpreshuffle, int add_bias, + int clean = 1, std::optional splitk = std::nullopt, std::optional kernelName = std::nullopt) { @@ -72,7 +74,7 @@ get_heuristic_kernel(int M, HIP_CALL(hipGetDevice(&dev)); HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); uint32_t num_cu = dev_prop.multiProcessorCount; - // printf("num_cu: %d\n", num_cu); + uint32_t empty_cu = num_cu; uint32_t pure_tg_num = 0; uint32_t round = 0xffffffff; @@ -84,41 +86,44 @@ get_heuristic_kernel(int M, for(const auto& el : *cfgs) { - if (el.first.find(arch_id) != 0) + if(el.first.find(arch_id) != 0) continue; const auto& cfg = el.second; if(kernelName.has_value() && el.first != (arch_id + kernelName.value())) continue; if(kernelName.has_value()) { - TORCH_CHECK( - N % cfg.tileN == 0 && - cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && - (add_bias == 0 || cfg.bias == 1), - __func__, - " the specified kernel name ", el.first, - " cannot support the input shape (N=", N, ", tileN=", cfg.tileN, - ") or bias/preshuffle setting (preshuffle=", bpreshuffle, - ", bias=", add_bias, ")." - ); + TORCH_CHECK(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 
1 : 0) && + (add_bias == 0 || cfg.bias == 1), + __func__, + " the specified kernel name ", + el.first, + " cannot support the input shape (N=", + N, + ", tileN=", + cfg.tileN, + ") or bias/preshuffle setting (preshuffle=", + bpreshuffle, + ", bias=", + add_bias, + ")."); } - if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && (add_bias == 0 || cfg.bias == 1)) + if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && + (add_bias == 0 || cfg.bias == 1) && clean == cfg.clean) { - // 1. select splitK int split_K = 1; if(splitk.has_value()) split_K = splitk.value(); - else if (cfg.splitK == 1)// auto select + else if(cfg.splitK == 1) { - pure_tg_num = - ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); // M-orient support OOB + pure_tg_num = ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); if(pure_tg_num < num_cu) { - TORCH_CHECK(cfg.subK > 0, __func__, " cfg.subK must be greater than 0 to avoid division by zero."); - int max_split = std::min( - std::min(static_cast(num_cu / pure_tg_num), 16), - static_cast(K / cfg.subK) // “K-dim must satisfy min 128 bytes. BF16 are 2 bytes each, this means min ele of K is 64.” - ); + TORCH_CHECK(cfg.subK > 0, + __func__, + " cfg.subK must be greater than 0 to avoid division by zero."); + int max_split = std::min(std::min(static_cast(num_cu / pure_tg_num), 16), + static_cast(K / cfg.subK)); for(int i = max_split; i >= 1; i--) { if(K % 64 == 0) @@ -132,16 +137,15 @@ get_heuristic_kernel(int M, } } - uint32_t tg_num = pure_tg_num * split_K; - // 2. 
better or not - uint32_t local_round = (tg_num + num_cu - 1) / num_cu; - float local_compute2mem_effi = cfg.tileM * cfg.tileN / (cfg.tileM + cfg.tileN); + uint32_t tg_num = pure_tg_num * split_K; + uint32_t local_round = (tg_num + num_cu - 1) / num_cu; + float local_compute2mem_effi = + static_cast(cfg.tileM * cfg.tileN) / (cfg.tileM + cfg.tileN); bool is_earlier_round = (local_round < round); bool is_same_round = (local_round == round); bool has_sufficient_empty_cu = (empty_cu > (local_round * num_cu - tg_num)); - bool has_same_empty_cu = empty_cu == (local_round * num_cu - tg_num); + bool has_same_empty_cu = (empty_cu == (local_round * num_cu - tg_num)); bool has_better_efficiency = (local_compute2mem_effi > compute2mem_effi); - // printf("oob %d, tielM: %d\n", oob, cfg.tileM); bool less_oob = (M % cfg.tileM == 0) ? (oob > 0) : (cfg.tileM - M % cfg.tileM < oob); bool has_same_oob = (cfg.tileM - (M % cfg.tileM)) == oob; @@ -153,7 +157,6 @@ get_heuristic_kernel(int M, compute2mem_effi = local_compute2mem_effi; oob = (M % cfg.tileM == 0) ? 
0 : cfg.tileM - (M % cfg.tileM); selectedKernelName = el.first; - // printf("Selected Kernel: %s\n", selectedKernelName.c_str()); selectedsplitK = split_K; } } @@ -163,175 +166,101 @@ get_heuristic_kernel(int M, return std::make_tuple(selectedKernelName, selectedsplitK); } -torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 - torch::Tensor& B, // B:[N, K] bf16 - torch::Tensor& out, // Out:[M, N] f32 +AiterAsmKernel* get_or_load_kernel(const std::string& selectedKernelName, + CFG* config_map, + unsigned int& SUBM, + unsigned int& SUBN) +{ + static std::unordered_map> impl_ptr_map; + + auto it_kl = config_map->find(selectedKernelName); + TORCH_CHECK(it_kl != config_map->end(), __func__, " not find kernel~ " + selectedKernelName); + + const auto& cfg = it_kl->second; + const char* name = cfg.knl_name.c_str(); + const char* co_name = cfg.co_name.c_str(); + SUBM = cfg.tileM; + SUBN = cfg.tileN; + + auto result = impl_ptr_map.emplace(name, nullptr); + if(result.second) + result.first->second = std::make_unique(name, co_name); + + return result.first->second.get(); +} + +torch::Tensor gemm_a16w16_asm(torch::Tensor& A, + torch::Tensor& B, + torch::Tensor& out, + torch::Tensor& semaphore, std::optional bias, std::optional splitK, std::optional kernelName, bool bpreshuffle = false) { - TORCH_CHECK(out.dtype() == torch::ScalarType::Float || out.dtype() == torch::ScalarType::BFloat16, + TORCH_CHECK(out.dtype() == torch::ScalarType::Float || + out.dtype() == torch::ScalarType::BFloat16, "GEMM A16W16 asm only support Float32 or Bf16 output now!"); - + std::string arch_id = get_gpu_arch(); - // 1. 
prepare args - int Mdim = A.size(0); - int Ndim = B.size(0); - int Kdim = A.size(1); + int Mdim = A.size(0); + int Ndim = B.size(0); + int Kdim = A.size(1); - unsigned int SUBM = 64; + unsigned int SUBM = 32; unsigned int SUBN = 64; - float alpha = 1.0; - float beta = 0.0; - int szA = Mdim * Kdim; - int szB = Kdim * Ndim; - int szC = Mdim * Ndim; - int szBias = 1 * Ndim; - int sz_A_pad = 0; - int sz_B_pad = 0; - int sz_C_pad = 0; - int strideD0 = 0; - int strideD1 = 0; - int strideC0 = 0; - int strideC1 = 0; - int strideA0 = 0; - int strideA1 = 0; - int strideB0 = 0; - int strideB1 = 0; - int is_out_b16 = 0; - int add_bias = bias.has_value() ? 1 : 0; - // A row major, B col major, C row major - strideA0 = strideA1 = A.stride(0) * A.element_size(); // in bytes - strideB0 = strideB1 = B.stride(0) * B.element_size(); - const auto elem_bytes = out.element_size(); - strideC0 = strideC1 = strideD0 = strideD1 = Ndim * elem_bytes; // inbytes - if (out.dtype() == torch::ScalarType::BFloat16) - is_out_b16 = 1; - - szA += sz_A_pad; - szB += sz_B_pad; - szC += sz_C_pad; - KernelArgs args; - size_t arg_size = sizeof(args); + KernelArgs args = {}; args.ptr_D = (void*)out.data_ptr(); - // args.ptr_C = bias.has_value() ? (void*)bias.value().data_ptr() : nullptr; - args.ptr_C = (void*)NULL; - args.ptr_A = (void*)A.data_ptr(); - args.ptr_B = (void*)B.data_ptr(); - args.ptr_Bias = bias.has_value() ? (void*)bias.value().data_ptr() : nullptr; - args.alpha = alpha; - args.beta = beta; - args.stride_C0 = strideC0; - args.stride_A0 = strideA0; - args.stride_B0 = strideB0; - args.M = Mdim; - args.N = Ndim; - args.K = Kdim; - args.is_out_b16 = is_out_b16; - args.add_bias = add_bias; + args.ptr_C = nullptr; + args.ptr_A = (void*)A.data_ptr(); + args.ptr_B = (void*)B.data_ptr(); + args.ptr_Bias = bias.has_value() ? 
(void*)bias.value().data_ptr() : nullptr; + args.alpha = 1.0f; + args.beta = 0.0f; + args.stride_A0 = A.stride(0) * A.element_size(); + args.stride_B0 = B.stride(0) * B.element_size(); + args.stride_C0 = args.stride_D0 = Ndim * out.element_size(); + args.M = Mdim; + args.N = Ndim; + args.K = Kdim; + args.is_out_b16 = (out.dtype() == torch::ScalarType::BFloat16) ? 1 : 0; + args.add_bias = bias.has_value() ? 1 : 0; - // args.stride_D0 = 25; - // args.stride_D1 = 80; - // args.stride_C1 = 3; - // args.stride_A1 = 124; - - // 2. select kl - static std::unordered_map> impl_ptr_map; - AiterAsmKernel* impl_ptr = nullptr; - CFG* config_map = &cfg_bf16gemm_fp32bf16; - - // 2.1 static dict + CFG* config_map = &cfg_bf16gemm_fp32bf16; std::string selectedKernelName = kernelName.has_value() ? arch_id + kernelName.value() : ""; int selectedksplit = splitK.value_or(0) ?: 1; - if(!kernelName.has_value() || kernelName == "" || !splitK.has_value()) + if(!kernelName.has_value() || kernelName.value_or("").empty() || !splitK.has_value()) { - - auto it_sel = get_heuristic_kernel(Mdim, - Ndim, - Kdim, - config_map, - arch_id, - bpreshuffle, - add_bias, - splitK.has_value() ? splitK : std::nullopt, - kernelName.has_value() ? 
kernelName : std::nullopt); - selectedKernelName = std::get<0>(it_sel); - selectedksplit = std::get<1>(it_sel); + auto [name, split] = get_heuristic_kernel(Mdim, + Ndim, + Kdim, + config_map, + arch_id, + bpreshuffle, + args.add_bias, + 1, + splitK, + kernelName); + selectedKernelName = name; + selectedksplit = split; } - - args.splitk = selectedksplit; - // printf("=== KernelArgs Important Parameters ===\n"); - // printf("ptr_D: %p\n", args.ptr_D); - // printf("ptr_A: %p\n", args.ptr_A); - // printf("ptr_B: %p\n", args.ptr_B); - // printf("alpha: %f\n", args.alpha); - // printf("beta: %f\n", args.beta); - // printf("stride_D0: %u\n", args.stride_D0); - // printf("stride_D1: %u\n", args.stride_D1); - // printf("stride_C0: %u\n", args.stride_C0); - // printf("stride_C1: %u\n", args.stride_C1); - // printf("stride_A0: %u\n", args.stride_A0); - // printf("stride_A1: %u\n", args.stride_A1); - // printf("stride_B0: %u\n", args.stride_B0); - // printf("stride_B1: %u\n", args.stride_B1); - // printf("M: %u\n", args.M); - // printf("N: %u\n", args.N); - // printf("K: %u\n", args.K); - // printf("splitk: %u\n", args.splitk); - // printf("is_out_b16: %u\n", args.is_out_b16); - // printf("add_bias: %u\n", args.add_bias); - // printf("=======================================\n"); - - auto it_kl = config_map->find(selectedKernelName); - if(it_kl != config_map->end()) - { - const auto& cfg = it_kl->second; - const char* name = cfg.knl_name.c_str(); - const char* co_name = cfg.co_name.c_str(); - SUBM = cfg.tileM; - SUBN = cfg.tileN; - auto result = impl_ptr_map.emplace(name, nullptr); // insert new kl. - if(result.second) // emplace successfully - result.first->second = std::make_unique(name, co_name); - impl_ptr = result.first->second.get(); - } - else - TORCH_CHECK(false, __func__, " not find kernel~ " + selectedKernelName); - - // 3. 
launch kl + args.splitk = selectedksplit; + AiterAsmKernel* impl_ptr = get_or_load_kernel(selectedKernelName, config_map, SUBM, SUBN); const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(A)); const hipStream_t stream = at::hip::getCurrentHIPStream(); - int bdx = 256; int gdx = (Ndim + SUBN - 1) / SUBN; - int gdy = ((Mdim + SUBM - 1) / SUBM); - int gdz = 1; + int gdy = (Mdim + SUBM - 1) / SUBM; + int gdz = selectedksplit; - if(selectedksplit > 1) - { - out.zero_(); - // HIP_CALL(hipMemsetAsync(out.data_ptr(), 0, elem_bytes * szC, stream)) - int k_per_tg = Kdim / selectedksplit; - gdz = selectedksplit; - } + TORCH_CHECK(gdx <= 16, __func__, " gdx (", gdx, ") must be <= 16"); // 16 = 512/32 - // printf("argsize: %zu\n", arg_size); - // printf("gdx: %d\n", gdx); - // printf("gdy: %d\n", gdy); - // printf("gdz: %d\n", gdz); + // semaphore.fill_(selectedksplit); + args.ptr_semaphore = (void*)semaphore.data_ptr(); - impl_ptr->launch_kernel({&args, - &arg_size, - gdx, // gdx - gdy, // gdy - gdz, // gdz - 256, // bdx: 4 wv64 - 1, // bdy - 1, // bdz - stream}); + size_t arg_size = sizeof(args); + impl_ptr->launch_kernel({&args, &arg_size, gdx, gdy, gdz, 256, 1, 1, stream}); - // 4. 
return out return out; } diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv index 3e58803dd2..63698c39bc 100755 --- a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv +++ b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv @@ -1,11 +1,23 @@ -knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias -_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1 +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias,clean +_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1,0 
+_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1,0 +_ZN5aiter49bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co,1,128,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co,1,32,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co,1,48,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co,1,64,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co,1,80,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co,1,96,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_splitk_clean.co,1,96,64,0,0,1,64,1,1 +_ZN5aiter49bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co,1,160,64,0,1,1,64,1,1 
+_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_splitk_clean.co,1,32,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_splitk_clean.co,1,48,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_splitk_clean.co,1,64,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_splitk_clean.co,1,80,64,0,0,1,64,1,1 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..4fef26f0ea862281f562112f211870f6e979780f GIT binary patch literal 28656 zcmeHQeQ;FQb-!9IVi5v_&4&QNE?CCIg=K{#By2twA+QVr%MP{y6E7=i#mW|vyjo;% ziFjH|vaBdVRusie)7Wur$N6y5q;}&pskHH=P0}$PaMGzgwL2u!%%n3(>$*uNXj9N$CRoJaFz*CAuALz{Adb{Ef+GoF%y5h zPsj>&Kg{*Dh_Jubmx&!hWb5+fjy#@=#P{N-{MGxr<7e?EyubH)JhkSxlOK-veqT5i zUr$UJF9Y36ggT8vWU~qXeKBJ`!kznTZmGYMvBX!yCz_(2VgLP+SX-pE^Rc04{NdQ~ z&NoB2aQS>kxJfyA zEKbigoDT62DpLG(N96M@Wt+??evwbHy{+~16=%3h9jwrDuy>tdk;UovJ;R-9hCa)| z-gSoSEKa}g8CI(q1}q1A*BMq>oTig^smj* zzitNpPd7y3=>@2tK>&RL4rb|pC`05T(XUe1UoXAmW0A)d}Qz;oFK_{t0dsD-$k5q~~| zC>aa!Lbd^3$~M4vW)MIwMD^v2NX{Ti#zK5A+WMEFumXQ=&TJk(IS`SNo3W)LWIIZCpQurBKe>)%g=nXxKQSGR;aFRxob9P#~; zKj(9*EN<_1TmKx9QK5#6qX*y5`?au8=W}YZj!>I*gu3?=!8~j4k*<5bBOVKNMoxq~ zqV2KB#VLBYoX|8akuIP_{;h;6!;0HMT~=ZM(ni?})W@h2x~2 zGG+FHKNfj_2-1ure}b+pIXPeUcOLJEg&NyWw#7TX;O{(rqM^O@^+*dm6 zc^CVd|G{uqq&e13ONkW;DsHzc669?`FY8D z!a?Tp4Z7UEl-s>FMQfFExdbxCv7hDnCTZWr#wT*wrzUb|1-j2BhbAwu(TNLen6^RM z`e}Q1BE>#2flN!Wa}qx$@ezq%;J7fgkgvhK)I5%JQgb-YO3mVUed>CSH0}|6(?a7o z(m0MZjw6lZxO~#BHQmU_33ZeOz9*`C-wT;b$Rx>)bzj~C=5x6|8AvCmJqa`R@V zT-S7yluEf40nesfOS)5M8C$i?JL;wPw7DmLat-+vuxl?eZw`Iy*7s_`HpY6H$2a2f z6u6faMVF?$qYlONt|FhJqUe$&VXxRv76}~Tw&hXp^5UqscvaN9$`|$e)-CL#ecqk~TUX*;w=Q}EOIYVIaHV-9#POfZW*!MKk3M?Ot7uMc 
zS_Ah^#-3pLYZLkTCGNTFqL?4n*iUQauqP&v6FaK)(~Po*AZeqC4~4aBs}>g=lJ>*`1%S8O7iCBM;|6RPYwztM4m^r_nz;J9)*iJl5Co2>G!==0&T zSm71t3-5*IjJ)CJi;$l$LVmso`S~K`=Zlb^FG7C42>JOU&ctkNuS3GhfVGvOz?B^>;%s7faj$GUPoSfiE13xC#nlR z+IL*vq0jO}H!D1N>FB+4oQ?%UlPMS0Je9h44&{I=zZ=)Md*|k_b?3g^J(8a+QTC8# z4{`h7lKp!n$8ryKhjWi~H{>4eZp^*6`&jNVk{d{FBDtC5D9QJc+)8pA$sHueNRE@- zMe-?|_jQD}qqN;i z+cDZ2XltUanYJix_tDl$TN`a1w8dzP)7C}XDcT;Q?F?<*v^lgTXzQh|pSA(o25B3j zZJ4$Z+D2&`qiwv~@t<*6;q8|evmE-Y(0&dTC6bpY8BH9?OO=eqj^rXG)8_{AG9}~T z4U(5D84o*uUsAUn9u+8bQ|A2(rFLko7f!tgjJdeT^XNYXn(eBgpz1LDts@@|D(TK(5h{u|@&% zqw~b`2>bdPLDts@vc5);^)-U5uMuQ@jUel51X*7r$od*V*4GHKzDAJsHG-_K5oCRh zAWySKE7((XUH{yKx_&P6ezPZjZfKIa-pPt0?;~E>BQBb^llO>>z4K4kQ%VK}x|xg1 zo}j=2=H_x?P~ak#$K|4+z+&d%vM(sGgynO2T~Oe9R>0*zP+&Qm&*iG1z^z3I_v99L z?_?|}=D42yr-~@TBU{YWW#`VsT!n`HE2ozGA4>{s!nocK$#Gh~LEpbY>b1Oc_HzGdQebNpj$isyaeOTw zp-)suy_T0w9ry1@3XEf1?-n^u%U9_8w@JO0x6T3Xe<~^P%vKzK=i9}p)n5Mj^{&aE zT<@&!!!xeUJz&W1kd`}e z|Gk{g`9?mEpg!Xoq~-G{>N9kDK8tHgP+nVjeI7%7#yBIN$5Ef5*YjCigKFiqh}Y*4 z)Mtz{@_7{X8G1dR#Wky5Ub}dG9z%V`I3u6OQJ&lqRq^C;>w z^m;yvYvM6^ZRGWN4D}h~jC>wPeTH7oXK@XU%4;dF&m*YM7-!`3DC#rxdOnM5Zil?~ z^7=f6`iyZ#K98e5L$BwvxJI9n*J@s$N2aUK)R!S|M^$~M{!Gv3v1#h_xKW=+WPKi$ z^?6Lz=W$t|M`V2-mGyZ{*5`3qpGRbU9+mZZOxEXdtv=tCmsy`3)Mwm#X!)E#eTGiU zXW4Hs*>CXr+>82*aYjD(qdr5g<+JQJJhI>5_1Qsv#yBIN6R6M7YxykujY8RP@cP_~ z`iyZ#KKG+OL$Bqt>^F*JzrpLXgZhkdMm{G{pP|?CS@s(~*>CXr+>82*aYjD(qdr5g z<+JQJ*2#WjJ?g82`iyZ#J||G0q1Wa!#3b3)eVURj^}Wqo#JeNM>w+$-yIzpT%Wtj`HqpL=C} z?$_$`Q?oMjc>wj<>^D%K?fphj_8YuD51~F|oRQDNsL#-A`7Ha5TG?;#`aFR8jB!Rj z528LpujRAsH|k}-!Rzx7>NCa}`872T-3e&dBFM)Mx0me3t#j zG1+hM`aFdCjB!Rj52HRqujRAsH=?rNXyyF|>NCa}`8C{RXekL#WRf zXXNuR>NE6OKFfaNlhr%D_k$y-&$u>e^?4Na89J>#oA6 zanxt%^?VlnM*7|t{qG3sGsYSDJc{}Zy`Ima-$>sp!+#9*8RLw69!GtKUe9OIZ=~<7 z(f^L1K4YAb&!ec%(Chgu`i=CxIQ+*@pE1tJ=W*0$==FRS{YLuU9r-+h`iyZ#K98b4 zL$Bwv=r_{$`tTn^ea1K=pT|+3q1W?S^c!hz0R6&8={l+x)dEBVaBeFh^ z%KAJe>+`s*&m*!vkIMQyChPOKtj{B|K99=!JSOY&xK^KUqI~}T?-Ez!IBZ;ojq^ZG 
z+c*xB%fL7r=YgEGaU3=-!^U~Q-^Ov6Tn5J3I1l8cjpML!88*%X{x*)oF_KFwVw#AkSfpIp@19@iSIBZ;ojq`xNjpH!642-jJ9>}{ZaU6O+>s$x?Y#fK4&)_|f*tiTE=Yc%6aU3R>fpIp@19@iS zIBZ;ojq`xNjpH!642-jJ9>_Br$6@0#Y@7%DZ5)TmWni3*^FW^2I1U?^VdFgDZ{s*j zE(7CioCosqN*srt&k5B_6W0O1UP}+H=QFqt-fvt5j-w?fvp$;~hwL{pxQsyy=Yc%6 zaU3R>fpIp@19@iSIBZ;ojq`xNjpH!642-jJ9>_Br$6@0#Y@7%DZ5)TmWni3*^FW>r zSU3(Fmto^P;BVtNOfCcCY@7%3@=6?sp3j3C*8#sF3&)}7Gq?`kZ(IeA<3)PzNdL~; zWjCxCG_&I5U7<2cOc3NX&b zc|dRDILzk|FwVw#AkS!U?@{g|>G^qjrWeo3FxL9o zI>y!q1#WNz`jY}TvW46p2nsB91a3+SEMsf9y*wze!V$PRDX@~2aeGxz;1);VO-X?_ zvl?#S8WgzA5x6}ma0k1S+k-)YI~{@5NrAgqGq=|S1@3kP?nw%~g+0jawLyWmIs*45 z1@6N-6w$Lnd32uetn(s|c%~eBpVu?_yA(aoOzX6Y&cUKwdJfVlf789bkZ98Ki|!q? zo{%>fvfq$58gjspOAUFGA(t6)xgl2=@@7M>G~_Bn-eSl%8S>4Byw#Am8S-{R-eJf= zL*8k~)rP#wkZTNiw;}H_pv!nL#L^NnZI>AC+x{#nO!#l~lSE33V*{;-Rv z@zlljOY>Yr(O>i|ew#tR;U1yiKzgF+xxSA5^5U#H!`$xwh_L$&zl|ZG->CWR*Zcxu zVGkI7rHw*gs`(wz{5C~|eUsr=c3kMoG{4(4zw#DgFE{)u?iczB&97eb+k8UUHyeJH z?LuFv`Q5JhRXrf=RfgY|PNCnT`Q4%U-E>meZ!-LDeo*Ld*8D!K`E5Nd>{|`LZGSBE z+cdv}n&0-13j21$Z^v1o-=X>4srdzaggt2Z?R;41cWQojX@1px!d`9o?RrG$cWHii zYkoD43VV&=xBFv4zgzP=r1|anxUlar{BAiX^tWh!hc&<2$ArDs@VoUBLVv5~cSQ5s z`$=KnYxwQ^l+f?f{O*x{jD7mG>+7i0dk7EEhj}3({ZPwUMtBFwYX}dLd?VpqB$p8$ zBKc;*BU7%mb@E!f-|_Hk(}ASG+vK&jUS4Z&cb0Mc9Z7*7me<;Y@>+YRb0fFkl@xfl zyw)C)*V@C*&D?$@De#{6e61T$*Sgv_-E~^+b4cEA$OjDhHbbsAAGpt!!HVsBAANo7PJz+w;n%^_9x@tg>l6rm{VuY+ApmY>z9O)_W@3 zW6GxWp~^O>Y+6sMY!54&)}Jcd8D-OYRb@M=Y+B!{Y;DS>^{~ozT-mgKR@sgzo7USZ z+hJwX`dnqZL)o;RSK0O{o7Vp-TaB`5y|A)vQ#P$HR<_N`ruE3mwo%!%ep%VpDVx?i zE8A*i)B0#-Tdr(cPpxd%Dx20{D_eoGX}z|x%~Ce4?}iQcVd%X-Mc>J)dOViRq#gz8x zV*WQ>Z(z$01EvB^FVyYf8kLrZ#A#-yQlj*-* zQh4T4!Fo46-*@KHynw*@95{}};Liu`90kjJC~KR>|B;s0ms z*YBj-v0E?g^xX}A#Z>?)E2wK}qkk>rCLjG9BEGu18sDaXpZ@c$+t}LPc03Xbb%f(l z{IR`IXL~3fr9Z|O>p0OFY7Do9I!|_Vw8!E!qVfLl@s_sZWRHZJA`K^xhg#a2+e6Lm zu~4|RHPqf2>WH;Bx3oq&H+IGvH@3ERo!CH{`y!2TF8BgXP2sX|Q**`U%FUZ2fpEAn z&=e>utt@MdG?i6WY>AZ8KQPkR(D)%W>#2yJt1$Y}$C*FMwBb>7=*#ilwf0)d_h+*U zhJKCIk8IEe)2C;d 
zo}Xlj^JGP){vkQO;Fe5%pVY70oT(p?`nT@R)c>i}Cm+bvKP&aW2xscQF7@narv5up z|Au&jY2){${s)HsRjGgFWO^vyc~gx)mZ|@_)W2Zp|5NJE9M2s8j?}-_n5mz`6KFR3 zo}phP^-V?``g3mdn$3>3X2!Wu>T3;sh15T1tY=W_3u2jZ{tkMVhpB(>NLlxh-u8DL zA0@r*?>aJnTYEgh{7t9Z=r2_=|M9ky{%E)}%H;o{kH?rl*4`A3hnYVfITh!l!zWrA zNgZj8wbMtusfj+|%-`640)O0-;jdoC!wrN{@)mmnS!=i<(i#c`D)yvzt7_7_)#d5k z-IsT(E7RKQ%VTTO14?&Rr~#$BtJLnMz~$Y`+RE}WHJ~i8Q|(q)?vlHurF#NOTeGKH z?e3~dYpeI9r&GOWlXBa%Cmpw@T4@6{JJa*sbNLJX)Y zEh<=P>25XO(oKP~?EW5RcHa1FnxUG*2X^kiRTMIneltd-^K9Dib3y)6I}+8uZ>7xY zx}{|J;wN>=Ok#U6f1=i}Q0npA3Utc9wqF=(n+3jv z-&<-g^ZoB98+68HlPUkNX};K1{tEFmxmr7Y=x^ll19D>X$mpX@&0pouy`)ofr`_~x c-F;$Ly?Nq6AEs0RsX;oR9>90uB%`YQTtysL^4POfrFx#3V%Q()fsg zXsseutJc;SV~jDzG)+^x?UJddd$!%9d(?JM@oeo$d)htSv+ZuzZufM%U1z`F{WILT zxdF{sj%SaX=fM0M2|NDrA{Lh97l-u?0eAD-v<|4k0Vl z{m|Ff4+{J1Pb6ZK5Ls!u+>ys~k@!*kl)w7jJ|%P^x2S_#D;gesvY>okS3Q5UP{(k0j|&_e|U( zKh{;O1|eS!LenII=!4Ljr2nBL{SQyVUk<{S_?7UcGl?krD7<+z*#IY!4M4SbvRA{F z_|@>{$w@>>7>LJ{4e(U50iK;i0C`1hI(tQQP9jReKy)V?pf}k7{gViw2IA}$F*u1R z2?H^lY=GC34e;Gb1ds!<`Ro-DokWy`f%ty10e+BdfHx-*Kn=v%E8^`*L`fKkA152& zOtJxfI*9$Z0lIhSgYdgaL`WEf|4cT(ACnF6 z=Sc+UzCjy=w^G<7Lg=ILR*F062%e-PWK1GLPfJsMUF&|86mO+uP9jL+P|Qs_LRQid zvL_LtudXH7c(65CnZM@jaQG$>C~-J)la8<~=?E)7PK3#^D&3pwLM>;<%`cAl@yMUI zQi?5Z=UdkI5*Zb&OgMVruUs7an;Q={(cx8BZjZsL`f#XW>v!^+n(Nv^t)w2; zWj37G9KM?f(u5;_iE2wq$}4#-wN1^zs>Xv2txYfGwcLB4va$ZXKvI3l+h0xGS3K-n zF7{g9J)yR6O>-j+B}OEmxZSpJb4y)g!zo2POFq~HnCGMtORRWy+29(&*~E+oBu^iZ zdbC#?DEdO74?y?*pD(%S{`o)pV=C=q!v{1qYzd#F#(*W|O(U4lmGcXUy^qT01~xy8 z*?p-Q>?r;4>N$a}-G$V-**8z8#8QsOV)P6r%@rM^gFA%3Ba9OAlQY#7?Ibxs7$tNF z9YQw66%7ys2+JX-q(yx|S6b8$OidHR;ZBP}WHVgROd2kS(1&50M(qd#vnUSa^t5O> z;X};jd&uSX#oX?tF&e9w%O#L8j>lP=ZWTfW((L&W_FIV=yB&gX6T=G>%hZQ#dY(E#XM*9>9|p zYR8eu#4Vd-V}Q3cI@4Z4U8RS z9^XliC&N8ID>5(UJ=IAuy$i`FD=TtIl<=t7k7fxx$!!ZF-UZnaZ}!57ccCxh^(~Hg z7w1O2xl1G7rOP7TWh>6c%JLpTyi20KQ@*3VQ(m9s`4rvYd5T`*Y%4{pk?> z==>!5(;@nEjK*Lg_03IV;@-sAb1Z#nM|yg$d-}=<`iU`LLu2Q#=T74qxyG9|n#zvF 
z&%vK!9ixtGu-$PD(KbxmNIS;+V)9{cw~uA)aXY{n5@$-BB{5UtY>9IuULgqIR7Aj~GblrV=dn{Xjv z4xyKDA)$}ZOSp*8N4S`95#eQoiwScHFC$z+m`k{na0%h%gi8sR5nfKXoNyW83c}@t z@#{B>@+RANY98t5(fLGwvk1}O`Gn~20z&lnQbP1Mn-KlYAw+){5~9CeLiE>1i2g1j zM1L0(qQ93BqQAL>=6=UUJ#U^9%e&HX=w=za($6tx0p$eUuVU=g(7%>pn!fb7@6o~xPrFn2 z+!6P+9=ew}qcM*s{lnOOH@ate+NaPp;q|!a`T6m8-P2vnJ>3&=)4q@E=z6K6@k*b$ z?}#TOQaJO;&lk>2uSX2`3=f++eI`5NN$2_$=txc_oH`}Y4)oDIqR8hxb*cCDBZWtv zJ5uCjqs6{UkNJ@EMP6|}>1W~RlaQZJLVi98`S~Q|=aZ11PeOh^3HkXX*#(^p6{U;DqN)2EU)3IYakcx=+%5$w&K+>!)-t`B^*5+%;C;FwJS2_3&uSg>w>% zHB6&x#FgHTn%giveW^S37wsq0$8wduQL{I4`#+QYm$^-;d)x0$t!!^jt!{5gt!Zyc zjgY*L!PijwjSDgY3rk{pSFQ^C-1PsGH)82$5JR4X+MQ-nj~i_852^H z=PMZ#RgxDdner0yrAo$^QAo~KGQPY)a*mRx3VETDG4Uh2SIN_a>{Bwne?s;}N}ev{ z#Y)Z)@?}b%A>>>o&lK_!CC?J_QYB{!`En)C7VI)$Z@XJ#u)qh7(>>_7_vUbko7T!tdB8deT*UN zV+>gzW61g#L)OO_vOdO;^)ZI5k1=F@j3Mh|3|Svz$od#V*2fs~xyHCxj&Z*+#(o?} zUdPzi#~89c#*pthUAA7jY+7(>>_7_vUbko7T!tdB8deT*UNV+>gz zW61g#L)OO_vOdO;Cm7>PSr^Tlo~m*q;&XP! zKK!REC?x^{-OR;hPe5P>b8|T}AaE{A<8oF&U^eq`*%uI)%hI{LEFf?t%iyv7S37f;^(tyCStPc0+@+n70+XCWe@Ux%)s59k_Z2i6mUDuR1vd#OS zQ_f?RM+NReyY6YSopwDz-!nt%wd=~+$^G|41y&T|*t6di$I-4g=({eKdhNP%c5(l` zQGwNH*F9gh)2>J8dvc^+yDpvG+&>Z(SYM3eFZzWzzIMGr-?c>Qwd>Z|!~L700$b6p zd%0|tde@|x^`~o{)eIh53k4Z^FI*B*RFTyyNaYi1ym9t&|C3RHkE31pb+Vn7Ptf;l zmwGL)oJ#KhR8-)zB{=>~b8=#fy!`&;U7S9;(phS^sPmU-J@;O67#M1dCE6l z2gLmu`O^2|8`&}FwEHxBo{tv`2y~-e|I4zSk#7U9O1+kE>;>-c2?)$UUi5zbU2z;E z-}=5O^;*8M7rB3CK;T^X_kUZqGxBZVZ=_z!H}(?u&k6|4Mqc!OkMaUPr;%@ce=GG` zzOk3Nzb_y#7ykV}l zEQEjmd$OI8Zv*d3y_Rq6HSS*=5V#(B(ff1C3x4j!yb1Fx==**p^;*8MuXF!EKwt^l z_5X`(XXM+!Z>3(#H}(zgUm6fthWzOLj|uXP=4!C_{Xym%&Dqp^4*LH8w&&Zx|0d>J zZ^~sn-}Nu)-y>^1uWy5>Z)j)a+Yss-^m@LDIl*2zH{kVc81)V9jC>nGeS==lH!+8Z$hics zZ-c0BXlLZx5b7KBdcKJ{N0Xd;@cK54`i6E!zKx*1L9geVn4=t$a}{3S22tP8&d9eR z)Hmq$d=qmTN6u|{eH%u7LpvkiMo{0N*Yi!xflkP|5U+28sBdUz)VCqjH|X_z6LY4M za_+?I+c4@I+8Oyag8Bx%o^N7~^^}}z@%lD6QGKI%JMwBs)i;{I>-jc3L46xB>f4~K zZ$q-a4a@pABJ10rtZzfIz75OzHX`fWpsa60vc3(=`Zgl#+n}s(L$baN%lbB=)wk1A 
z6Z5ST^-Wtxqi%Jfz8ULjIiF;5KFRA_H|iVO8Tr(9X!W zF4Q;ZwS1HF$xJz)!wIiJju^GROcI#J)y&d9ee)Hmq0e3SD@ zpPWzf`qqv5hIU52^`O2%ujQMZPcD=5NnYPNQQy$c$hR)kH|Vu|lk-WxoKN!l){Xjx zc1FJSpuRz`<(r&O7R&i0uWy~GZ)j)aTNmmZ^jf~j`D8%OCwYDAMtws&Bj0*Z-=Np> zP0lAv<$RLYx6X;`o0?Cm`ex22C#Y{dMt$p)^{q?Rw{BVAdSrd;l=ZDk*0*k1-+E+y z>y-7aOV+n;S>JkOee0CKob_`8I(12ECSV zay}W6^T~QXpG18_J0suvP~V`}@=eYso8)|w*SCJuH?%YIZ2+8Oyafcgf#mTz)Cc|y)7d420eeM37V-}+GB zpx5$E&L>aG`6RDz{ittfXXM)e>KpW0zRCIIQ*u7Z>s#+c^-awuRedw(lM~dp0i(Y4 z%KFwP>s!C9Zv(Qv^~(CzC+l0ktZxIdzV*ub)+g&*zpQTqvcC1o`qn4wTfeMt16qAc zGxBW^^^MOdwD~0J8+7C6lT2|zsDs0(Z)j)a+X(6#^m@LD`DC0s!uo3v^$qQed>cZ2 zgI>=!gKL8SFzOrH8TmGX`Ubt8Zw5Dod>cf4LpvkihEU(2*YnNbvfw|A`i6E!zKx*1 zL9geV!F^%nH zeS==lH!+`#b8+w=Mtws&Bi}|)-=Np?P0T0b+#TkNgA>&^HJ?=V&74n8P~S$3`Zg%* z+mNhp!?M1O$oe)Y>)Vj5Z^N>_jmY{oDC^shtZ&1zzKzKGHYn@ckgRXRvc8RI^=$?5 zlRx=W$2mDl8y9KgERl0Ij?&~J(ay$MB4=zIrHzZUahCA6ag-((iFP*55;Gf ze;Y?>a*=3f<1CQ}HjdK9McOz^_}e&2lZ!+<8)u2UI2T8$=bO$|!q3K0>iGt)68ZLN z;3)4-P0Tlwqm<7P%XKc&##thdY#gP0?ilAH(ay$MA`fgFrF>4Az(v|POZeM3O8Hzf z&PAe~jk81^*f>i095jK8v~ia3w{ev6xoMn>L^~U2i9E1zl=3-i0vBoHEa7kCDCKk6 zI2Va_HqH`xVB;v|bKC?j(#BcB-^NkO=e}_+676i9CGzH69HpLbI#&rl8%L?<8@Nj3 z+oyq}e8QEOZze}+<05UGCGyC|QJP#N+Sxcu1TM1A!dW7Z zY#gP@MWUUJvqT=)I7%BAY2z&6Z{sLUE)wl*oF($0*TPZSxJVmk34a?$X>yThXX7l9 z2R4q<#zopVOZeM3N|TF3I~!+-Jg{+;HZIb}S;F7OQJP#N+Sxcu>`cPoB2_1Gxh$>^l7^sM!!kEWy^qwl$=x2t!?@V?M0s)u+VAm-Vf zSwijbUI^Z|IEr^GZrvThJ0O|#?7+Kwccf$Y?%WREwt~0CJ0tEpD7 z%b587CS#}HyTVTaXA|OO%Eg4mgaw4_2}=kAgtrjlz%_)Wg!d4Z5uP9{Cw!W47rlQ- z??BTpblyz-DuMVL)J!BXCVr;99nr+lvDN*Es^OiVD1%6>$6d zfWQroz>QIX*RT?94+I2mas+OU3f#hO;r5b%z^#tJZBc>OvKnqL4G6r>5qN!6;0^2^ zZZ8W6-0le65fykNJHhSc0f9F;0(V9Q-pron_FVyiw>ScKM+M%Bam=Fc2B*<=f_Ka2 zdc-?@(EGfe(f^6j`(rfD3+bF*l1lG=IEC-K{h9Pwr|=#3HFQ28=Ns~BLoP7nLPIVx z@_IwwV8|N{`5HqG81g1V-fYNQ47tRRw;J*`L%!CKOAYxt zL%!aSZ!qLCL*8!4I}G_oLoPSun+$oUA>VAsyA1gjL*8x3w`ww7i@09@jlRn>cZUCs zjdThA>(mZ@&+4LkmbzbcX!olV)JEN}@E#$(|K;mSugD}^I)(06c-N5L_srz?F1+Vu z+`HDZ(zpE~7d=Xhz2l#k=AuWM?|3f$fI**MCG`2Er$?GxzlZ(yohj2E;r7*GVP9?d 
z71RoSf#!Fc=2uuJ?1hG3(SD&X()@1M{MH;0_BDpz+D4&YtNGob`4!(S?8S!Px)z~d zr}e;kV@hq2HqUg*3mC2Zg=F@Z0(Yq2H?cRce0Q zz9{V548Ln175Zy6zbehI^f6&CHTrjBww$xzQduH@eO7@VUV4 zQGs{Jxlx6j8{O&5=k~jz0)ujHv{%lJLe2_quZ#+;l5?YKIX4PBS95z!RA8-~8%5;Y zsLmEt$md3F=f-%LCW@!vT17~WxHS5wDpm)9ac7Nt)y%Rl}%eWDO-cGX=^BDt5r5_ zJ*8}Wl}%e)DcfzzrmeG-ZKtwnYc6HGLD{tRm$H>8o3<8HwhhXrt;>`Rzaxg{gSJLf zwkwrQTdyhGGG)`&ZpyYu*|c?>vMo?HZB3_a7b%;zzEidgWz*Jr$~HyWv~?eB;32T~ zYozZ0srps#{>!ZW&R4TOTNB3K`D{&?$OU}1CKPL&&(?$=u+P?nd~W*}UlU?p*+uii z@3Ft8-+%F)(^sW@c$#5dNb5vZPo5#WL-y}k?C?{1x8$@D^NY5awANHHkJ(~M`$RDh z*<#XqRmE(w#gz7mVpiE=(wbMryu%h#+9!&+!xod)%_`)ZnEAGt z(mqklTw6?9zpI$pwwTgBQB15+U`x1`hyzeDr`lpl`$RFZMj02Acmx&m1G101KBax4 zm{_BXi%DFBiizj!<6=ttL@}{O85fiI3>EVSwwTgBQB15+#>FH~M8*7uEvB?j6ccNd zaWRQkQ8B+}iz)3B#l#wATukC-RLlo$F{OQ?m{_BXi%I;BirH$5DeV)*#2RH>OyY=C z%%Cl%v`-WhYm{*@iDyzVueZgN_K9L*jWRAKaaAg2ku9dQPZSetlyNbM?@}>+wwTgB zQB15+#>FJgOvRjTiz)54m}-rJd!$;UY)R=DjsGsr%)?_DemA|Bb$Dz>p}?8N0%xrk zNWXN6+h>;uoKq_BqB4PV%LQJ%%k8YB*cW;&!ZKU=@BYMbFXVG79Cya;ZU@J`kk6}d z+?f^PxEJ!d6^=XWE^*uo`TPpU&D<-Fdm*1=;kdIa#c?m>^DG>9PPI7hg?z4s<6cxF zj(Z`WZ{fIeBjUIh^f?zD_u_qY@8aJHIzrzoVf^)8peGCeGyYmFP>TFDVaSh51zyL+ z$Kn4o_JM=D8*UCZ zg<2!{W%|LE#$aoNe%XF=(}9*?Rj47@aoW%A4ssFQ~|9j|N9;W`DvGTfs^tQj-@F3}J zf470>LD#P_bzrSc(e7E@Q?&jjScAG!G zyLoebcT0YJw`7aj&EK+4?XJ!*jPDl28x$1AcZ=7>k5;gDjcQx6MIAW5Xj6RGza`#w zn?K&2twn2<+t#AhayNg?R@KRTe>^Vjs{ZD$UaNxT=ND~NZuv#3>(n5=Td*x2tYDk! 
zB=szQw4%-1lKXoX$$8_iWduuZ+p%f;b>c>*F26}5E@%9T3YUtYb|k95U#-llZYdeQ z_!Se%P1Q$*rLrM@jRub8k+Ld#vq=80Sd6~)B|fS?D$JF7Wmo=EO~2Qi@!G5Gs-7zJ z8UCvO3cX}crA_&(`mNBB{tECj=%33;fKK@b&>{N7QHb&X1Mxkvb0q=%)0BRN+*u$n z{&&@t-RM9O>EVl?0Uvc=Qhx`%NvPxV6E%LBQooT9I_1CYNnu#lCh!b?|A_W64$roe z4LZNp2D{4tHAAob749ax8aut|8F_ra?AQ!4`e;-ASJ%&7q*HyT-T22N3NBMWzcQ%f TDmg+sHvdD?|3`*{A+rAiKfbAs literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..7d039667e46dd295e6a33b94dece06059124ac8a GIT binary patch literal 16432 zcmeG@e{37)bswpdbfPGWvSo*sY28U-WW}XuN+K!Qvt{Z>Tz7RmH^_>%$i`A4B{HT& zj-(yC^XL<6DMl8=o?4{C8X#`sI9b~?9gwWqT8Qj0VEMz|?5uw{SOTa+2V_7Cr2Bte&Y@lo=jsl#{x8gTUdjQX9x7-$Z>gY-(&w&h@*WUcRcR%FMu45cfSXY zMg55r#=`)G9U8d`HIy_1{MB|sWPsz3PJQ;oZxd4bL2N#r%*XsE6S;*%I{!z{zT%JN z=JM~8W6MAA$7TwrV(DmXW+su(e>azi#iLKT3z{d;Zh0^r=MjWNIP)y2cuvJ~ca=$Sr9iFK1(ME+m== zy*7O|N>y-8g|pej%cdhj0e8fDM*Of2Ej5mK)2f2E ztt$9Q9SWEuCe|~eQiqlrNBp%_1@Bl@aJ>!%+!5;;akCCBHIDe1RRy=LDtNyR1y>)9 zr{?)(^ZHsH8fu*I3#$q~w5s5jbtt%cTd;W*gnzTD;6JS@_@6oyT>ZSx2{&w{ z4i$PY+^`9jHHelqIP1`GEuT%N3McuZxM6eEp`_Loot8DYEo<=9q2ar!d^B^a5S<%Cd%|mNgu^9}UK+ea!8;*RzFOG@qD{Ws{j)Vm*E*w6X6ueqS;X+cbWE z;h)Q8PGw>Ex%tFGAv&8nlZZ$05$JDmWBSkOrX=h}RvBXH)ZBvb)&@F?+^~K1ZU21i zOmsFKD@32lv=0hAZce^;F#46_p%`po=7OWsvxDKe#Qc18Hak3oNVKpJ9UeLp z4o0W*N!XxFCt!aQt7CK~oro_*;oDfv-8uL^m~V`OvQgP4m)2}tvBrC4gXWI(}~$!1{{h0i13}=(}`R@ zm04Kf+s?%u&v@BhEKpI;a+cBn$ zmByY6DNSWNxlsODyR9+?=Bni7ReM8IvufX31WZ-!oe0mX_8mp_JR#jXr4eFS&i|?;0gb9IG_DEAF+?(E#1kB9%;ozg16dX z;dTIY0o)3(10eOc8)EE{SK5&WY(ZWZK;-QJh`d_?Vw_5%!+VM9cFIdmr?;uqk^DN~ zs_1lyHh9Mfc*)V{b*s{fEWjF({EF(|rzlWQ)O`U8cXxrWA-7fX`YINX3#8wxf}bEA z$#xj46Xu1nzwD;IR;1nXN+;4`4%ojPU~sXu90q^xT9z*bonlc5zPKoSR)oA$%BmdT^Y*nU+4g74svUizs(mdGBfF!B^|7zj(I+%KR=n&u;pP2%bp3m1 z{}-YEac_S^p!jgZP;qZVu=tsVaIwE(6z~Ip9|U}?C_-*3_5h4K1ZNP=Fr1@s9)R;8 
zoMT1Be@-E;FRZl_8{`7CAx{u@aU3t8fOl{lFRFlV1>7eHiqhCvYVs|EjwjYSyh!x#bG7}(a2fw+DQ#5WmZ7kLTR z>~~gi?d_C)w0I8U1Xqi4$SplBvD(_%Hco3RA$R^^8@MB)L4nvQE=DxybeDv3%66fA znrJ%QT~@#$ugfCQh9epgB}l# z-I4RCJrebe)axkw6yFk!TcD(1fA9932l4H}QZ=?n_wLl>Dqt(h4JZRkNblR7^YA!F zNY9UYdWG`y_F8-Ud%#{@{Bgc}AATw*z3pmB3)-M%V&2)%V!$j{>|dc^O?ujn9rNU*KZQvYX*A* z7++&PV|>l?nebYOc@V_-n&mUb*F2xAwG;C-uvvV0KJ)k*^Ldl_hE3x8B-s0{JEgnU zXtP>uRuRLU^|c!1%xbjWUaZyJ-e1E#n*JJWtkGt**sNxw9cDG!Sc{R* ztY+i9?^dJr_Udaj%9+(@y}ekgxxH^0>^0VCvs!Fcv(XN-8f~n_$Y)lwao%^U(RzD( z`C5irjdEU-HLSN6Yc;p`tifJmjb`~=Q;YYS)NHiFtVSDaG4h$!Y@GMqYP8pI`))N_Z|@L~FVt$3 z3*IBXSgX0cHn10KG(J}j*q6#z;CXaJa+H4#&zK54H}QQHd`ABr;Kk=KRqS(3!Tl;e z594#@1$?I5)0f0&Ul%+ZNPT_DQeU69M0tIb7oU}d0bkOM`x-u9vgGr53A=YEWc9sm zZcx|(5WfG#eFJ;(6U-a; zBRWNG-v;TjB$oeMg*^utryJ(9r2(E7mEZ@0+Xau~!CS&%m=EGR4S1IU-(|o(27I>x z?>69)4u`qme7C@TU}sz4uf|{st@aZ2`Agt)?(dS$-xq<0`y2N@;JICX$>DPW>}v#n z*RWMa@~it{%(VubXcQHI=Kg# z93g*ldz%cQ6an^b$Ss-WR*xdmTDETgSW)_iX8Sx;4%Fw(&mQ`sRI3-ltpZypQg4q7Axr zk9}B&aP7D2;_rm`W4-;P*Z#d?{{JHM06I`JN5gHEV5TxZBkC+pu#C$5nVI-y>1owAuu*1wrfTq7HF zLe1kkTg`N`elwkXjT`|vULz-AAGZ4bz;$lT=@ww0^xRsTM^Fwz->03+#8sg0D{$OT z^*RURw)KMT^qy)F?s>FB#QQ2ZS~WlzO6YH}fc8yBaIpR&ga3X<&0zBc>xcUSdK9<< zhI4Q%wE(|1Iv~SukmO^>rsSc3AO7V2ed)}?Tp}0E#tKP%Ss%@3qJ<>9%+F=#^U;~u zLNtFWo6Y13z%g?&HkVqMgZ@M`o|ry07fmh9W}>s1Tr`$WM>F|oHkX-Ar4#vm`P|ID zbo%uC08pMx%oHdh2jcPAa4bF>3Xg=ti9jqi6Nm?f2Sx~e%wh8>I>fC2)`<#s4>8qtb!049^bhy=kqnUhUpbs$m3kLcqqrYjOPcr(NfqtCP zZ~AGY`tQq(er{itL>qBNf5AYfL56wWrZ0TgWxmGvud?6U@t!3{Cxe`#!OM((^k6Oh z+l($eQcJ(a=xcbNtvmjVwwuW>;Qgrn_z9y&4D`QYbf1C#E~C4S=(P~?0i&NYjQb&@ zCk^y}W%RZ2T0M6d{ryNSy@e)FGkM2A@8ooY9(WxLM>Dys1-hziFQcC`&_j&=%c&|W zZA2JdHu(7+q}xT}X1|x@QlAI9*>@oqfo}F)2=Omu3JKzmpIv}YU5I~f;gmla%O?r@ zkMu&0_;ZwRVGeAzHa~W9T@i;7S;-AUPGbTkkM9jP`)L)GSZu-crMtTx9-)|*q+=ENv(4h~nv2G{ithO1(O6JgFfJhY!T z2PXr(8Qed_o8iep-VBXYhX@|2iUp_E#U>{>??|9JhtT>QLZNZa5IRy-7MiHeEHqIa zICP}i42P@A!t2Vy6V=helcSbCsIerBU*kllzI=TA(a#yOm}lYrLrmvpIPF}NHyGr< 
zkLBI`y2Wvn#cLrK^hljJ%)EpF;`y8+7vg`pqG~els=H2Hr-v2Trt$>#u(QW%v#ZdGUzQ|Lgzo z@`3XF{-XL_S(nCB{oOVfL1x_%f;P&c4S4Xn#D8a<)yUQR6Yf9wjDj9x@E>Vh{vxYu z7uj{{4FCPN#=gP+M}dx0GFQtbkhsPg1`kRClz89H|#W7BL?juGOijhI*>7iJk!bk&Od=P9nrJiwvT!yXnTlPVNv0+v$=CgJ zk=fWQ<8Sy=)6>~lj=b*ACcYX|zT-a`NuG#>k0<7$Zy8O)lP9L9W0?g*<+XGq%9Rug zr8g%}hp7o}DR(*@do3~0FAwog=@3(M$RkOQ-%JU*<2dQsMWw9)oS2dRTy9%FlQUS!ZS`EBY ztATf`Fu;J=U-pPn6;>*M_^VnCyjQD%>s1)wK$Jb=W))T{fcR;x20p0Oz|X2MaOH3` zG0PvD*Kb#0p#p@zuhqaWYBlgrRT#K(Lp0IJI*)74V$|PBb89J)oj62vxSZ-EPOkW4W~}z!jpZ& zWjNF-3{}G6tJy+#%@%rAW1+gN%9Z_zNVbey!02&x$)6jxpoQA~eyx8Kl2PHwirz!3 znXg3_>V|E+W($XEwlJ|83-YRc%5>dZ>0Bn9jm<{V@l+;OUcaM8->Y4}V-{*FuHT>g zXELc1Y3P1tHa3?FPbW^rqG9|M=x;DF{Ue%`glweQAd*bX%sGB|j~rEQ*q(dGKN~p} zo=!$`;pbDC}+^CJj&N`~K%6Cu7r@6u^eF5Ai(i$yg?vNX;$rBs1iC z9x=hIn(A)u|80q6O6Cdc;t##B6}%NsO9HvU}x-B-SL{(K$C@a7j!Nh42( zQpK!E6*_XbGqWR0B%t?WGWq5YVH@+ZswLQhP%wfV8fmgnmIB4J9O` z*t71>*@pfv!x5gn`CMECV2_&7vWxn`|7ezzOjriLv(VH(cfeA1xDYXv{i4V7&Pn6 zls4*(l-BF@lj#6kxDYT;$+ELnG^lIKk@5N2teM|c`?$h=a_PsQu zH5BdST=AzZw$eVBu1;QCw%0YR(e3N+#C1mSn?~)hhG(7Jtwu7FF ztiMIPHhAl{@V@I|Le3FIy`(5E$7WA_(`9J86>g6wzOe-KoKY@$40?&ywHxg`-g#`h zv7|1k=hP*y>YXo;4xhK9Bkm;y%YM$U)K9^%fBAm>6pVfrV9Z;gPY2AwF-pkG#MxDF zI(?2N=o|V+j5fktHS+Q@uIVG*hGHFAFfsd2y-?KbNAsHfDY&18`%8Jmryg|3&HQaw zw?jkP4AcR0AvaGl*-0wvpqMU#Uwn62gEheM6cokRg5%kX<59XO-?%k?zQeoZCWE(H zVB@y}bpu@ov=u1D+XHLdp)R#x8<-2Wbpyq=tw6EuI-s~t1;y#RNNu~+1((a$(BzDN z9e7=FxfL74PT&j9E}uvDE~yUKBVNCz`+GGFHjv&Mp!9)uz#4p8CvR1Zy$#3y8|AUL zHFn3(Kj2;30R6YYST_RQfMW)_5vT{~CZL;vdVsbA-3)XK&~~8Ie{gOOs7q}iYlAtU ztqnlY)<&RcYZFkk3 zQJi=6vrjr&lzctR!K>Ke-R|Q1jwU;CG%0ZhlvOH&`6UQZool}`uejoaYrpxsgKM2h zwBcx0$eO0LWL|MnxeaB&*8yEqAIc+D!Ow?O@6s0UJM)9{FV7EoNinEySy0h;hrGu6 z!My4E0j28)l&&98-i~4cJjDVi#R4eB0w~1-D8&LO#R4eB0_c7H03MKr9|Wl%K-ePQ zU@%U70vMct0;qU-)f@JqWhkIw{ule5cFH<`cWe9p(~ zdxiR5T0af-QD2}gl<%wCpYN|5&kxic%m?ZY0slDg6TlzIE3h9mdkFg75BE6S2jM;h z_v3I+zMZn3h z(?;x?j%8+N1Lv8Yjhtt8HgO*IAlmhC9zQw&elzFsB|Y%%oL^(`TY%rAD4N#TSa9xn 
zQPX;b`CwU`5Ax!CkY8y&{e~SnA1sUWL0+5>^5T4u7w3b#I3MK2`5?d2d;(Ah^QP;A zWpO^pi}OK#rTGjPcIbStEY1gcaX!e4^Fdym5Ax!CkQe8J{0j5gLSBVf`pPoK^UdD3 z=TAL(p$I3CB0hVDJl+?)?3vxxJW8M0gxvjTJuG#|pblcEyb>~~OY6J)Q3oMv$)G+@ z!BO09KUX|M47)1%`QNIxLWk(*DBD(F=#c$gb5q;yl0nPI@1GJI$4BuPiDLTLIkU@K}y>ByXFRU5L zyB=V2aqZBqxOT{kYlpnJcE}5BCysrOwr7FwwGioiyK|eP_>Xm!Sa){<*1{e{tak4J ztfak2I@!7!)Pe1KcCmI6*8V+AE?`ZzGe0n>3+Lb6XMr{DE!aQk_t<9l5VS#=;Qt(J zEyH}tpgwHhvzN7(u=YR9=3JS$DVmY$g`h7cA*%6^UQto$0~f?u;3qOcgUvR~sq6|km$jWP?Y zSqxEG3@Q6HwwJJ`evNVwYve_&kr%N>Uc?&tRbc&hR`FVjA=Iy}V~9Is#E`OIV>=0J z8n000Ym`}F&0dawvMnRjuPG^z#Y@d7DVXOk6u8}};nZH~3onZxRg zV){1Pu<~t`i@uG#=-bGPzKy)-+sLoNw`V1+<+F&@nZxRAV*0fdN8~ey>DN+RS<$a? zugX|koH?w{D5hVd4GHVd<1DgH!dgCySe-ejUrT2b)32qoh^@-qnLh;HYBV+kF&_zkk7g2cSno^FhBeqoW+oeGl$jL#Pn{>rd7MR_k+7D}A}odw@yua$HZlF$I)<2M4%4qKVu+<*uN*_n zd;?-?bw)A$+9HO09%m67V2x)H%n$FeFBC6BzJJ8)EdCsF#Tw)u`TH2m?f*8k#ayPY zbh*bN_kwwJ%om@--0;pWnjdw;dCc3@6)$vk`3kgckhaAfyJIipQ1N{T;;PXtAIJjyCC3w9@=oc@tq0e-_=*0K{wE@ zdcYgstH3)WH^m=O15kwZ7tQ9Hax)o-br8te{2ZYdl7g`m81%#e429F7Q7-cA1b zR(<13w0_$GqkfyDxBXE=zFpAUCFuDcGwOYk-i{{>`3^yEx1iT`*r@N4^maaJ$ae~Q z4>CPM{`iCS-LRAjP<`BpdctzDWzeQ9^epbvg(-#O2H=g`NFZecAo;CD&cp5}Gyd7ZGod7X>b345K_(f6Ym zSA>0!br^?m@3#o+?}GKmc>7uJ{rj!?|HlLx3u=xD=$B7{FZ_i31-#q%@$wgJcb5sq z1&9;eA0CH#4eEblQIC3D&YwwW^IE1Rs|$|m0<2SJa=$T4`AX?}O$K6BfpI^Z09=618sp*;k3ziMA5?i~Hj z0sH-xKIfp{=G`!NdZwC(GmmlI*<{cr#(NmWU-|Jm!vs!)CGawzUjKGiPhla*>fu~K zuV=Uc#&a@}n1jDk{(=gByR1%3jH~?tKm1d>dy}cTnOG*Aj^yI_kzY8Q3g_bR5nv`g zn+;Dz=EB(%>2xZSgBDZABQuG)8K{qiqp`^oGvUPCbSgZZ%7i1yWH^-#r!%SPL^77` z&1R;0lgX2_dqDD)*i?=(Y9Jbo3`C;SL&GD(!?8dlG8Kph2Kq(@ree{7k)ge@KKR4t zsmZBbeAM?+OeYdZc$|5#vd+ciZ9SE81Ku^{k|f{8+TW4ndzk!Nl03-d*ChEEli!x) zN0>a>OAF?YFEe?Xe)wgINtDSiNpiZ#FyFvHxhQls8fbf`pkQZ{u>w4tz!i*BbKP6P0#$Gx;@1KE&ic3C|Fd zACu%iLb+WbN1v^<{{+abzB4@wa;xu5iGMDYixGeH^c;LEOZ+o)C;ah9Hcr@2l;<+U zpGid{xd`#+VyAMnb!0X%1>#sTlY%WCjlu>e{;AX~epF2GyW3o35@;N>jgmo0GBO!U zh6912gJv-}WEMvU&Eozsvp70ZE{>bU{Xt&r8`^Ic_nK;bWn=w=rgU(`w9r2^%GHKP z1H2dfVs#mKNZ$Y7vsY|NbT$XL1KK)E<# 
z8Vi<<1;@`5;!Y-oeD5{EiDZCiv|9^%d zBs8VX%ScfdpLtx3KL2|_VEuf z;vH)YMCzVk7(|)$xFcNu`(;&H;`*F^38W@=LfbUvh2RRXfi*)}g`1C``wtG6j~$BU db$P#04~On%9d@~mc=JUwq;wQDT<?R-8n3oFMIz&MQNSl&F{z zIg)nl&Xb>5OEJd7Bu7GT3(@O0k!y!Zazx%WKY_ndQwMuyEMQ@}#>qlzpM6T3zdVAq#} z`o%~!%KSuuziWw&)PNX$1G;(z&9h*fD$EYWyiF!5A=(z5<&@ZU9{r#TglyG$vZs_P zZ931X=!bn(2AKZKr8KFttS9mOQG8_+u$Ht=R^!HNHa3u6hYBCcAO(c>6Sty+hk43Le#1o_8_-n4|@Kp5W z;Onl$_;@;+A+Nd8lV6D{Z@Nx}<0qq`6O+@Cw{_Fd=*jW%XlhP(c{Lf1a3{sU>5b7d zAsT{5%A83?U!Cms$X)y{?P6j&e&)74Jjfff8#H#u9y$!1KIn2i02oZ(r zCQHQ;l!_zR%7}0^os3UrPVh-_-DEE#NGTMJ6-RJX9HFU<2;ZAbhY}|?-Dp!Uk!|7$*+22TZ{d>Z|ZPokt=dOuV z;$#wRs1V_;hGI z9?pcGNu*A&;8X_-JZ?^Y+#mYV(Vj4DVWzzP(eW#6LbzNmpDaBCzvSjC$Qne&Bj0d@4GgN&t|EM}Y6_o{FZ@lZojC zzU|C*gI5wXSYHLZ#C)Hfncy~h1MG|k*gk!L$88U zzEFU7I5p<{5;U#>{S;^(D7|sD=KN*g13>dYHJ}<$Qfpn-J|2YG8v{ut2bh*^EkY*7mq%gzE|RWO@%N(iO% zq(;rbwV8Ztv5I_qv8viVb1wgU?gm*{yg^=o^Eo)@;k>$7Am3QTOe>I!jDC&LXBmB+ z(uP6<#h|`WPibADj?(HvHKjWWJ17OaXYox7*ij00l!6_lU`OfJoKthoJ1=f_4lE38 z8qfx^126PwH90dmpZjo~DL(+iEs$3i%~dtE1@rnD;0gtEBhqsP^QM`?IYOGZI2W8S zCf}?zw+;MkWW!D3G{IZ9xesjj6LOv?>Lo?7S++Q0o8JN3W;h*=*rq(t^ZIq(q0>uL zw>9S6+N9e$oU>@ZDX%W5=hX$L>YUAz<~C<@b4(>!!?rH0)>c+;cZ%swEupT7Vm%M}^46jT_XhiyRg^Y6 z_HRG-PuWTNIyH8o$+^%#yf^JIq1OX#0J;I_dZ3L!HvruTv=J!1FAm6sCUwCMGCPbF zeH(zH?|Pu56p=Id3zBTp7X@jRsdWfjfn%=+6?`~oVnzn z_eOJWR~I&;9QuHI2T0d^IrZ^^jE)!R{j)W>PROMmjpX(M%~A}nF5*48pd7c+Tt9vs<{Hc&$Y~X> zHS<%DhodvZ{`iv1U8g12J)bL>5u-xET?hBfY@NY%*1g8M(^54)bIBTMgsq)WsSM|+!tNLepss?5}RRQ3KfDZyc4E$l>9|C>^_@gs6$Z^f=hko?I*$?Lc zoB=q8;0(e!4Ci4uAA)lP&Z9G$>#RoXk1pAX333OnA+L~c;5-&;;Ma2=i#PBaI1hP* zd?V-a1wHT^f$y+bG_ATiThlSG5&Q9_&F(Cv4sTZLR1{6CtIJwDpVPE`LLcy2>;v** zACMROfV|iT;v+v^kEx$1=g6C7jbRa z;{5S!@{#9rQ1x;>4(GE@RwEkg2WgEU zbUt^Q=rNnfFaFkTN;Qf17-gR7RFizKH3yC5$m_HT?Ywo&P8d7Nef3N(V4!WK{%T&Q zZ7R0ikk#7}utB+JBa;glX-(9>J+IRawDWEOyA~(C7tU5|u0!jXqj?m_YN+>Z&o&|N z2xg19)`so2EKt8ZV26I4TIU?fJv$bfc-|rJ-M!F+)YrMt6d)}>ZfUjTQdOn+`MUr= z%qf-|IR3sK7(dE{93fq_|0JMO3)=PXXLb^P1NSqzfFJ3m{z^cnHXNUS5XLX%H_H1C 
zF}Z*r>7oAifKD6HuK&SR@Pm1X`hg=1KbVhvt|A`(M~(68`$8#x{ZH!n4gCHxeytLI z0n9&~4+4Hen13h}@zZnM!E&AEUl8*T?Iiq$G5=65;-}9EmCX&Be*w%tw3G602=fo+ zB7XWDYG-qa=3j8N{DU~$_4z`{I#?BOY_gW zTK@6*%kxj3zpLb*N6J4h%Rirxe_07XKjxo2e=+}z=dXj!Uz&gYn15&|;WvQ!hjIZw zHh)z%e`)^tG5^p`!mkhW59I=WZ2q>h`AhS!f3^JM^OxtJJbzcozdkAd`dR)B2>Iuc z@C#u6;am~tFXkW0R?c6-YZc~c5c3c1B>aXk|4=UCC)GBbUjfWNw3F}~!u&(Ih@Vso z(LcCa{_*+C^G}|?tK{F1lz%~%f5Srl{XNuQ=dJ8*Yp_wRHLA&op-~N%Yc1Lt)nvrL zs0JI=TBDkb{wFW_=&Z6wHhqqhqV~(jB4hz{tu?C2h>KATX3wF; zT8nl@H5tcmRD;=bYe}s&s>$eoyBaLwC)Q%b#i$00_+c$ZJEI!>Pw<Pu5Na{v;xh^NEbH;K5%$}g zkj0NTH9^1zpeoQ-prFP5O#Ho1Nclcjk4v3)^wPenJh55BE_U6@A|s<*HN3{bvF8N}(Uk(D!v!uur9VKe05~ z;Ze(b({cd%i~M$pZW~ui^Q%vEKcc z12BZ2nX~lXz5@4_-`A{gUoU_SzpuE*3;U1iD^`yk=+0`muec8k``&hXk8z(-y2t(G z@QtSZX4qCO+-%xhV}{4no65$2CMe(jJGy*3$l)=S%G=3rZdTX5K=sY{>-uKN&v`(X zI|aYpf}c95>s86G<)AKa5&ZTDeyxXeeXHbm-vhe*KEZFV;Mew`u5Xk4c08=hcL;tR zg5S;~x_+nRx9bsIzDw}yWPXJF@kg85A@vlX_|CBbXvc~<)6VA1ZcU+cU{7ABd)b`n zU~{JPuIIo7J_ineVrdt0N&|kk#P5;#y%OIc@tp$CaFl`B|<>y;o#zmus(8~jq8N9lIyCu zPFOclhjjqg&Oe0T4Sel9ZTg>CJMWZ>|G%adZvJ0dQ*l0?g}nPI`3rdO@sq_q)2&5< z>ng0XJTEh#*FgVM13mh2Ie#|5m=i`ZVXfvdLq;)~ezlmmrmW~EtoJikMKlc+53MF`3>dCSOy=!H=&g zWAJ{m_@3T=cFERcfqliZOZBQw*S1--FM-YHy#^3_hJF`-?S7umSJ-a-9*c(Uey-11 z(C*p}z?}Y8nT6j7`k^>xR|EcZ!;)&^rXeyKpXJYtqUMQUiWn%F0UMe}24vmGUL+O*r zWFnOTi?I{oiOJ~+&__d&=;+Cb(B$-ZA~c>zg~IW8D3J~&Q;G4(cr?8)of_L0kDr?A z1j*CUu?%HYcO(+-4oAj&e7!zj)Ey3wxg+lGuHNpkXr#NhXMeN{{z3BC=-6)F>y;F@ zzX|*+>a%-Gbv7nAv#-2u1NNNX8)cfKgi^tO7f#j?$}ov z|4AnQP?ATO+(JL(TK1tg8OD3LYgvX@U!iO*IS${-h{qh0e0 ztPthPrSTsIxzTr^_zjZLcc8>IoybIqD{^KUK5ZqgiRqKBSU4Rc>@UhQDdI{cBH>J! zxH8ex8EPG#nj8ahG@eSp6put;f)m$RVhTS7Cit0bCOir>2HyIWPDwmG8jXkC?w;Y| zrQcV)99+IU=vkKTU%p&+8#>6PUEZQwS8q>I>hCIE9`f_c?qDy!>>4h*^$Zma{QHC4 zz;~$F2VZxw4?ee>8}tkh@yq?oQs2R1!@lKC_WFxW^m>YJy`Ewpdi}-L{9Q$)iVdxn6HPT&h;<&hf@q2IV7#somclnRZV_89e{(MpVj(n@` zQ~dw5=&WwcuYvZ!iM}`nJoviA|Gz3pU0i%V;rQ8^d;opm7Gx^@~BaNznS9uiBA zfD&a=86@2Q2ck2xMG=qlMY!hJp`0J>xW5XAn{})nu2ndB|9SpkcX{98vUof(q+jy! 
Wa|Lgg^D($E@_(QC-;^9Ami#yC-R3g@ literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..9be88b91c49ed02dc9778e130f5594c239ce22e6 GIT binary patch literal 20544 zcmeHPeQaCTb-$!2iJ~Q&vSpi=W%0?8ZP}J6N}?nxa}-LJ9e>D<<0hFCWMwFk5*1S- zN79aydGZr$IYt&Fo>FwhS|DGxleS5l0co49naB7q|WRa-D|a20ye^4732o7@N0wszs`-a zi_uEt`H3C=UrB076|hk^z?yHt95*J=&gGCT`sA!42y9@aK;mRH>cJ2QX=F6nQH+6h zMhlenaIAs7ynUh=6O1Ie8h*k1vjX{9^n|{c-_gHMhv~X}#{R^b-v@fw-*R8r7hO*r zFuoN0cr9ZK6l-4%><^($0~|Uua?gIWNQGq#|i@$&;FTGGhOp=VT~$A`(14H4}b=i3Z0`OiV?TUDjXe>h zCWNK*6N$(xQ$5}45Who*7@vtf@u?%+Ejn|WbavYjx=fTl?-34(5o#u#-FAd+CQ6_8 z2#3W8y(XRAc7%Qtr6otmcgdwtED;TfZUfAMO>7C=T(`3z@3WCI_4k#jKU#wR#j!}* zSb!rX7?2m>{xbECm8t(o3Hq<4(#ffCWW+0W!J{P@kryFcrhc?ceaM-ZBA2y`#UgZx zMM#ujL|%k+nfj;7)IVKLGV?1&djuu`-T7t1y9TDb;ZFTsFVh{BF|vji(e3-MOD2Hq~$ zz&j-v;0rNa*b%uBtQ0N8UzTg&XXP5WQi1`o5QQCatpqDY3-R-E4ZL5jfnSth;Nqe1 z)U>#5Ub#_%g`!3HRk;R!U9N$DEWyCV!}210T!Mw7Mflfp4SZ6rf&VJOz{P{oB3!kQ z5=_XeaMfZf+k(Ap3$-O!xSUGFrqahnP+YY*OE6Nr6b)rtaFuPLsRRq(ok|7cC(^;O zuD-%@XeAgbUXIqXEwq(wp?x_PN;_4#I6M_f71qtmdR*S&&sB@xMD2DT>tBPzsNh&p z?}6pa*W?uHs%4~X3;W8pFuEKIs@Fc_aorn-(n@4~TYQGd_F9mQh#o+mY#NCwB_ zCuY)#S3RjGrpMy35Bf?Nf#-M_&bbU@SFPj+p2tHcBNNFuEE~=~AkJ}5Mv|$i_{@Sh zWM)UfMgL>d>Iygw7U$`_wHL4+Pr$`_Jm;glc;ivLg`JyQkaytuk1YEyWqw%!*Z3Ak z(Qs+JP>z%^>r!-JrHp=Wy`Up0ueb!AI8#we&cdX`Iln6Nn?~En4;L+Y%ej0W zo{dylb2nfDI^f5EIY8>Z~~@EVmA*;j-6&9AID-*g@J+l{*6XG_h(=TWwn2X4{&Fwa#0u3<;rlj#Oz` zxVDmSELM;|TCAw_&YsCVlf6b37O#zG_d=16Qd?m##`7IPd?kRlI0&)~Vjv~lW1UZV0S-0*!=YDRzdvIZJ-Jm|0 z8GN=^ugY4<+3e3(S#pCg-8^|^(OOYeowu%@1uCDnHXxqKTi4Cz&k)kQ!M)&yIrYss zvKv9KmaMr>+!lE1HveAjU4)z^cI|@QUTfRnimrbXWSimbaz)qW0MD}PoQvTF+SV9# zH#RX@mwOK7*X6VY?X0%o*4%R$(%kB9ZjNdsW7^mGrTWUSK3@DRePvi*^Dy>i=+_2w zu?-RO05I2^6E70po`1Flsg%vmQdETV7kc2{L9$h@(3|B=sRb?dQN-TE-RAA$E* zXK|gXL5EzMy;0k4(-G@9t^}-td68=;dD}_ecJcK7O6g(hvm2|Opv6?N+Y 
zQFk>U>aGDq-3CC^T?>f*&Db5S&(XeXwfWlG)~Xeb=+}VG+iRWYE%0m-=!~Pi)s=TI zXpp9Lw@1%=I(6LzJE_x4aZ@As2Vx>m-YA;edYs#D73Q|S#uGii$-Ur${@26UHvqZ- z8v!=}ZUk%uYy#W}xCyWcuo-X@Al-L3*G<~Odf=^xIiM{UAlljhh_)I5(bh&lwABQN zwl)Ext!6;q+=}cT@b|`Z+H<~I`>gJJVa|4s9pXdJ=IwUJ&HR~rZL93Fl`sdl-3rfy z&%I|`VI{T|_NWc6HOhneO%b9wRz5aouZ{Xve&hFjD;+VkVXL!~>J=-=oZUhB7UTh4 z0a#rbm_^heHhMMp!bbPobH2G3=6c;E>(@5UYq;Ng-OQgbZ|YA#>Q6xGPeAHVK7@B88X;H>UBtrO>cH=M)*@dDQt2zsFF1r2M4^lCw4Ap`UpLE9MJ zAZYwj0<^CMy4_~e^~%ajRr@(z?_&L|CssX=*SsH>pm{$V1kL+t6g2N=qo8>|O@hWR z3qZe{1YOPOW}rLlc3rQj$v8Tm*7Z(lzIZLq7ioFENXzp@TAnY`OU<_%+F(3%zIZLq z7ioFENXzp@TAnY`@_dn&=Zo}G^Yua-jDz|YujTn7EzcKedA>-?^F?~8`Sw7Ym@i(- z^F>;oFVgaSk(TF+v^-y=<@qAL#C$iBmm#;mw21ec4elS$oxb;(EL2iiym$7x+)ufA z?a@#-L~9R1Zhp89wn~7ZjaVsd4=}7HHcC4K3>)|JixiYrwx2$KgvxN1d zkzf9e*OF*O)IKK=m1 zU2Qm~mY12lv8yS8HA`Mbb(7}q>zaP9Bx%{1kaC&dowqH9M23~ea4YkN^7r8&Gc7eiPqeZs2i=GJ}Ue26<9^3g78Z^S#Z` zhimS?57r#z72nd+w~CVE4~~29nP11hjwM(A-xRW9aDTq;|_C_9r9)0vD~2X98rinlrxJv68h6Mu^Bv1a zex8>KafkAX?`Yg1U-2D{JLD_Aqj86P*>_0GzC(H$zWav?lem*=5hL!DnuO;}!gHq0 zh&z;1d`IIB`I7H=&b0HKS%^E7Q+!9`4*8Prc+Pb4oVj$|VIGovN8|3(bEcQ)%tG9u zyy80=cgR|a(YQmNS=_N)>CZ7N#2v~jzN2x6 ze8qP(?vStej>aAGW#1t!`wr=4_-=*bJ9@vv{3PW}dcQ;7tvQp3I?0GTlv8|1;|}?< z@07aAxZk0i;yW65$d`Sm)L~2B?=TNZzN2x6JTu=ZwVZLkLwUt_H13eE_>RUM@)h6F zxI@0|JEUdbA-xRWeH-e`$bV-(wbnGNDa~q6^q*O+sn(PzXI6XS`kU37W;LZ*?TPxg zt2ISkqSlnCYgU7z|Gvz%X1nq^L9I2-YD%-((}+DKZ>lvV%9+)kxPE4}rddsCR(qoU z?P^WoJGG`nU9%e0h&{|_X0eBSv)DtvS?nQSjyV_(ug~hQ+!9`4*8Pr%xX%r+7s8`H104DDZb-5)2Jy?*Q^FL;tp*nzWXxNnhC{s zYOTq0CXs7Ov)a>$JM)}roYfg|XOc5%+?nT0<7^Mt*{t?NzG>VooimN|LDV&?L5;XG z$(dhPc8=lGFb6R*dlNlUrCHyRvki`$y zX<)D(5V&MBU>kh*0M2spOqY<@`*0Sk1sJyI3^(T(-a+bVduxE<7Mv9r6Z>(&d2toY51+p_*xB=HuY{VS+b8|u7&F$hz5 z*P5ZeeHnZ!{F;&cdLCqiU-8*6JV(@Cb__ZJw^o8*@%b@4FLzQO%?Nv}iF zYu(S)8Q4;GB9u_!5c|GwnQPI&?dY0Z)!$CyyB~kD0E| z90TV?4D9>J)-LTS9q0~)_9%3xLVFduOQQK6ZISk9Gwe|{C%yDJ=cN9O`?>zFY3E#; zBRi*S^KPwQ$LCDHsky49-wM6^k{Otjr4+N8WD+WJJB zl;cEOr)ZP%ooH(lZBp(NZ7rfr%7dZ}zc)kYC*?%Zwnnr``BAjhiZ&@%indD8Cgn|R 
z!}|c{&fkO28X|X|vi$Gl&fD$9|IexU>;IQ>D#qh!*mplAe+JLCf3kR|<>n&6yb5_% z?8`K?>(KsFlXlb-d{JAV%}FzxkgJ8wpqWkHzLZVODYuLh^1ZNmz|1CZU&+HM7avm$HdD49DVvy6Zm|jXE@5+pnN8kqW>e&p3D6TcWgNca zV|>5QdHP1JW`oanrTH&STTD-!HVzf__(e zVa@4hl{xr~z+NA(XSjmXE{=EnC)(Yn1J_@}jX#^N)Rkz_CtN=NZa{lQc`n2y4i{*#I6RB${r6HJ{* zB;v_5NQ@s3O-{{BLVF|_j*Oj{3{K5V#Df#@WH1zq1>>n;A{n2UibYbLspNQPEOv6b z12~UG#?zG0yy0-DClsFO?d$LBi+Dq!ac|h$)79TI9trpK_YOq5;18LMkBx5^qrMgK zc#Xh8?CG7wZMB^5f?8I7vE{8M-&FXUxxBqYmL`Oe)#Rcg@8|OGDE!@=|FOb9%=vG1 z7TbS_^M6La=v`t!hcW5KUggP5$Eqx`u#QMw<-L8 z;rwGqitXIw{9iFg7>zY_167l23crE#1BxB^+Bm$b$%Uha33{=E^G_@MUe5ozvYr9X zzozhignX-=1SX2@9|XSH?@G=9-|TlK#4{65M~El<#0>m4hIl4tPI#iBRFv@lk)KWy zPcj}3r9;G%j+{zU>Cp7lIIttJWE?^~9EJcVp7HoJ{)mX+?`+bcF~BHjvnz%q78;Ah zf?jX$UgOf=XIu^yE_ZhqxC4dDg4*zI!R_iBF)oJACJ0)Ah=ko9{FV}7qOvSEq4+Zd#O_mEH<7%;jR=rPm=dVB=|qqm`MgFDn^ z7#kYu6|zIazEXafQ;H7$nkP8&(BYv&`&rT|-f+vLaNFYJFg!-4FLA+9{JkvEEbdQ& zMqRw;2{mya6u6HI;(ZYNvU6J8-c|wq@A-UO=!^THz*a6V+J!!6)1PIbZP707n*udO zUyNTMoFmfXexWb!(*k2$UjWWQer*E+c|xBK2nCeiiy;eN0s4R8KfJA=zNjON-=8%y z8RPG`2^D1k`1>@}#YYzS68T5`owo#I(?_CsDb~-)`FPtwp3r}j=dU;SGJzvCT#$p( zJ;5}NJa4fW5&Az+_(ESG-ZqT2lVl&ghY?;rusR?$c#H9i{qt4eiLt|_@w&k8Q)2$2 VLG&x=C|sE76MkRJDhdio{s*8X`||(* literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..f4eb170061009c801e0b05ae31833dda34f7ba99 GIT binary patch literal 20496 zcmeG^e{7rAagP)wQ?g`Bwrtb3Vt!I&M|LEN`c0*dOv$pWD72!eiWX;SXiB6+)s)DR zv||UZ{E4v?FAEY!4Z7|Jmeg^atWBD%Yu9WoM0P094zLG1D~1GX01e2nVpxG-TZdpj z!ri?e)Z^19TPCyoG4wM&-@AKv_uhB+?%wfzq|P7l^qEa2w}r5)fGiUeZzFK?_9YK% zcnXo`B3Af+6DcJ{Kt@?N>s|#@T$zAY&WB_%CMRV9eJ4W&B;^c8IhX<=yBJOmWW!JH zWw?N{9M1KVBfLM54I>OC))L;}g(<-tmOZ9|k!b?@liqi`o+> zjF$o}+Zj0n8A>(){M9x>G=Sa5dXEl0LP+8}zG;6j=5w43L}vn_*o)7-=I}))W4|Dt zh3`0gap9*D)h6%F|O-Y26|@c{Jr{ZXKKV-eqY;Ob;3Jmw3%=9uwK2fo?+ 
zx+6R>5evl0YmV5|QvvI@9jASvQvvVEsTu!UjMY1KYGNV~oo6DiMtpuDWHk_aW9+P# zst}suXCr}Er&^lTDgFtaVt6KW_Leg|ECwq#80@w)v>6C})-&uDGt>+QyX_3?41_-G z8TN=7It>Q9?F?N8LTk>Duaeh%p-9jthIKOy7O^C7aotXXyv};_l<&_|-jjp;m9aoP zV}PL?6vze`$y5H(JmnwHLH>#0qhUi&Y5vd%sWEtWo`6~EXz6!48pgpZS|D-w@-V}WU3Bp8kcR{S@_#=cYkJ!&AfrvLuJ zF&Pb?ioo!b(}9_|cVg;H!0*M6K!1Y=(|=A6CE+wO!{7@|P0m=pzlx3`*G(thbxixt zcqc->xcA9$^dwhK`yj#N;p7Jgy^jsH`QQjM+UgpcXlkDfOiz0!A}!5`c;hqP_SQ4) zt=_R%5Dq9q0XW~p?C2d21$;9{e&~oqr%wCgKwdRt@;yg1@D(^8%^LV?u(g;>-*&_% zBT?^o_|!~1^135-c6uxv`ruGLQ*oU1!^u}h>vc1E+wr9DbYLPH21BAf+~Q>SbRZg= z3ePNxV`qLEyc!_GmI62>7U%4or3+vMoQ%8qIlY_Hkzd^Gi2f2GI-~ua=j|sRc{;jO z06ch$BkEO?$j1mIA7QW{%AQwrFXKWnUU>l{T9hB!t2R zQlurJZ6;rbwe)*S1%=Mp^Qq^OH^}1B4e~tP&%(U`_tm8|`Pve$v^05v!{6ob84h2k zup(VSZBUjjqp&z#OkrWVkixy`y%Yl9Gx(+j_$UNE3W1M8;G=L?(yrSV>@V!FcP(~p z@6x*xUC+1aMM*Qcko@^(Q>qIdH%(q$G8YtWNSn9J0+vpjD-oVgo43!V&l9p|r+v{5 z&(uCwlB@!`QnK|Xv76wn+x&Z__Y-n~ShY)5YpG?YEx02EynEob*@D}#eSx)8HU=-z zzFk54uBxEDY7gVL+2>GZdrDi>E@+E(%|4ePTCH8vg4HBpINrsz#!IkqURm#W2{zt5 zJi{Is+XByI=_cf5QnD{mQc`Q#s0DEzXtO==>^gaQ34MIXUX&~#^BEidXF8G8%_nAc z^BCNIxF=@OUd13oZp<#1IxRZFG7dL!xEY`oo{`)*&3R69p3}>W=kzl2R8b!+z&d(! 
zNe7?dd=pk{Z3WJ^6X$E)NAcpC;KeHY;ug|+vjPHrE5I!PD*!0iCH0o(y_ zJ3t%29f+gPY_Mjkw8aXbRe&u}*A{@NYb!w1RS6JvZ3Bq9wgW_6I{>0C8$eta32RC1 z3v_O!+I(qgZPCV(;M0Jot)=BwNW6$AN}6kJY5SrE$w;?5^t7W<*ELvyjZO;h+68lh zSW1((mM~#mAg#4&@B=J=vKz+Qh0g=?zGS1m)a`d^i@T5xbHM)H09)raCcD62yBD+< zT1&07dh5$`mZMf!=Xx@2wU&IEKL3zqvvsx*>}iLI6_SPbEE~)gw8ZFZ7$X}jk&T(fObDx z0PmkGN;bh-Y|}}y5nzJa^y(7&=%O`JO5-IGftZCDgqYBvt()Vpj!(~$@{wiNeZ_iY z^K;3x8EuwM-&YJiG?&a`zP@i`$v#WLQ?r*!&eV$jFG&4gp#8JZf3EgmfqVA;f}Yu~ zg5KFf1%0yz3;F>c0NexkAmBrQ4+A~|_$c6y0{$4_j|2Y1Y&pb=Zg#`Gdf@JbyASSu zxCh|&z&!}}5ZuFXkH9?&_oHw>2KVD|KQXI2&grE5;pGZqg0%*169g3CTLg{;0pME& zjzt3Cl>&!#hxj&u;}-z{-!5=`aSr$nfp1{A4e)x4Mb`@p6Gin4Iw^mAd6zpu;qKmq z-e9%rdU0{0q~TdzZ^`GgwM26fbu(SV@W~ZgVqOWlLC+$ITa#PZ2hTM*iV% z-KIp9yrz)WQ<$hy*OsnPU62MStVO=wV$LVo3h8}ioG#f+uc2~vDF*9naqNmkHjZRV zr1xy&bjhZAEtPks7;Htp-kqFJvNh8C?&5UG=K4M=?@BRvunxzszFd{Awo~8RYf6&! zdi^|I*Fy;%`+N3Yu0nkGNFuZLk>0zX$)&&-DA%Aj%p<+8;c}ITgL>58^xZ0iJ#7s3 zb~4y^|97k0r1}TdHJ0R{CEFkU-QW*g(|m2BJqM0}J&`7@QPNGHhqxKEAfM*}&Zqce z@EE5{{vd~`yw%NMDcY_7VXzyHqxfUs5l)xj1e~_b8 z-sWbo3hmeLV|K;o%<>218SEbjaDPC)5xEHIo+&1$`eSf9+aLX3VfG(5#h~XbgM(jP z@y9A4r$|!D6_M$1;tNA&d{? 
zQ~WWE@qu*NA1vmbJoagPj9`2qpW=^Ej1Q#C{$M%6#d8IXkD;~V19B?beOSZ?ez{5g&a@e7Jdh^ziuTL8@#z+5J@gee+hz~Vitq~uCN__P5_!!{v;o<^Z&GPMWh{~?SI*LYo@ZSVpy!QO$+&Mm3gNOCg_8&4oU>RgER@8FHYK2A=g?cXH;X!{=ixb{jr`J zt3mOHT4Nd2Qbsiw+S#baQfn#XGpf1JPDVABQ7vUubD_LZjiuI7$Y)e@pvut98^^e+_3`@@I6l#^U)ZtCkutsJYP2Mm3gNOCg_8&4qR{ zs*;0*XJ>8jI(v ztXgW=pyon58`W6qIS%p})m&&NqZ&&+_d!0RnhWVhHI{l#gnULd7uxk!HJ0p;VG$ot zYoXl8ZQ=uKEt;>^QDd3FA6R4Ix%lrF%qLTDp59?ENluyO;e54ZR=kJ8cNi}MFP;;p ztj1U@tcQ1$@SX+F@dydMUquM)WfWHH4A!I=yqi=|daawm zy*h*YQViZhYG{9*o5B4$g9lO!-b-3&f4!T*2Ax4iior&lhYijVi@*-}{-@H)-kTv^ zvs;t@m4#4=DJ(3SO_^4GQj1@J0y-TcEx0+@Z3}`Bz!I{bZAX#WLdZ?)wtpo^ zuRhG^)j)@jPn7N?zq(mi{5K6rJMakWKcL9n`&*2DuO!zd$<-fc z{q>4m!*4TsgCy53$vM8n`W=c~qnFVeCAkhRN5~(%ztah!Vg=X?i=hHwE8sf;wy(+s zPM!;zbSsVB<`jc1JQuX`T+pWPr2XwF20PXr%NIo~_kU<{N^4sOyh*{E6}&~kTNS)b z!P^zQL&AA1{TX~F7P0h>8Q+0r=jfk)#yPqpVSl0HX>Gx#l_ckMZQibR==hH1|9kGP z?yw|hz?NzbH{rVm=$jOMQa%@bCq$o=+eP1T(I@43(Kjsmq?|AM9u$32{uh0{qEFfj zMBn|QPudqmU%Tj&_6X63?}q8KOZ$cBs}p_F-XZ#`MW3{fh`wE-Puf#N-&WBl?JuIQ zRP;%Ejp!>BebT;zeYg+f-t$lJc}&F1N2dRqz2|l@@|!&=ee*Ya68X&cB+NrsVg3J@ z{3X2i{^8Pnrcaj$?n|&wiTL4g1K=!_fbu#2mb8p0MW%on9lIynij7n1fg8gne7+95B+!``6NmIe3*$*!zXf zHY1(9e=VJugIDQ<`a$T_jCAt;wRB<*UZoRi5TSFUkxt%kq*LVJ1&|Xt_-XjwPUib? z<>!`5H4D67Ik#L^%iyLu1~=DR^fSQwdFM`29;e^G!*M^)x*o?ZYqsb(?&n$G|p#FJ{xPa=P=uZ2F%H{-fFSA8~rBFPr{TPJdIOzsKnx!{57+uMau>xdE9< z$S*nlj6(l6PCvm6o)P+#)6aP_B-$vZ3kd9~&?`B;U(o|!_l9c&(H_p$bAZ!J6?z+| zUsLSq=JfKBY&}0iy4gxT31rhB1iI1hJ}v^?=yxB)F%ymlh{Jz&27byy9FsGr96?_! zNcjIqk4K3k8ut6*KH`W6&crFVZ+dDR$bnEa3?c6KLx2;&7*z#vGo z7K0?@8w-TIPG?(Rrs?X)G`n4y=HZoQcTa}e)0Al*c8X?GTZXsEmEmpb%ru*ux&?2m zt4%bUn-7a-d+!m^?C$6kO;>A%p<{*G(UKXsqd7A~cVC9r)!dTN(3H{J*_4^2t26WP zuFlNNI-8n>Sm(;acOK3t>+V=-x?1!4!AxGl_-h+)@A1LzV-K)2mc0aX!f-j`S1rs! 
zS~NHmfB#B!i+xMrD2scc5EJ{Ofc=~i&NHD-TlpQ2*!DI9?LX4#u#gw~qky$sUi4cz zE&zT<7Ilk$v7ZX4aYoTE<}aWf`thqSqW>j6F#*N?&DGFn5N(9_z>Uu(s+Cgpo7#iLhTer*{EF znvf4pu#O{!BIMtenK>_#gq~4o3)>-GT(+aE<>V7915|@s n%wMdZCx9mA4$aKf&-Wei{GvmQEASvRjO5?t@;4O$1ttFrGpv7g literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..36eb76a83c293f4b9b709e78c7f4ac60d68b1c3f GIT binary patch literal 22576 zcmeHPeQ+C9l7EtATUPAEPMpMXoM@7Af)i|6mcIi8*;axHP6%w^fMZc)OSVj8ODicM z*~&U&5t}SWk*xP>zwU!<2!UH*m%C$kx!WyEOHsA=2f0ePtvV={s$3OUw}0GLQTJ6_ zcT{xU?_>1zJR-5gQE;`Bn*65wb@#mK>FJ)H8Tr(nk-b)n#cLyMDI?d2g>NHx`Sxpv z*v3dX^7@D!{$EL|Nd>S`*UN^NV1^qLXy)~A82l)7OKE@bHu84!@`?CW7X7q%m5ezcac1q$C+0{eX!(*Otd`|djU03oR#2Bw1HM8I_<6rT=765n{{1y>+Gk@z(k zng5|HFqS+Ti1-6zW1&Ri2k}rK=zk(UnGC^5Fc=4(KM@Ozg)U7*qN9Px3$E$FROtJ@ z7hTcu@kA&|UT`HQzZ$Z?>^c^R91Z!8Oil-1WupGkqvPYD_#9JtJ{AZHCA*2zOQR?J z)P%5K`vr|6~#R+zCTQB)qK`VMX@B+owx4aHdoPPuYKEy4hgh=E*0oG-#ko+B=nYT%_(4ZKo>0pW;TM7&mnl{`nhUaEmNN;UBF zA`Eax4CW#tQ-qZ~N4!<4fxj!&z}rO_5RS-2#FZkfJ3r5bp*R0F>*!obD-!O1C6 zHs8KhgoQjOd|0Z1-;`?LUyCqs@qp}vkBYF6=Y;<#)xhscHSlo}1}@$sIpMN}6k$U4 z!exuCWDE9^EmRj_;ZhA9 zhgO84e0MaLY@wxO3$4qsP~20Mi-VJaM9yz+w#Vf?{JCuDGf}%yTmKR~jPj4>?cKYa z<288-b=l%8*}`zi7Dkq1K|O0X`MB=YSTgQUgr)+qa5Nsuoxg)@-^)FJ51FVHJb!=V znute_#$fjoQ=#dke|++IDCox)=3<>`Kf;aEF-Y+EaPqxQ|04&w1Mmnl-q}Ao-qAA=nws*D$2vVo_>whKzaSCQ4p+#Y10#$D(=r`zBNri{)ikVj>pzk42A8 zCu1+V5+|lcqmlP|ikN}xND!WL8OAPK$&Xx51dfHq<56%M9=%sQ$2}H`CnlrQ3*sR& zdla1aKMt)fgQvmbc{(5M2CTy;;Np2a=cBw>@lm`No;O?Ip)&G@|FQq&rw+cDDuZoY z#iM8(%Ug;#YEpy_cq(tV`Ghj7y8Ka@x)X>~W!2;~9F%y@FOPo_w{7IdilTm3bz8g)A&yAp2}L+&16pzvU!7Z!3oFIGwVn< zfnGIPbCozP@YZeao$5OXIZf=^IlH~uwxJ=sJ_E9wVQpv#ufy%rY&+Ay@Ejd$3_BZ} z!p^46OuoT6i#qEv+JbgkTX1U5*%Z;5othTjLQUH@m+S+us}B3(x23?Ke?HkH<`;ThffIIIy^ zr)SV#RiHzz%v`JPvgwF599ME&1!#w3Bv+1cnPXh$*fl0|>>A26(Q_~larDZf4(AN# 
zo3h)R>u|pPIA8mAN>^mX6A0bC7O54Z+!Enq$1I>5Dn>jBpRHUO?i z8qZk+#7vX6PzSs^@CDjh4T!eZ0HUpWK(w_M5N)jkL|f|t(N+T>#zo5RXg*8lR;|rd zS2tIzaD<-(I%}`4wL?NiI_2;*H)Nd)8YDB_>C&^Vc3syX0^8jbH#NeX;9AO(SM!d& z5s&@0+_5)Sb%oDtaxQFy{Wrp~Gy!e|+yvMJxEXL0pc8O2pa$p!+ybZp(s;#V-=r-x z0A; zHPY>5=NFEfo?k$EegWzE1*GQ}ke**adVT@v`2`eqsIP#gz5=Aa0;IkIq`m^Az5=Aa z0;IkIy!kls(KvxLN}N7lSj0@$*`ezJh?foziyKcT#XDvz(j9R9b?YSE4w#~GbZHUy zcgB9an&yb($01Ka-hn)z!M1Ka3D@q-45>YI?e^VOdTiA*>8uscTQTj&Y?Gp@jIpQJL&j5jGu1ymJQDglm#r}3hQIA z9)@)k)*!6oGrH@fPHOMFR!1yweZjT`k_gbN1r0t$dX1p*h6(6;LE9L;R?ztI1B|Z| zG=3}q^m;*8GP(iiR+~-N%ga+0t@FCx!RFCOta=u=`8+lXn$M$2(0m@71kLBMSXW><^@6e;_UU1L;EkaKjkPgT@DL%l<%G_6O3kKaiIFfwb%o zq-B2~UC1AuFedzg+p<59mi>XW><^@6e;_UU18LbGNEh(OMsfja&G#3v9&T{{bT)SO znKaxHrm^;SH#nbi@_VTIngMzbMacE{u{H5Bv=J+%?Ouk}#75~_FT;9LLFooB!zMj& z{r#<=o?+-~NZHbB%1)-2%NLr|>si;RZAkwVn^DeJ#pR^) zi2S`ZoG+bEeJj;#$uMkf!F|_VVEd8IGx7)5a=vuF^=7K?&M@pkIo}2@C&dBs_io~R zDIWCgRKG97@b*^R|CYaM%5HJecyMlYq+9jYIXRc0%-}YVQF4mNJtRf_JM3jRg7Ur{ zTwaPPml z<)>S=+S1RL=f}eE0K@|36@Ja)x!kvBp(%?z>Dna&)R)A|(1vm&cXK%<77py^d?^;l z5Y@MP8CK&t8@}%X+m8|p`ySwYDHg~cs$c76SdaQ64|6#s77l!w^QBlIcTxQYFT*B0 zKf?hyKWZ#sZsU0aW%h+Y26MAIgtRa(N{d4oq>r5(~p$d6U^!VqxD= z&R1e#!~(6bm#jAWx13mYX!5qjGrx<&;>Uc>(!yELiEe zXyG|6mlse@i3OS$kT1sq%YAO119N!+<&;>Uc>(!yEU;YJ$8%=kynuOKiUpb%kY^SP zEZ+|Eyqn7lD6hl<%?rp^Vu9ub^1 z<(`k%2*PVbE-#>*5(_jhAYY0FUL)+hM&$AW$|<^`oj(7a$?BQ#zka(Myelvtp70r^ra@EXy=YeX(Dpqvs5G%p}u ziUnRH+`LBQ@&d{!u|V?z@}*edHKLE#h{Ab6sSz|Ud~%H#&afJh%L^#4!~)F=$X8;4 z<^|*{u|V?z@|9Sic>(!KEYQ4w{AI+#x0QQGdcT1ARH_m5egSz)YXlMZvBv!Z$|LC_>`*G?M zdkM3i!K}BycrfcF)Sdz5%z6tvpJu&;S zgK3X}bNT}H66dTY_Y9a*d2TT48O(YMBR8O~lpEBZ0p-kk3p__=y@XlMVAflpzF9A! 
z_6#Uz)>{{S0flw^?3)H810rjOi!Jo4jJp;;_^%e#50_IvZ7EF2u(;fro zgn0pND0SlV*h@TZG06*f-_7%aSp?p7tn?h3!lecqEd+kwU^*E zBClsK>n)7DU|u7Pp25fqD5t~%%?rqvV!^CuFzYRhyny;rET}z$krz--x}PkV7Yf&i zyq>|d#}Ih|Z78wudF&-l!@D#2IkehK@ESqnp24iQF!F+VjWBu!BQKcL2$~nnYeZhp zVAfk0dBMC!7(IiL7f?=#1=DAbg=<7!&p`76%E~>4$O|Sl;`7)`SRfX#m%#Vi-z}R< zXW)B?UZ*2HX_|+Q(E#s>D!>o;-AKKieQ$$&&1p~nPZqunfq8C*W2`NMcg%X%2e#%~NON6R zZFj=)AiYhYw=4853f-d6I~01SLf@*;tqR?y&@P2;S7^6FcPO++p*tlSe1ZO&hkmub z#{K4QaKV#hDH=l;AcjQzq@?&c12Q6h@EaKT9zwg|@YMotFNgTS?_}Uxp;{VC_}zmN zOShBZE6sOX;XAtQ)#mjTRwyx7?Q8#yAb;CEjK2-|(4JF%7y0eg@~UU)`1bv5e7mA| z%e{<$i=@{r>9yR)##Jn2on8dToEm z_-&G2pQPvdG8=a(dhLG3ZD|Wl z2>IT-8@k|9u>3!(T~ci8KzA#2k3#ne`%B>(%i+`v3Rby`|Tdo(5m4IoyKpZ((dgj7j-ij2#wZQf?Px z4~sD=&x^79#h8@y#n?S!Ov?Xa%qPaAS|G;m5MxqZ5Mw=JOsWxL48O0U$1c?iG1el+ zq}n0IwumvQj)<{FF(%a%F}6mGN%cjH!I#l&E>f)#W94E@syjG_br@^Uzd+w3u9puj z|2MVgM%T#iYf|>=?`snI)N2yvp-T||zaW1L-!}bhahK)#BEh-@bxK?xe-7h1jQ_%9 z9Q6cW+@qpRyw6`^6Kb8X`E@g!e7um&lV&!dUJ9E@Gn;(8kWI|NOXdkRSJ?EM+2rGe zY+?>xViW4Nu({97CLb?k6Latqn^60O&2BTBe7ulN%)v`+!u^A=shQd2jj6=pX1xS36ngXcj{oX10BNlIw$U@*`b2#$C6 z^!D_G+=0NDJLvB0=h;k?-U3Zz=pC&gY7TG4|*2D#^#~28(Vy%;mL?e14Gg z4=a3nl3|`1=n>`Rt6V-fAan6|HMmrg2K>RHw49^cmE^Vk5)(H6g!7vo&F5d@{P36Z z`F|J*lHmuO|JMEa{NHf?YYPA0IsZEk=F4B_ z{MQfV^Q&k8K|Cn@dd}AsJMc3xSSraw?8F(yc5=Q?;dgVsP4TCf^M@7wPmpi56F2`m zied-%0N?C)q^E#y_B&GInvNzz#1%X-4ZnpYu8HZRu5cg`Cj5WoC*#Bwj|KzD0C6Ql z$CFe#Ff};_>`){gg-bjbgbSRw#-daBqhW%-t4#(*0mGopwiuE~U^Eo*yWQP;jm^GZ zV{@R-*c{4j4h$OH!46|{$SpQIdJSV8IkgU-VZk$K$aeesg_>umM{M@(84{aAx#nMQm^se;-o}i~1yJ)Ww=7)I=Q= zILrmH4&uJ-oEGJ670~~c%|?a3sDlEVxx5$``kYOFhK;txxTu=~HAP>{U!W7l@im{& z7j;@-gzF2yW003yfIyzm7rzS^=#_oAu{@wI z9s@2S|A@b{8e?qwSr={!|J8CnKHEl~&_B=f*Lj}j4=V?(kOa#R^4ze>i_rg(%!F;B zFA&qT;X6t8;Ju7+^NG~}slh7dFX9Kwj+i@a8jH^Bi#UEUAoeThFl?CVf5i3Eih@Ft F{{<*nkRSj6 literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co new file mode 100755 index 
0000000000000000000000000000000000000000..7676e8b634b530ef415f7dc9eead3fc5dcef2936 GIT binary patch literal 22560 zcmeHPeQ;A(cE6HkTQ&v^7%*Va!x6*~Bg^1MOr(rha_dAkPBN1GMs%LHX9oTku@m;9e?9*c_hbLQXnUgB2fz;7yVVc2MaL5_ z7(WVH)-iSl8A*x(KV3_R2H1AE^X~442ub|VJMIs~ypCgm=tLkKd*;cP9p30@>?3mM z+z%bzk@#_M*y9};3B+RGj|RMc&tuWCcmO1Re-wD0*razPaCtNw@p;29J0`s2f$w#` z;)n!;u|S->?1+thBVc{aal#uu9`GC+oAAHRR6V}q!C)Xd!%SY9^!kO7)xhXg-zg7u zAw0!TO$J^XYuK$`;-Aw?j7)@2-S7%KL}TR!jotJLD-Dc(*H_pku23^*?50;(V_@{V zzQT5Kg-(OUZhD0-1EVFckk68rz2V7_S2XKp9xP%?;N-fQ2YH@#=4s!Rr~RQE>@WEO z@yq~p=iopdfZjaq`}4FP%)$QESUfuB4|F=kEEvwgi987YJnchy+QXWOOXMxhVlfEy zVh|>Ca3T*vJWu)<>2I{0o54)}~{TbL0qbgZFZ9AO>P#MqJCmOV&XAJYNSN>nE!+va=oIe5q#gkR?C;FEkE{BsTtE*+5v;jd>#BY2M3q#lLq0c ziR9oy9)+tWOWq!=d3z|!!NcX)WOyuoOeDosQ+Wdg( zd|9o#Z>7HmccHGDI`j6>mA8jOx8gyK+84ZC_xfZ!>WKx$y_2CxG_VlAZr1j##;@PN zY{~fjjbk(#IX(%^kB$c>;-28x$$;O3UxEG}Z%qF+y(tNIBQpx#@Yv{t<;RQoXmZsw z{H9~vd(smOd*hzRBGF^qIhDZzk2fbj+~*lM(&U9(n9)X;FIe9^8W2;*+m9VyDJ^k?@BH^0^epF+be-%6PqMCO>jK<~aFur*IhiA=$tx}dNO&9%+6gSPr-2kj&pEao=cN& z&*2P9lczcU9>*s*zCv+TdKDdm<>}=VOVTA23)6)Zx23mH1idHlNek#v1U-tNM-lWW zZcN&A+d13Q>us&Gt?OF#)6G`11n$pdW!V!jJVhYD`8Q73(Q}bnw79DXp#}yn`ax4cd zg?^K3C%Enju6tsh>7JNpx+msQw~EH<9L%>L%;^vZy#9pMYF~}lzXz}1x}DM`_R#q% z+w290M`Om0ImaE3%DL|9bhHkdcX~Um4F)oHvnz|+z6P7dnL@bDs6T( z@K?ilq5n02=)VFG{jUW?|LXwJ|9U|5UkQl*HvppljezKX6Clp9gtgTEG`;>ZZKkZu zUbL(<^f=IIYgzeuSiF%=l)CJdFbB0#$ZwlNPdn;#y#{7;os;6`jW8b|FVp1pIb8hC zlD+mc#12x4Y=O2mLtk*tUaX|?*KM1%+0Dp@Ye4xHz{cri$##hGrgPfUjb+v;z47^J z%iUJU89kY{T1!7qpSj<%(mGWLV{WsW;W_cS_btoJ#Inp9vcS1Uc@U2=LbTEqk4{_5 zLX9h)`)1>c(lGk4EVq*4Wh=gS2ZnBZ2 zOWQP~;e2nhvGIq#(D4VP;}1y3ANvS>!MM`#2c+WmlIttpSItx0d z4i?-s)n3pA^u0hI0{RHh-9YyM-3xRd(EUIU06hrw5YWRw`+)WX9h|CwywlBYxXyMs zI^nnrjxIRvh2sz$N8sp&qX&*&IQrn|hhqSaK{$rs7>2_KhaZmMl>Kw9nt(o6JV6L}HV@$b!v^>u^oC(|eIe=-TxNwTTZ_Job^q+B_{J!O!FO9RlliE8|3>)p(_NrOdjx_$r?^(Gwsvi7C;A-`uE=Sy+aoz%W7 
z#qbd7_3i+@tu{K}Y}KX7TD^9L=Gt%qwkf!dT`yK4y~Ur%U-A&OFLyJnK>NWluBXIzXq@w<_>zaIeWjaW6^?)Z*C9UGz7pSo6>ZEk*gzH+g(vO%>}c#;bq&ieY>QX0qcu_$?OSq2Fim?SF>FbKnmc z4*n6tp+8-S?-fgSe1|cAam|t9>%;s-o*ZAchSc!2gyydw^B47$_y#e5kuS%WtvOD< z_R#zt#{5M+CB8n)U*ya2Wowj+uT?aE{g}U~r^Gji`HOrxzHCi%^Rs>M8N{ zVg4dtjxSpSyZBm2^Vh#r{=zr?XA&+|9P^LLo%uaD=ipXYB- z%HO?8e7iA!r8)!ivIp~5sW$8im&oqAr zF@I4{iSH2RFY=}M^7Yxx*Jql){g}U~r^I&v^B4J2eEIs^#n)$=zk^HVuUMZ&{;KQq z68SrzWIZ{2h|=_g*Ew!a%wN<~ z;v2;LMZO$gr8dO$YFY@L1vh_JrtD?Pssr(h|v&dg{eO@AeeM5JW<8D+#8`a82H8tAbsD_r~n^j9=y9PNd#}{j9jBhSA z^e;@=d$3wV^Y7hRwX#u7jd3%oq51dpOs$N1Mm05#yHO3zzt?Bg%0@Lc+8fo-{60XY zRz^Leni}I{R73Oo23fVTQB95ZH>#oK_-578*zQg87i(#ZZ!R_Tm=a&LhBm5|jcRI) zn^6s|*2<`7R8!-)8`aQ8wX#u7jrK-0v|1~po>5JW@iD5QjcR41ni}nIR71=0?Gt-3 z*3#JSP4X9OX}Uh=P(%L;?n}$}u+$owug_Vv@_<22jd3%oq19R$^^9t29CxD{+Nf4G zs;SZ5sD@T+Wz;jOsWCoAHMCK!Y*bUD{f%m9Ile=3EsgEoB!97%rt5PKHS{SZzG@B4 z*XOKS*=JBwW892tX!SlS>KWD4IPOL@w0hqa^^9t2U~<&GpeaEK1MaPdS4gy zjB0A+->8O`;~NzDi?uYidz1XdS{n6osi92}U#y|=*;}_|CW+7N@Humftu*<%X$GFt zlun6f=lFc}HPFZBPid>Gyc6{BxgkD}JB!cW_P9d$jIbQOOW9nmP{QT1C#Y^K)x~Fu zmIJM!N_=kJ+8Ro})cQ)s(FZ&T>)3Vpjm*C_N3h2E*qcPMnNLhn*&heFpWv{Rw$ z6?(TqH%K&$C60GH+_S4#?tG^ehVXzTLFej=FjvJqO-S?fJm`pdiqG%ixv=(PX=^#) z_ClDa_>3RkfhebQ7N7qrb9Nu;x~8?6;VX6ey0*T^Y$4>jb?v_r^)<%dCE4wj z>>Q6UxkIt5^Dus$WOt`z=kzkUQ?aWbVf=c@Zl7egJHX_-6}yH}#&3}9_H#Qz{`iBc zMo4WdU^7gDTXb4jdSreZjZi!%J-%i-pSXv zeSD4E|Ap72^I}cv`q%LpnY3pL z*?mGL?Vm!{DP+=KDrEbGOxjn4tXasUJyyu-giP9Rg{(%%q`g{{;s z)1Hp&?={H7pOU|Y_n+RH+iUuKj^Mry`@G20cR{X${HF$Tv=e;sT?Kuy6r zjC^wWQa&#m`Gk5y`1~^?pIp9_&u5H$Ld_z4e#^)wmoMcL_bilUjVIJS!e`vbCzmhf z6Zfn|KB0CJK0QW0xqK;~xMwZ$3H6uod9RUAE?>$g?pcd`LJcQ;;`^(M<`S1L`Gi_h_{8_d7hNZp8~GG_7S2(zXT1dPwPxNyFF!qBrdi-w z^Xd8Jc7`iz7_O{kSnjmw&wSi4{5 z_e*Sdg(DNAfv9KF z8xP@+^E|PLCmwdSG^eVwGaX?SDd~hTUN&THF$4Q`mCqt#pF$eU(NYl zyCh{eWn?k=nWFFF`X4F$4$jvUKkU!m6_dB?vi%?B`rm7Q`^Nz=l}MRZ2gZp->J0w3FlWS{D0&8w|lewe9rlyo@{;zoj}E;SK(K1{w2i^ z{QL_(#l+H|?Pn+FfA(-TzlrnvmGN{7{|f(I?LOce{SNdz@Qr>4N*oiB 
zcz`(krzYU1t;8`paoiE|#zKVukNkL)IHD21H|`~lc;IB5YJ11WMt~g%M<&CVE-u-6v|dz{XuyE3P)w#;eU!OUsr?#yX>{laNW=Cs2l zPV1X9&g$EmGu-x!-`!0aXH5-zgu?FTj0cx9qubHcAr$tscZyS2hck29oVf;9bH;DS zU72gx<67|G%ILPWWfa=#Gp)7NHwnLOZJ8@;cV&9q-jZ?F(Uj?1NAsS%en6X-G5-42 z(|Pnr+u?iIGA1Tp9*EN!Uwy;Z^5{pLa1?(JQ;5aBC1|w8y-=8m{ZU{SSH%4h+p=<6 zY|Yl-x;ljBB)nZMJ@+3E!}0UvGg^-27Q zbxHhP)k(&tV?^bgF6W5b5ss;_Kgs{@>q-81U~&DDKMF~=734W(B{&!MKT`O@USJ={ z#n{QJHTN^h$uDd5kd C?gkzJ literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..c59edd0bd189b158dfa5f7e889fbeda3ba9e0a1d GIT binary patch literal 24640 zcmeHPeQ;A(cE6HkBMSot4A@}M!#JrSMwY*20!IE~9RjH#4l%5QkS*DAkS#}+oseP8 zvx5nnrGa?2X_|f{O9)9fo85G~A8E2fWL>*o+lX(qc9F^}`kIrp4<-+8}x&O1-fI@PgnkJV!F+6epPkn6<4_mOz{exjG{ z?8`-&huGo&<)nb*ff#MQOg)b#xHCcRd>o3!oLrPfyd{hkloT=^?O+LntY zWW1oV9j?`Ti0Qo(>2#E_B$3Z|_{q0}tx`&YIANYp@p)tSfcrZE=439l=?s=C#Iym+| z*>~m#E`NV)+#mM&`}>1qW8aGg{Q=*D(VT z+x~(pGB7X}jFIPEV?z%I?Jv0|{NeGS@A%M2;AJ+{*Ec>e5RA?+ljlbL0byh}F?zA@ zgpaxqo?<6PgU=1s)~HMTE4swUNchC3uCPr^R%kNWO;=cKV)S{hutls;GnwqBE37av z`n*@zDpu$=ne3)3^q3eexI&JS=l$W)kY7yeWgct-5(uuFd603|o~38{5Nvk~6dcRIi5!GLmiD16?ICAkid+lX2WPW&0JS$$G_)CLc=JLAUeW^b zoopREm92w6&%gmkL`w=0&t~8yEfANob?{=g4!)m(0}+T6B3{YBOIjd)oUMabvvu(E z3>@%4w5AY|$iPclAby#xgI{Iq;LQvih(M$eaWw-kX@Ph)`zi99-HT7#bF3 z^UdoSct{Jvzh>*;qih}gdj<|J-6aR%;|x5c1>uuy9sEzW4t|?~gG+ZxLAYWe8Mu(6 zaK&QF+Jil74+R-`xI8u*9*P|o7sVAzVFpgpLs68qhvKX~lxE=JdqZQs$au`xSKW{b zhn9h(^l+4C?V%!T50#7Ykl9m}ORYoxu~gjLY{tbs{JCQBn3&zHt$z_7M)~^EX7?`U zcuk%{U9q%h?V&Sk5BnD5K|O1q@p0YDqp_%OEI8~R4Mn2C)cNaTb1(M%^_Z9~c>ey@ zH5iSIkHYkW!@-f5Z(!)5V8Dkh&_CgW>7UYvlJGRr7~l^N4UX7;GEa^sS1iX~bq)I; z@(qOjG2erc=yC3x>L9`6!^wAReMj!9_roL1Xl+y9Ky|}laCq1^Fj`xKj4w9g^SB>s zsDsCvm=t~e;h=w{<8{|)bZEjK3mWT4EtLG&6%9TBk44kw`!-Y*i{&NP*x+c?*B==l ziH*MC8apxE7YV=HkiiOE#{=-3%W!tZN`B~i&_59zh(;i6c=TTJ9CsoZ9UF>_%!-H1 
z>``#q|EOA(15bm+^K`DQ2f7lUfQ#qxT#n<#hL7SeSTy*b$@Xk`vit9T^N(9{fX5BT ziO12foBCvM)FcBNh*WCMd3(4z>9bPWS9W&;e=4VdoQ8uE&-s-hzjLgO{AA9Ow46yM z;k%JMYvMW_fe!Rzpb4P#B{6v8iQrYWt8S8^C`_u=2Ch~@)k;A+@ttS3yh-_#!(96D1~v9u8%u)=Naeub0b$USUZz!{H{$b92_5yd_ENs!8CINox_(Q%URE$>b?QHkCMM zop4MIQ;xV3>lqCV-x1 z`-x&kFHqh3kaK-$$XU86!KVVj)9I4%Vna&_{0!A_fwbQ!11IbFeNAy5Y#H@P~&$4>CE z6W7_;iR*0a#C057O3&39z}xrdbT|*V{%O0td=;+03DN2=)V{!`Y!>B{?`LV{~LfJ#-{C#^7C~41=>tOL3!R%N9YmY zllFqbvyj-4pLTf4ivfe01FmnUOHaD0biD%5T;-;8<9fgYT$f4ma$0;gVtj5-#b;yw zj?meS&e=^c|3)}IC(unmHK0zQn}KRT%YbeMx&>$%&~l(#fKuFJEH-Mh8$q@ajsbmb z0*bzzK+%^56n$+5ioVK#qOUDL(N{Upx~Zk{{eYhYxA z?zoXW^@p|<_Q_m02B+N$-%Fo)%eK@?Y)kDS8|-UT2FEu9b2^saKV>fn)h&PW8+FSa zVf0~JW+zLQE+7q#9h0^_p{bgY(s?x~XTT>YXI+(KgIzczxA7 z+4+X!rso?dJ>Nj-`Nnv_u^^7<`36eQH&A-MfztC0l%8*(!Y_>-@HBQnY3xuB{xJSD zc0g(DfYR6jrLhBgGu%LEIBxb*+(3#Z&b@m>6hly4lg?^gKLND5M%Oi1O+&U}4z#kqLW zisvzz?92xoTOE^F|2mgCZnfn+IeEeHopPbSL(<=LA z=g{QdoF3qh0N)GzQQ(gO-v@jE_yOQUz~2vi82Az3M}dz59|L}3vIugGZta3~9)ztM zwnMP>z;*<-Uf7Ppb_}*Y*aEN(z!rk-e%QjWjlebvTNJh!Y!j2Z>!eN!4_seGEO0%- zz6BB!@T&z6F-N{g@OZNZ{2IaA7{6BV_~HfV*9jh9Q~0{wcye4U!$`8qcXp0Be^@O+(H1kcx5 z4*YhzUDxyTrybiL)AcGTUbrvE3wb$S$jk9UUXB;?a=eh280{{ew#C1db2=XNvUOCDfBEMw%f(VTccAQG?!{6y zhv*g5cSVBHN*vdn&&NsU9_5FYak+#6eH*oRCm5}(#JN}gh|MEm1LX(TaJhsLy^`8{ z5{&M|aor_+oP-^eAKJ*}5{7gawf80%ZFl4Ro8MsbOV~pB!CSao!kF%+_MHhv_u;ti ztuSu0lVZ$S=7?A7l{0dF!!d`p!x$yciF`<&rst=}%V=*MuBrMhHeSLh$`96axrA4g zcQ}9kR&wI}U zZ?O3lY#kZratT}HFtyjbjFzK)?+F+u#hqdc^E1zFIOga#U<}Hn94D})zs9e9M!OLc zJyUP7@d~z%%y7Aat=?~Pxq_{uf5hbqwtAlUkoi}zb>xq^T)|fFpMqREMuDxPe=#3h zALXWFi{>q?0}{4q-a?s-EtX3wcuq~_EgYv{i{>qq%h+PM*UfWqDsSO91zR+4pwtnSnzy9; zX(Mkb_tm^65?&Kic?-uW*rIt02+5DPFM4w^YZ{auvTQqN>T*j8tml*e3I8MP9 z&08p!v8D7e#{CwKQ?Nzz7RqI8DSeM|zlGxzY|*@hav58!CK`Rxg7;fkSEQOq@3&C4 za7{FNE#rO*$1B*Pc?;zVwrJi$xq>a4w@|KNi{>qqE7+oW3*`#77L&KmL7zDO+v!j3 z70r4=v)&Q$W!5XIJt2-W>m3mvX1$_WPiWRVqPlMX)yV?_?tyvF=`1)Pw6-AFIdPdPJeqv3pi)ycE))Sibjz;dn`XS{mwI{@JX1ybx zbF*I2tS2<<9ns#bS5$jK9B0-$;`ue}70r4=v)&QyZ`LacY^gmV+M4x{M()D;WR|;7 
zZkD@HZkD@HZkD@HZkD@H{uy$Yj4kXL1-9-}u%-5j{5e}%PiWRV8hH!rhm0+yCp7XF zj+5#no^zVFP%hO){v0o@Cp7CF(Vpflw3q54f9_}WggDNucf|8c^A_4mb&)?OOzR2F zdPlT3&0APocwIEGrSyboYt};=c?*3g*!nW{iZZtFS(m`p-&;)b7WPH_p3AH!q^@L`j}+zN3=K1Ti97TQbLQhP!pZ{axU{vOXS&08p! zuw~X0n)QxoZ<@ETt|-{zHPPq^(blYoH1ZbuP_Xr7>J?>d$$9H3`2Jk}-bd{fc}*m8 zPiWRV8hHzAij=q1@2ZTvh2s=#(Y%Fn8Cxbjp;_-}7+>9WnllklDvzMt0U zbi`k_%)q-2j!E&(9ln?K5{$=pIFfcxVLOb&_l@wqn$!3WO_PVd7g7j$-Rbd!rahkW zX*zZ<9gFWG+3tjQNAP_&c!y+q@7{7k*!P))%)MJ$0S;FJtpr*I6kkrP1zHER9jFK1 z>xA!L@q1W8!tdNd2;2u!x>aX%TY}Ns$Vw`&^fJ0#XVjHow2G8bz1z!Zwa#cwg3(%1 zOZ9bLM(cG(8xo8*l6I>1co}Wd8Qqa!^mbflF?>gy2l2vp--_()T|$&=PJ8?}NqD~v zj$;$VVr34z>;to5J6w@RbU`UEy5{U#0MFg|All8ilV_ z_&SBJSNH~nZ&Y}X!Z#`W4u!v6;vtq8?^bxORMaSMkptt4%PVwD9cz3dpViw=yQZT!dbY87EV1-nayjHO;&k8B=ntjcG z5R`Ado5{C=9NH%?|Mj)p{Ku(&+dWLbO|iS}UM9ayvfD*cc9n;jzEZK<{&gnbF4;9p zcCP!F-lf=8`Ix*)vfC}$x&2J!E^GbF#L8h-!>}m&@yjHSnk?iV*n7&T2 zt3S@<^^#qyWY;jv^bLw#V}!{YCA&7s&hr4%dlb8-F(z-4?Aj%}9pg;DL$SO4K_
    ggpID=}x$`?LZNrD}j1}F9F&Ld>K$ks!ERA$#dK;-A;2(bAr*`JjZ!?j%(3N zsJ=D9XdBOQ?L5bIeCC{VR^+744{ST7YfJ}zm%=wI{BDK!DtwE=w<>&_!nZ4Yhs5)1 zz5||jifg_#2j9<2uUWtToNHF&wDWx9Bifl_%@Lo{wHc?@sN?%WU)3^(o6U{3_+t=r zwWe9{T@%nfB6L!H6S|W^C)GNk8y7mM?g`z9&`C8==mv#Os)s^%Oz5QAD0GK~PO6ha zcTni0nkjU53Y}Cxg|1!bq*^L;yM#`vt3uZxbW)8Kx+aEaK2%S`Wg>JLZNp)E0 z)(f3flZ9@z&`I@K=n8~Rs?|c5D|AxbMjhSFM`!zfUzC^Y2x-4u3&jgLhnCpW9)%F-NejL!B4b>90VqgZ>vLdbAUA(W{`(*Ufyw zy@K%hH)cM$ej%Ti&3wZBhVc29WY=J|wsC*jj)=9B9e@`*KTo=>>{5wy& zyk1aYgJ+f}uP>`)bh(?+6?KdjdTjcWVDm-S98wsge=CRcev$no&bzGHrsKR{WM7H% zF88u|zsUX)=Uvgt=KVtZOqjQ@9WX|ptxdsmCPE*#BK>$B{!hqKM&R$v*ayGj#m`AD za4?p@AHdPif89t%aL2>-@T`M=dW8ed3$kx$1b&rthX%h@s_om?uGP3*@K39#3P(l; zgHhk8KNiA2>i3OBe6bMx(SLMwc+A)DAMuTikB&y7F&NQ*+&?%pG6?#hFA(e-AM_24 z3`Bebk*Lof4*McwzR_r8U??0Ms~U^;SB1k9!`ne}f3QDB8Owf(_BZDV~?uo`~Iw7;)^t624XieFI?cnE#+wsc(qmseD#%UO7rkXIFX z86V%dJ$*c5mylm7<2`)*`-;4c%QeN%U0mLS&rGGyVYnSL31N4J%k2&6@(Wyk>|nb5&$#@>{&e|eF0W(@Hk5zOI zE`R%OSxU%1aCt(Jzs2Qs1L@;G;POyUy8I(9?^fjh$>pE)rjNhD<NGwQPffFO}+Ze<(I5O@E`Nu+p|Bv!ml(?difIsFZu2}G)7#-~&9_j~iFdU7* zB_0UC1x{T3kzxE15rV(75%c!}4S_Y=W0ZvbeZjEL?XKTr?0Q;^-Ifkxx4p*LZLLo2 zHX6Ha9n#%ygh#(LXYjrF#BjOetdqSMh(CuZwtY|P}BIWn=q zU-R&_AHJ(){~au838-a65w`eMAbxF0KVpYd@q5`qE$Wlt(H3i-FcWo9&`v%O>mbf$ z=e#IyD}ev+WD=exvyZ5Qf|hf+&LLufAdJh`3^) Hu;l*$zSBwi literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..697bf1870a830392710688a64e1a0a645ccbea9c GIT binary patch literal 24592 zcmeHPdvKH2mA{f@Ba96uyaX5|7(om%LdI_bMm92Wh#^ik50a&oB}=kwWLcGD1v1q6 ztYS*DX+qp=HrvfMFGABt+U;(#O}DM3W?%Uu>_E1&+srmI-cCEyKiZvc+uiB3ooLUw z5Bci)8iGW#|3u8k@1Aq+IrqEwp6}dyu0HFL-u^>olgVQt>{CJJh>4dGJiL7VI4ks5 zBCms3;s52NhExF?bv;bH0CU`!Kr63@Y|$oX6@jsykpf8_qfrlTfshT1COeDq*S9iS zpsa`Y`kQ;1{5KQD7-uB0R`UXH&kW=T(JSb4^}BzP3S$30XnSJJ?*ct+?|MJj79CH# 
zVf-j)SQxP-?;-_4$W}LdoQp6Cr=l_fTRa6#_{xm;jzH8TSu`E)GY>1ODjCu9$x;^n&+` zuJLd<8A_3tUCEJ$L)Ndj(*Ec~$ai8S7JQXe^#vxv;ZS0lsk{{T2ZfSVqx4GPl#iMa zmQttUp_fLQ_o%n{C3=hFvFNF*-r+&fSe>S^Yu=$vqjaP1uv^@rL(|wb@329mbffRE zN8F)X)7UldutTG?=pAw|dD$P0NBp8$53^tqE`iQS0w2Hq;yz}qDl5Cbvq5tmD_QZx|XFW0~i$~ExA5)5285*!&5 z%jVm2C0HmLgddk{;HTvp_*n@CE*zBy;TI)XC>n%cmuuiRtnKTazhfG;Md$BZ zuHnS^L>!tQ9t*`%zVOJ&P|$}d(7)ip^xx7$NjQxxRPaYfhGUl37w}Q#l4pL zFC6u!d=HH$PH^K?1_2%qC*R%Yd+=zhAC52+EggYyQ`>N8Y|IyqH}65hmx}p1+fKH% z_yWlY98g9>aK4Gr(Ki$g`D4A`cEuATX@3gX3+|Y_=1PP<0q3Jd4Sx=n7L(~KuHQ^U$xOcPddQy+g%jg2kT@P5ak85ZC6XiK zu^Dmf%uj-G0#1p=IXkb?1-KGU#y$L;-oyD=UtD-ZZzT>qiLOK5=~EN6?|$i5 z_g6p}FXV}1YADWs${7Swjxvnk{9Sy&682*LljGkT%yZ^T2cY;!MGZLvofqGO%uD}& zN*40^tSN6go6o~{I92A{9NdNh_zS=sAf2Qo=G^l@djNBQ2A}~*D$O|$fCq2@X;W3s zfoQJExe+U>*l<{?a!8U|a}FKB0CeE6Eroi3o)usR>FTQ70N|sl_LvfGkJQpg8bQRMWuW4k=$e1%VcKuGI<=XN8vgP*Tva9`P?jeS)M$> z@lQB@n&THKuFS8bV^Eu~rMNV|lwxJRlH#`fHi}U1)A&sb)T0RXC_+7oP>Z}9wYRIZ9#BPFb-KO8E*-ywBVs)IiT5BxpZIN|(`^+HNv~LC- zn=P_72Y7~+b2f(Oscb{UzM(#1uiqT8Z+1lNj;-^$Hv1Inugy7T9A_Lec87f`L!3^# z(;3-DGTQdfU9J5LYxnt5+Rw1|r=gFV;a)7zXG=FB&yniwnd)k%WtlUA_l0A+9md8W z&&{F_9I#hqE68+VjDMTYWDWDlNyGdwT&LlhoW$|01s!sEa;|2d#Xzj(xSZn(j&&SY z0@lD7kjrVVljb_;7aSHoy&l>j5_cZUC$Y+z7Y{upV$T z;3mL=f7rk;>m4(zfWHdH6Wd!2i0!Qb#P-$#VteZVu{|3gwznP-+uHz$?QI0a_UZw# zy-k4F-ey4b&5X6$`2_7>jbpl|##yzjI`S!?^VXU=D?HaoXR5oLHt=0X4a5bz%gDQS z8wLcA{BAeJO&h?6;CarIS7$M3o*^D*9_AVzL9zwfazNkE*U#IizZmvS;Je6&dqDXX zz?P|H*+bwH8_znPXsNMI8ZFOFS#Gn!vu0%TR%`Y9`A6=wtgud2!kpNxX83;j>^qia zW@1@pjaZ=Upgfqb5kef*%SWfIHIbI(Pk*vyd36+RSZb|g$+G2S%34kNCgcHK0l1{n zGl}ScXR6g436fehQwYrzc6>J#&4Bmm2XEk7e^_9MgRM@KTs#bM+*yDTkL; zZ?{x@e)4?vGfpADLz3S?p{4V!}W2v2H^_86@)7cR|Kw6xT0{y;EKbQfGY)8deU&6Hb~uFbE}96 zd>P6nc<6y%EoizMx@&HYpy4?{`C37PUn9Lv(DI7xWTFZxA$YEI__d z&`TLzFKB$f0pyzmy^PVDfo`-|45P9#Q`LCZAa%#*9D^B(TY?#5m(^+*OP6Mmivgb+()G4J|Zpm5ox)PNXvahdXYYEBrn3c^TMo%|Lg7F zoH~8yV_Dc`Wm|3bPuuyPWOZ#f-IEaV{(soPB|Ho*#7t?chhYt|P`b{;a5br-w9Uh? 
zo>(dE@Gx|eYDzbF7&h86mh9~nXR?!o-Gh_-=f4h`X6ohV26=(XnR@m4G2GOaJIAmE z>jjtcdeVF#KU~ZC(!3aZsD4L|;l2iJd*zF)9cjLhA6Ucr(!3chRNs?h=*4=$^}L=m zpU4ky;(TddjW(*^mt)x9i0yCrYu3Ir-^dSa<9un}jSi|mkYjic)(h_7^(22les~w> zOMYSOrTWKm43E39{iZkT^IPoHzwKMAvyDdMG=28t8CZtlK3d=X*3XQ=dnk^qdku?Jw~#v|zpH2(PF3OKgnuC4V79 zRNv}hScCH#`2@@>wxjsV=mh6W{z5`jzs|#OHR?xC@p_8C#6HRSlE08)s&Df!tjBqc zOu@Wj`-;DePIJEGFJy%3J3I`Xs2}|duc!D+?6aIN`3pHg^&3148*zRkPrj-57g#G$ zKKiHJUtrA;>k0Csf6nw&e~JBNvA;xKVE!@sGQ;SX7{*>@{u22H^N-Qj8AiXwF!tSf ze|fmF*k9rpUob{W{*u7>f;`z@Sj=ASioYZnZ+{#`uDK*k1uH+U*bHzBzSyD@%WOK;>&+oiv48} z1^JS{@b$>X*CQHV1~I;1J;h%F7+;Vt`3ql<9DF^Z@g<1y1?wsP62|y~e92$fq}UjV}?5FIZ3Umr;x_$d~+uuSXuf z9?|#`#rT5t6n}|fd_lhCFMK`f7xLVm%V^MO}{;i7zoFzC?I@8RhXM%HvCn$Cn6?FQYuZ zM0tFPN%7_1l>KfT;|s=4c|F4Tg1m+65fOWFj6*4mFIZ3Umo&x~wy5y$v~ z^%Q?eV0=No>@Uh*AN5lhU$CCyFKLV~$d~;^$qjJ-5y$v~^%Q?eV0=No>@P|#f%++o zFIZ3Umo&x~2e`m&V)Xr%aPPv2vAMD7s7hF zoC)V#mm}$NAzjXd`nnuR&4sX@E@wiYxmu1S?^)zr3H5Y2lE{rvj=9o}%8|ZnF5bha zIg&0H(&bF(2f7?d&4sX@E@#4d)a6LJTu7HQp}sChQgb1!r^}ge9(6gAE*H|}OsKER zk*;bPoY$-6NV31kxf1H>awOSbFjqoOM_igF=c&V+uT%aQoG zQ6U$?db*qm=TVm<@pGo4Tu7HQp}sCh;^$I@TnOvwaweQdU5>=hv5ImbUCxC1x*Un0 zdlhmatf$MFaNe($Bgy_E=Srxj%aLS%!CVRb*;bPoJU=bq|1eLITPyZawIhu!g{)#3FlFlBk6J>UCxC1x*SQ(g|MD3XTo{ET8(5NP7LO;;uNNO&G^>jHC&Z90z z(&a+BoC)=HIg*+SVLe^Wg!8D&k#xC`E@wi0U5=#YLRe3iGvT~nEk~05WmL+QP%o;< zkz{|tT#2qnHy}s)EPU@MejHC&imDJB-vjQBECSbgnFrK z#23t!=z4SmawHS@3+71p-F{W&bQa$U7=-U4+U?cZ|29p-_q^4U;+-CR-{T*lK7OB_ zw|3Qep&q`MfZz9@!SD7DbVcwTggVH*?Ok1wOjnmPL+g5JU3_Q3((8@b@O>JuHF{p7KLtA=r)CJSLhCf?o{Z#3Vo|Y!(8Bey$@%Xt83kFcEJ?<&XS?e#EbAui02_A zJrC!gj(8sMJtcU*#qnabw+?W7B|H!KE)%@}RY#u~c=dVpRJ*aP%dKv*n{oq&fH#O{4OcJDW=G*))y7#`rU+rwjbx3QkedvXj9 z^4RU=vAfsUN#%!f3~&3uvGtsYt$puV_DRpW0rY-_?o#Lj3hhzoZiVhq=z|LFRp?%Y zKBUmMNi>g5{|@J#A~t=$0^ijtUc*;GPH`jG@b--TiS|!9&e|N+*(t*@ZFjUA_^!-H zYy0)B?OWO{*^96bP}g`9zP|yo=Y&jJ?}h9cA(PgAA$wBDql|m-%dr*e^P~7`u;mlLSm+zSWFngbC#gvcsMENTp?TL!_!XNF4 z$PM2U;kxr%hzH*ze+TcsetUMW>HS%P`y$vUiFol#kQ*TXo<@#(f-iDbwD}XAP1tJ* 
zoA2pta`|F5FY9c=eofeXOJ|eI7qj_oolV&D37dbTv&rR)*~B#-Wkur&`$l2&3p$%z zzL-s1;}_V3y{EAG8J$foU(6=1@e6Fi{#Dpa>1=ZOVm5J&UtkmVz`~|aXOqhpvx#f` z0-Lap7B&y-Y;yTxHgSz#U=#M@!e*<^CYLW}6W90!Heo+6Y&vu{xqLC3xW+HA2|0nV zxlCu1%XK!z8vjqAC)W7ChrjnI{H<5r>A4!G1>U(iJy+YnaCsxc6>f%gEeuz7Sd7;| z_rqSRNnMKmZ5g)vVV+}PyR}^w1Ka&D&oi*yEN2b$*VVu?;N+-Dtrhar6)rsXB%*=~sm z#c9qzsnkEu`8jrD1*N~>{JB7ZMGF@>|JnPB`QPAta(6NRJDh)fpqT&9oZqML-{Jh9 zg^KIH$N6vY-w77dpK|^Mh5swg|NVoaCd2nRzvEaje<^jKCB&)lS9AW46+7_f-tYl` z_;|6Mot*!M!f)mLx0LbpaDM$zv7NV&Z?+Qgd&ok12k>>j`*05Uy5D^uuGn}gL|nmB zG59G9;u?-kxFY^!gz*27pGpu{Vm#!i{bM6Tzz#(dJNmXKDWE|P@&k-Qz-Vh3q@~7q1e+rU+gRt4;~c7rjDjU zv1gv^Z4ul(9fe|RZ+n5;*(-{j&4q@0n+py1>?zdkb{8sib{4puoqL3Z-b006boUnQ zcJ~(U_h56O$2}c|3OyZCg+kHWT7~NX|e3B0Q!IPdH9xt zeZ=}G(8>8iF7!DY{;r$QwO7c+dMePN=!^RoXa_mI0xtB$`Ylkb-`ouCgD4~33>WrG z=zGNPy9mTe(#Io}vA7#9)UQGUKRX$S-)k@YE<0zdH_!@v)WtsFN5m!ZJL+-9F6>Xl z_|~ol$+mJ rEszel#Qh8Z!Ei3_9g2ldA72+l|Aj!bE9eLmboy^_{VR%sLX!Uj>muPi literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..333832ab1031d81881f6160dab86a0835b88f4e7 GIT binary patch literal 26672 zcmeHQdvH@%dOwn7TQ&wvco{J0LJVP3BjmSDz{n2>0YjWD*(F&LS+-=$Aj^&{97tJn z?O=zcX-L@ZcDJvPgph1DuT7KPY?E5rwB7C>&;gp6CX-~w-O0}EAKMw4b~`(p7WMni z^ZkT+Bf_W>6m=Q z?eB>l@P~c=o}S?F@Qcx)Kj3>P+8+yoBoK%K&o?~e?+Kpi3lDbt!_T=R{(<1r?a#Xh zdwYk2G4h;yxc}jx^M~#cfA~Pqx4%CUc$umCx)1dB2BQkm1+jN>J^r0jK1hAY!O#zXa>9L6|U77ebHCg zDz4C@8SJW8*r+kec!fMlp7V!?LVhufrrc9ek99gVQ?${R3jz zJUg9*hqOufBv%K&&eg$hvT$&EmploVvha{L3ICa^gWu)q;J>qQaC(O{3FmAi3m5V% zoU=J{_TbFfLrE4M&I}KQ`(yjXhvJ;AGz%x`Q?Vdt4`n%fD9^&fi~YmC!2>a0cU66A zI*cqFrB6pi&K}n0?4j~nJY?5ZIo;atA5P7ihmCk`&7X6&Mvd9kw)HPSGs@SUHoEs( z?pM==I%jLo*~7M+J#<`)2lZY1ob9@ohhkCRaB#ps6dH^MQ}16V8~a+{zdahWjPKu1 z-F?x)14A%;-#{=D^Y!*03Dfq+veG$i-Gx%t7&ervod%%Ct z*Bkc7d=Cvq_jBh|1_>UUlONRh?%P%8hZbhErm4HPs=hBcFyQMQs;Nf87mN6s>JQe} 
zLR%A)qOT_$^hdV5?H-EukN9Ij>pD^wO5SisgAYPmG;O?Z!-`_F{m?z!Hx%{t3?7KY zhMsp19~$T$41Z9c#TB^s2cXYoIXh=3ueu-dj|6+8gD`D)_g>M*9SKH<`v)T@MUxqN zz3K}^H|0S$SoG7mLLJ~d?0}1YJm;gnIA|;0OH3~`zmd0V*KfbSfA!A~=fOE1_(HyD zNmB;?$>OfFv4NSghMakVS0=sYN$1MxW}uJcm5`%wQ=-qWsQkYp9pud^Thca}Ou~0K z1@^==+<^&r2`~Xj-?&11;uO$czyzQPXabUad%_Fg1?)uHR**0d?F9)BVqQU_6wy(T zKp?a2iBgzu6VSkEF9JP4?_BVMba6qV6Yw~(8^`SqBk6FgNy1zw?RJKQP@E(MMjX!V zVSwhhq|sGgC?O?J11BBsszHk2pTc@p+2#lJn>s z%uddxSd=WHn4iq2xH`F-BIrGe-?V@pMbM)NdK5vA;?lUwbWOTWEOs@YY+lrCHjg(y zUS}4>?c`|uy}7nTGu&>HJUeC2D=19bZyE(EnY1rJJd(688ciM{WW^HKNf+Ex{aA6_ z0J{=0{{nH@;9IweizS-~IZB+y6HaG|V@X+Pang0N3w*j(fK6FhXi)<2C_7J-F?@o` zmWEtQ%R{d66(QFOBjhqxrEJSwV`#r9VVpFM8Yf+bYiyiURJbZCLaWKRcKl;k8h@OP zdn%jp$JzK3aE~kCS{!iCjut|mBE@UQi;F89vnoP(T{x#}U~WwE)D*_RW>-Nxk4#u| z{QKm1+_WDXHSNdYcnprmM{z!9gAF-9I$hH4FcD{SoWpT0$5M{-083yF$oUa&Gs0~~ zrkTyiG_x6*W;P?!Xj4w#!%2w4m!?d3fAD_Boz9B+ct1^eKh8CjE?N^hR_;2vfYe@? z4~b?W-~zx!fC~W^11Th|6;0 zZAKpbxcAjamn zv$)~}y}uG;qNJpvU{-PH5ulUKl2Rvpu8|%uZmuYU*fmNZFSy)h(!Jg^A$cU%dnhhn z3NZwq=OlSKZT<|LzuQvtXB2ru$CkTJu7vRoxZhQPD*FAMU~Bw8Qt=lNTMc?8Gt48FIk6L3wb0 z{e&3BbM}onOG34CzWdGEImKc0;h60tg|p_6F=sL5+mHuz9$;a!S>Q)6{55^pq?O&Iu@taGvVAMTsJ|A6%U$9aHzz&NAtKOlYo0qOe>NZ)@z`u+pb z_aBhH|A6%U2NdJbxdfWdB_N$kKsuLzbS?quTmsU$1f+8bc-1(9qJqUyCygUW^~BZL z8KN--i*(XeWtvX{R#ls(0a#r_ano2qya{qZok`*ifa5gI&P?HW$DF51XwEuy3i2q# z1LPtD&Q1Gi$RB4$N$KwC&by1up}CL6lXiSxlgYb_Ag=AjqqvUVJ*#+)Bk#r0CyHOF z5c01|@~=|)D@08pLj0MC(`nKA}!A+((-&FEzc*?@_ZsK&nMFId?GE+ zC(`nKBAsbIt3d|u3!P6qm**2{c|MVr=M!mpK9QE^6KQ!qkSefq6i~X%%dng{DQ$QeR*+&!uk|vlG@rcu;aWnv5)1=n z;PHjWfa$nRdr`BF@owbb65VAzg&fhAl|iZ$f-F6VqH=FEC(zb(PA zqZY?sd6A7T#Uk>%S987;lV&5e?@Tb9(H^QD+IH&FXM35I(car~-Z zmM2%bXw14+6~`;h$_Y8&qE2@m=qPztkL%2J6Ugtr$G%+TcccUN z+Lt5t-jBHQCo5Mu;@>DtkJXSLVij{M&s!L4`+~2RCy^)RFXE?jO}q>ps2A?%dP=ND z1~^}eRnkN4onD3|7*nAK-(}+{vAXX7=S#6lg4Dj$%WwhOhYxW*B~~NfQmrC25X)ZXwitU&wlx451XtC2tEd?{ARermtg%disTC-mem zXT~b7FFg06&b~hZ9pp(lSj4J%k>?_Y0n`tFpX)2J8u?4kS7J5v^ry_f607^3<9sDn z!$07BB~~LZbG{O*p&#jEb>Exmu^Rp%~xWT=2hII 
zNU=)uD)OXQM5~G^D6SCSmiaq z9$o`v&Z|ldK=bOAYk*LK)c|3Pe_F4iz7nf6uOeTGRhn0kuf!_NtH@VkmF89CE3r!R zD)NQ&TJVwL7qXS7MdsRpcwNO7kl6l~|>D75Pf6(!7d%C01!(MZOZNx#U#`#OnWi zHh#rkfNsyP>kVMM>Usg{o*(sey#b6jT`xel=hyWH&|cRIQ1|?(r|S)1yyy6Y4kb3~w6UfvH*pruD2dKRO z-JW098?bUKu6I&yRrma;r|S)14C#6Sx;?+HH-PrKUVysiM?GC{0Aonk3()QPb-e+! z*YyI_JwNK{dIK4AE3TzHw~AQR?fG>*0xP$o4<%M_q+Wm=tJo93Sk?6cUbd&ls@ewVguU)LM3@+z)(a;z$Qek-q{o>UiL4AH!be5nrLedV-0zpgic_B5}ey;KMAKD4#x zM?GC{0Aq;eRkWAt0N%Gw+w<#s187h4D%wkR0Pk~Kdw$f@^#(HLRa{HeSk>(Lbv*(r zuc8knR&S(UfE=q=%ByxIR@GjBZqKjl4On><_qdwwggqMo$h!Wg1?75P%E>h}D)-T>Ouyo&Zxtg3r{ z)YJ6_GUnCHH9*>)U)LkB@~Wl=xRH7Ra;#n{uf7D&&&c1is=WZ+o}cDbTto~yv;D+md{zm||nFT-^vLwADVdNPm7Jzj=YCd29k!y2-R z%4@w0>r96A35E@%hRPeg44X`b8xjm}Bkfea(aUg?$*?)Wa5K4w%DrBOEhfX(1j9DG zk1}{RqyXj>pV?jDWY0n)-*7qOpC#eBT)58_FlRUA!LxN{?Z=MFQusL6UT|!N`9OM| zLc0}uy+V5wx=Nv|6}m>DYZbaqq3ad8L7^KJx=EooDD-U#y-}ezDRi?!Z&qlpLboV% zt3tO)G`tu1eqDwh+Jf1hcbnlY_{=d*KNHWuXF_})#--20G0+j82YkL6o>MiRDef!< zT$2x<2Ydz^o*OTvpACE-RrzdeB-_qc-fM>xn!HfCxWEpn{DO1grv&-yzQOqGfDbK2 z%5NwCb|Js$aVmG;&*W~!ZoQB3*GqO=Bs-6v$vui)RS)A=Np@Q$yXqj5S1WckeT-it z+1)PL)%G)atzuWVpYiJ?yKRzP{Q#5KD|QWojNc&H-67dEKFH*aie1w%<2Olm+ahsTL8kO+qHsB|=s& zWKxYHWb1`Ys#k<;t&mBzi;%4pGO3OcvZX>M)igpjU&y5TM#xHpOsaK+EMLf^x(8)g zhhps$hrXf6FYnm@Z)%^b<&-aLqU41yYohd8_{*AzeBm__t~EY>;N(X%Ja(H^ShvA%BhI1PkmTqJ?&+9PX0Nd-mR+G{g{Ws}2?)B_r;JCB5I!qk* zdiFDL+&SCWxYx65*q z`e-6z8UBG6KIVt7!<>O%W8f>yfc*Ayau81&xg2_X^tD$w;Ix2_{s{aE`33`io80K= zXg8`oZurkwyFNS^=?g}EL;hF@f5gW(Jm`ys;75Lm7{x{Nb=~aM(8#9qjE72Zz@WM|;+X!y^OhfU_^y6QhLT z2?YE#{y=YCeM5bH(Bt>_cmkf9s)m}LV4$XZGdDEw7iziVB(J|hdsSw+8*>wlu~+c@8&_0r*AU&3Du|bALjbM!M>UFnxN-}WLnXGg7bg(HAxvx|D5y7_NVjD zaQ=G_r1O8o`HlCb^WWk8@9atE|0Cy{3jZSKS47hFKj!>P_owrJ&G{cG{Quzm+Maa% z%bY*dmCi4s0aQr#D*OeUU#|FppNWB2A#wDj`?;0#FZHJL>o}h%^XcXM-3tH5$hSL* z`2Q#fZ##gm`yHtxz}Nkb6mdreV?p8$9E!kiVG(y<?*Vo&936y@cpv~DaN_P69Kat9Blx>oF@HB;2&~x|LlX9P2g5#(r*5lt z+SqEHwtB47_D1WpwI+4iWSzFPiPNeUOSfvPrBGc}B@}9^TEuBXjWt?Ljb+wavxO^E 
zd1~v0ZhK9gIBoL?1y5^>#cgd7*IQNPu?BA2YF$~|)&}8WOKrP2ZEvy^+M6T=>vU_a zb?IAcgf3i}HJYccE}LKC%EktNz022r@2-}ecd(=-LMMfzHlAHv7C&w*2lR(|5q{z-`22~SO*1GaK4ZWd(MX6VPR{m zkc)LwprP1{>lf$(c^(|XUaZps!`xl~-h=$Q1qkE`dwM~rp!_}@vhd}le^Y(V;Eg1HMH55&xf+A;z}q5HWwHoR8mMB2U<#8fAi0e7)#W z##2bL6V#EXERRIkzslTMe}%n3T*j=qv$P25^fAK2FKjlD1{~u0MR|mmDXtw(t(V!& ZPR0ETff!fNAvn?5U*h(0#Xup+{{pcU_OJi| literal 0 HcmV?d00001 diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index c5e297b0ff..06a6c25efa 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -59,8 +59,9 @@ def run_gemm_b(x, weight, bias=None, otype=None, scaleA=None, scaleB=None): def run_bf16gemm_asm( x, weight, out_asm, bias=None, splitK=None, kernelName=None, bpreshuffle=False ): + sema = aiter.get_semaphore_workspace() return aiter.gemm_a16w16_asm( - x, weight, out_asm, bias, splitK, kernelName, bpreshuffle + x, weight, out_asm, sema, bias, splitK, kernelName, bpreshuffle ) From 37fa79bf9c977377cd0187325875a435dc2e5a34 Mon Sep 17 00:00:00 2001 From: amd-ruitang3 Date: Sat, 20 Dec 2025 15:53:49 +0800 Subject: [PATCH 2/5] update --- aiter/tuned_gemm.py | 13 +++++- hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv | 38 ++++++++++++------ ...p32bf16_tn_128x64_bshuffle_splitk_clean.co | Bin 0 -> 28656 bytes ...p32bf16_tn_160x64_bshuffle_splitk_clean.co | Bin 0 -> 32720 bytes ...fp32bf16_tn_32x64_bshuffle_splitk_clean.co | Bin 0 -> 16432 bytes ...bf16gemm_fp32bf16_tn_32x64_splitk_clean.co | Bin 0 -> 18512 bytes ...fp32bf16_tn_48x64_bshuffle_splitk_clean.co | Bin 0 -> 18464 bytes ...bf16gemm_fp32bf16_tn_48x64_splitk_clean.co | Bin 0 -> 20544 bytes ...fp32bf16_tn_64x64_bshuffle_splitk_clean.co | Bin 0 -> 20496 bytes ...bf16gemm_fp32bf16_tn_64x64_splitk_clean.co | Bin 0 -> 22576 bytes ...fp32bf16_tn_80x64_bshuffle_splitk_clean.co | Bin 0 -> 22560 bytes ...bf16gemm_fp32bf16_tn_80x64_splitk_clean.co | Bin 0 -> 24640 bytes ...fp32bf16_tn_96x64_bshuffle_splitk_clean.co | Bin 0 -> 24592 bytes 
...bf16gemm_fp32bf16_tn_96x64_splitk_clean.co | Bin 0 -> 26672 bytes 14 files changed, 36 insertions(+), 15 deletions(-) create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co diff --git a/aiter/tuned_gemm.py b/aiter/tuned_gemm.py index db96a98bd6..cb81a650ab 100644 --- a/aiter/tuned_gemm.py +++ b/aiter/tuned_gemm.py @@ -24,7 +24,14 @@ import torch.nn.functional as F from torch import Tensor -from aiter import dtypes, gemm_a16w16_asm, get_semaphore_workspace, hipb_create_extension, hipb_mm, logger +from aiter import ( + dtypes, + gemm_a16w16_asm, + get_semaphore_workspace, + hipb_create_extension, + hipb_mm, + logger, +) from aiter.jit.core import AITER_CONFIGS, AITER_LOG_TUNED_CONFIG from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.jit.utils.torch_guard import torch_compile_guard @@ -393,7 +400,9 @@ def asm_gemm( inp.shape[0], weights.shape[0], dtype=otype, device=inp.device ) sema = get_semaphore_workspace() - return gemm_a16w16_asm(inp, weights, out_asm, sema, bias, 
splitK, KernelName, bpreshuffle) + return gemm_a16w16_asm( + inp, weights, out_asm, sema, bias, splitK, KernelName, bpreshuffle + ) def triton_gemm( diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv index 9f2183e46a..bbce04c538 100644 --- a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv +++ b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv @@ -1,13 +1,25 @@ -knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias -_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1,0,64,0 -_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0,0,64,0 -_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1 +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias,clean 
+_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1,0,64,0,0 +_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0,0,64,0,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1,0 +_ZN5aiter49bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co,1,128,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co,1,32,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co,1,48,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co,1,64,64,0,1,1,64,1,1 
+_ZN5aiter48bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co,1,80,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co,1,96,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_splitk_clean.co,1,96,64,0,0,1,64,1,1 +_ZN5aiter49bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co,1,160,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_splitk_clean.co,1,32,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_splitk_clean.co,1,48,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_splitk_clean.co,1,64,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_splitk_clean.co,1,80,64,0,0,1,64,1,1 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..7594ad577d64979760ef94eaa7e8f8fb550f09b8 GIT binary patch literal 28656 zcmeHQeQ;dWb-!Az<+Uu^GWbKbW$)Su<4}9OvLs6ef2?KM2!A1n0oypNSJKKWkR`EN z3!6If(;^9>D6&u#g{EmB1PJ*cX;NrNlS&&VZITY|*d(386gp0(nMr4omeQn?G%efn zyC3VjS62qKiTV%AJbv$-bI*Bq?>l?mdB1%tpWD5Ek1Ho9=w{-T%PzAVxeo}+{q`2I zbK5NFs+foVKc5w_JW|6qD9n>EDT68W$Z=4sV@g#9ILm~nkQE9Uel!KfmJ6BPn2A5$ zCuD`XALja6MA%>J&BS&gvUT}#M;^~b;(PH^{_6eR@w0d%-rsvYo?7$U$q&bSzb_n% zuO}vqmx1mjLY+n-ve|_HzL+r|;m!j!x76LqSmLYUla0}iu>bx@tToco@z~%q{&4I> z#~W;a-&g(NhWM#)ODNpX5b5anQY;c~3_TcYjz`GS*cc;SsG~jH5P9}QOIv-otdk;UovJ;R-9hCa)| z-gSoSEKa}g8CI(q1}q1A*BMq>oTi+i%#vrqE$z{;I&4rxaI2I+a=lvwJg^smj* zfB!W6pRSL@(+f~HjR5)r9Lm!FaF+hZrs4lwM?BWt7^w-UEI2-mAo?OSX6YZz(w}N3 zO62>>VzmgRY7yF}5ky~vc$WUBv-CeZ4S%@^yV9BPiZhKU`YOEAlWlce>fI 
zE1eCm3{N9U#zKr{8{qM51AJy00c1w(ypj=5P9sXjLOh*qfakIe@RexMHb=>v_;C)H*1mCMtJkg*8=l5K#WXB*%b(+KeNL46V4nnr|-Mfg>=0e+Kh zfd83BfT#Cqi*O-_O(TT93Kw$RSx4|>9idj=fui14N6j!@gFc&NT~^OfcBO(Rg|a+G8pVO`b{*1w+!(_>Yhu5J!@Tv@k(IO6*w ze=g)yS=`?3w*EOHqeAr=M-RTA_iJIHF67i?9icYs2>ahp1oN!DN4oC$_INDR5jh!d zkG91kSI%FZIQIKJe}^q@Q=Y$X`A@{!PPNnFPn?Xj#zRfbrz4Fa+yebQi0QxNq$IJC z=>g%E<`b>%7bofAav|sVOa7DL)1ju8a6I&2TkL)roZG0tgPeSQTj=P)iZD@_vGOhT zO{JSpL{6RzHMN&*f)I+ghDtY8o!(p?s_%#rL)j7`f)nL2)X)+Mx9*+_nIWUq-``AEHLqF-2)f%v@ylDj|P3P*0%y*EK-y7I` zux96S3s?`mdG$SkZQM-i-0a2ioK#Ls+lb7@XU3tj>FgH&uhdVC`ks(m72xz`qcFtY1||DriI3F zq;VW+97h_*aruPT@%DMoFZFI8*}i1EvpunWsKUvcaIv0=A1}yBZl|}KVxJ#(<>t*! zxvuFVDV1_90-j5`mUN}gF}7-%cf?EYX>)h}#2WG|VAoz^-W>YYt@qV}ZH)CWk8jxH zDR3_SmaxvF|7!C{h~qz*%{&rf9=-IQSJ9l@ zvekm6&i`k9Rq)Pt*1p z+CJNb^<79l>|)pDg8gm>I8Wkyi3=naN?a)M8j05u7Sh_Vi=8q?r;O2gS;XkPEMjzC z7BM<6ix{1kMU2kNh*8YXc^~ENSH^KZ@qQB?Psw7u-z|8*p0!+_b7Qo>n9dO^zqFVN z%TmH6ghhl)36~KT5iTcOM!14-Ibkv33c{6y#e}N}R}y*&R}uOMy@aa?eS~WWR})@G zxQ4KV@H)bDz7mT!Zcjef=RiktF5&E1!7iYWKD!k@@5l=8OWoz&Pjpv!*+i9ZMXwK^ z#R{)DUwAJxXXFh(UxfU85%Tjz$j=ucKVO9Wd=c{VMaa(=AwOS){CpAe^F_$d7a>1i zg#3IFs(E;ay<+`H$JdW=TX){XPWn7nIBa4EVS=BNXUB1l`#moe@H+C+OH|{iK2crp z(Z1vQ4tldV_iO~Cywhdy|1IR9i#1D z+K$s!Pg^5xO|(U6yN|XO+FEIAr!7WXoVHHdPSf@fZD(ogqRpW#L0b=PeYEw{HbC1T zZ9}vT(>6leC~ae1j{mH~3U9x>nB~xCh4yo(D3QEG$!Ow8UaDj?b|e=mnLam=mnj(! 
zZ;-rP$#~d-C*WPObw>uUsAUn9u+8bQ|A2(rFLkgv8z{c?>4jWr68 zADt(jN7&cb2(rFLko7f!tgjJdeT^XNYXn(eBgpz1LDts@vc5);^)-U5uMuQ@jUel5 z1bK=zTEU*8>-y)$)%A0k_nY1E^Me!A^-fe2c^~o09&ypUoxDe6?45tQo>DR>(9K+2 z_5=kMFgKSAg8~<^JT4an1r{?8mwiEjB`lxI>w*H;vjQ#$f&$Cgd@ff71#T@$xF@!_ zdnRH*F~{}nCqHs>zEP~#1?Up9zENz}2Pep51(O16Fs^5g9H*Uc=={N6YVKc~ z6u3WtV=sJJ97oFs=o5>jUds!ohWpng1s=k^v2q-L z^{e9eT0TLaxL)eDymD%}|M8^2MvUwEkQ}Gw8}xk}q+ZKAXD|1UCIz-s;rOLL701`| z5&A@h)N6U^?C1XNNr7>U>)9g5Y559$-!`e&^42-X{ZA(ap52P$?|i#Bwc5)+zuqJp}4ERxq6LzVtjUH zKKFN1KBKP5`h0}<3OcP$v2MN|EGW>8af6S_aYjB5eO&6bd}a@Ge@{?g0rIZ@ zv6sbhjC>yWq||Hq%zC+hVNl>A_z(Vx9B1V7(4R@Yme1@F?q3uXSd6^u|J;C1sn_zEo#*~lL4jM5fBoNm+n&#Kt%rT!Z)85xHDC1u&=3Al z_?h`UbRjdJ`!9+79ry=3Ce2=ug{~X&lqRq^BC$g^m;yvYf!Dc7V-K# zjQWgmMm~?AK0~kPv$$r}$!iy{&!ec%7-!`380s_hdOnM5++lgG)gd>%o4 zhF;HSaZNlfuZ_GukD@+foRQCCsL#;r`7ExXQF$%p^?4Ze8RLw69zlJEUe9N7&25+0 zUS6L^QJ*o+$mcQCXXy2O7T4(0@>`aCA<^RTSXBeFh^%KAJe>+`Uz&m*!vkIMQyrq$=W@-pkQgZhkn4=tY)sL#-8 z`7HYlCi@LupL6KGbLEwS1QShDY`rygoaq&lqRqa{~1ldM%%2zfmar4PKvn zP@gf*$mc%PXXv$jmi^FFQc2J)&&dBEk>NE6OKFfZ?C;JUvpL6KGbLE zwS1QS#yZ(=tVexyP@gf*$may=GxS7dr+S-&dBFJ)Mx0me3t!2mFzco zeRig*&#K>0_1Www+#~CApRCW0tj`HqpL=9|?vwS|k@Yzt>vNB+ z&wW~beri@`KKG+OoBamrv%TL4%6@~_=RwqGj5G3i2=y6yEuUq-Q7iimUZ49>pE1tJ z=K<7b=(T*7{YIVaH+X#>M196MBcF#*pP|?CS@s);Wxv7eb3f`c#u@oMfcgx*md~=^ zI4=7QUY`e1pE1tJ=ONT*=(T*7{YF&w8!f!wKz+tIBcBIQpP|?CS@s+4vftqKc@Xs( z+_(j&qK04_sjY` zAnWs>tj|NTKKIM|JRs}ypsdeBT7CXE<9=`$^%>VDtv-*SK0~L~XY*bb{qrd5GsYSD zJcjxVy`Ima-$>v4qW>L6ea1K=pGQ!iq1W?S^c(4WW%!SxK4YAb&ts_1(Chgu`i=Cx zHTvIS)Mtz{@_7XH8G1dRMZb~07l;2S>NCa}`8*8x8p$Dz0l*uizo6vxqMR<2Xz%1LJI*2lC9uaoD&F8|MLk z8^>XC85n2dJdkHLj>E=f*f@9*imto^Pkf%0|L-OirE(7CioCosE#&Jj_Br$05H*$lx+;oCo}E9EbegAkm}4%u&Ha2W#@&I5UB z<2Xz%1LJI*2lC9uaoD&F8|MLk8^>XC85n2dJdkHLj>E=f*fWjCxCG_&I5U7<2cOc3NX&b zc|dRDILzk|FwVw#AkSW`YyiGn?G^J)l1Kr<#(xPpz%EHZ)rT9w@Z2U z7S_->JXeY56?^b(;=a96JVRMX+@5#u-e_X)-jW0#TgS)ZnM?QWbM>(^7jg@kxWd<|haVHsf+VGZF{!aE6rgiVB)z=MRf^!z+M(~DA`j4LM-QrG~u8kjo6Y+>k2_d9xu`8gi8(Z!zSX4Ebh5-fGC(40*dD?=a+` 
zA@4NgYD3;-$Tfz%+mQDd@-2p3Ysj}6@?JyUr^%EH$d^v~ZgtVTz{}ey1v>H){4@R( zea6-2JE48P`)Q2&eB-%udfwRgRDN9{;o4dB`Np&9^xS_T|E%M=V&k*EmDOHcf5b)9 zcCT=(EI{n zVGkI7r42$~s`(w%{5C~|eUsr=c0%aOG{4(4zw%~bFE{)u?iczB&96@L+k8^kHyeJH zZ9-qE`Q5JhRXrf=RfgY|4x!(o`Q4%U-E>OWZ!-LDeo*Ld*8D!K`E5NT>{|`LZGSBE z+cdvJn&0-13j21$Z^t>I-=X>4srd!Fg*|Ba?R;41cWQojX@1qc!d`9o?RrG$cWHii zYkoD43VV&=xBFv4zgzP=toiNvxUlar{BAif^tWh!M>N0M$ArDs@VoUBLVv5~cU1G+ z`$=KnYxwQ^l+f?f{O*x{jD7mG>-STs_YfYW5A#Ao`k|JyjPMSU*AN~e`9{LKNG>Bh zO!CcyM<-ot_seVT0ms9yO$U<#Z+XXUTcpyH*@>Zq`-UL^R=#DUF&M!bnn+{pF{EiLq2H8w;6JsA>VGucNp@A4f&8E z-)YEq8S>qReAtkW81hj=zDJW~t^XZ;ucd1JZ`}ATL1vE^qI>q=XOCBz@Sd-H*wxGqVo3d$rv9fJeHmyfiwvEcB^~=h(PT92H zS=m-Ao7P7w+j3>odTM36R@t=vTG3%`QEVjjz_Nihnu*IZ%4i$5@EvB?j z74yI8dIMX=deVK1iupF#C#|QnPZjfLwwQG9qhh{kiz)3>#Y7J|X`Xcdq+#Y7J|DJI3)Z{o`M$H4=LH1LFBiC=N?_qufeV8Iuc;AuZLQmRi~iQYO!tPYFwXy03XVII zye*D9@1UE$qpQ6*ll(1?JHJjGcP4pU9CyJT;5uWn+D~?b8p5rij#KUJZLv6wXt+OoqPg`1*(0IGNd2i3q2|`6wop@B zEEH~O3AJ^E+GA}^%`K6RjUBOujV&#mCpVDhzDPry3%)>OW4J8b*i^B(a`Wa$ARKN8 zGzQ8_E6W-pjb)V;TOy_O4~#U_H+)FVdNShYDvW;garTcgZFm$N`f_}Ct-Y4={n_lI zpx?*{^D&=Me21>Rj`022?8#D1NxO|w@7t8A z=O>xsJXw*ce@KomxFu8HEA{I(XX=Ng{;j(+^?xe$$p-c;j{XX<}0^)DFu|CIW(Co;#sBlWK}Wa{Ve1e(ph zXXqD6eWMYF{+t`VX0u~0nQ?BE`dUL@A@$E0>lu{#f>>spzk}Z8Vd~#IQr3N>xBXqm zM@et{yN=A?+7^#6f8&`}`b(9}f1>r2KN{|cGWmb#<1yxswKaz0VdjrVPRIG^@X6)| zQb$^1ZS)auY@`o3^Eb4e#2@!$_^X%ka6Ms^yv3eC))KCdw1fhIiaqJws+#m}b$NPs z_m$o1%Cxrn%GjFpfYO~6YC!4kDz&>QaAo(3wz9lT4JZrjRJ+xcyX07Ibn*6gWP zySu8=+Uh;&=~VC8q}(>`Nyn|JR@y+#&h&iuTzSX4D>tjSyDLl8?w*P*Dp+8PIz(w; ziwag+x?9b+bW@-#yT6B-oj3lPW~k=K!JP+g6@^Tt-?S0wJexNBT#&!ijzsnETPd@; zZYdeQ_(`2|Q`bj@wQ?YSHi%<+q^vG`_&MBvrBd`Qy?CkXqrwuYS9awu)%17O7{B%^ zySkn#^cntY{tCTh&!tWItLwKyNBS$kd(dAukN}}N}~|d{|~13#Lh+n_~Yl2 z{B@(maG4lKOYj+J!pZpQ!aKlzKe30-f@&?GuLDCV?;E z_mgCVm216z$vnE(I) literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co new file mode 100755 
index 0000000000000000000000000000000000000000..b7a2b37260c89983c3214af7da0bdac6d9158e97 GIT binary patch literal 32720 zcmeHQ4|G)3nZKC~VF&>Nq6AEs0RsX;oR9>90uB%`YQTtysL^4POfrFx#3V%Q()fsg zXsseutJc;SV~jDzG)+^x?UJddd$!%9d(?JM@oeo$d)htSv+ZuzZufM%U1z`F{WILT zxdF{sj%Sa}K`Zz4Y&@8f=OEIyv- zFg^ylFA?fE3Xx4E{P&9)^AT>^UUF^uEsS*xgbq|kT0(jI!_5uh`j*EYelai9T-)+f zR(5(W*1-PU2JTqzxSLy+@cPaX*t+=dzfW$`lNffN%hcYIoNr7 zxXj}8N%wHG>Y?9qu=Dn?*y1!{59L+zVyM0;5>kf^hzM?VN#No-Uj+R+D@oG7G)e!m zN%(hHhFjwUP(FzO`T*=o(tl5q{uPt(?`>&quB#50_|;XgcM?JLL8wmBKa!+B-7|5E z{8(498iaf`2u+g+q7OoAlKzL1^glcae>n(S;#b0(&LpDfqwwa@WCNT?HUQP$$zBax z;#b3)CnpglVIUq)Ho#NK26%Q70pu02>FgEJIf*C<1JRvqfZk*S^iLvy8i=!3#NZ^N zBn-rGvH@OCHo$i$5kL;a=CfBsbP`b#2IBk42KYg;0p6TM05uS2uZXuN5hYNLoqk$2w6!- z$eu)mzPgrRgbH@`UI$0L8< zN-4Iuoo`v+OJr2AGU4cfkMr|dSg5yBN|KIHnskJ+j}yT>YafxWd$p;xIoJ|D5Ne7v zHiyrizjATxk9+?1SllK&f8WllZEif+M2D|E5N>D<*3=yeR|l~K`UeoxBb<~ZHZtBI zR9{!y;Qr<~JzU;O+539lfzY8~O?{{}cu!;Vei@wG=mrmR^1Thg+jkU&h{9|xTvu6> zzos^P;6SjZsbDpPU~5A#e~tgpn!;dZON1E8`Y;ikxIG4|>cgRit>4LOYOZSwwUT;V zm)US$bNFr|NE43yC8{kcDX-+U)HXE-s~QhBv^Kqz*K+TH%EtQl0!j5HZ+|s$U-7VS zx!7xY_k`NQHO-AQlo*kK;&$7@%`J6}4W|_GEcsv)V4jmoEV1I*WrJ%7XA?6XkUV`r z>d{_ppy&&QJ^P9vP2spCwuB?KdjL;b zs2xXY$C27`q;?z^jCvjKaqpw^ycHF*PM2eVlmfT;C(UICGD~M7+W~sd&)~c)0!jcqq*dl!7h4-c~j`A+p%{uHZXRS zd3+~5o(%W=tjN5W_f#jv^e!ZytgOf-QNp8QKbj@*B)2Vyco$?xyx9vQ-i5x1*S9$0 zU7Q>7<}QtRmoAHVm#sJ(E6aNX@h*w_PWg`dPI-OaBOPpIu6O0i$a2w`b&NW$!FIo6XkrCgqt7desbJ(a}@-46D}oOMtC{la>8YVD+rep z#;@Nj%A0K8sd=QIN9Pm$%_2m9=M$p83kcEQO9|26Y(n%mhYgTzDEl)Jnc^5 zb4T3Qdgxx_jK(~k^bceA-RPd>X`e#ZgxBMu=jX@Ybx(IO_jFIhP5VBsqwA%P#w&g1 zz9XKDNa4&UKVLXAy&f^#GdyhS^qK64C!On4pd&ezaO#vmJJ3h>h$5f&)TQ3nj}#tx z?nsfBjTZYZJ?2Bs7kS0`q@RVKPeOh^3HkXXdx^osE%9Uou94M);OucP}xk;6u> zCG6nmtmibYffJrH8T@{B<_z6)={`yKB_HiOuAkDqnU*z&(y(IUM+)wgA`!c$}Ij+6*b63(_4ZB4W_)7DB`8*PVZ`y6eDX=|s=p{;|qqqH5T?F4NP(e^NHkI;6Kwo|k{PTP~T zJw@9yv^`7P*J$gct&6s9+Inc~rLB*)e%c1woxH;i%e-l99!sHIr2Q1SX_B0!WK2j& zp08v~R7qZ-WXem(mns=wMj<&{$@uaH$vH}%D&&Pq#>9{8UL{WxvQNqQ{t4L^DS5h( 
z7b`hK$d@U3hLCfWJX6R^lsrqwOO>1{kK;_6){(yIA;-B=8)NM2V+>gzW61g#L)OO_vOdO;^)ZI5 zk1=F@j3Mh|3|Svz$od#V*2fsKKE{ysF@~&>_7_vUbko7T!tdB8deT*UNV+>gzW61g#L)OO_vOdO;^)ZI5k1=F@ zj3Mh|3|Svz$od#Vo?whGWnDCHdj7PUf6VuO>&W3pA0DMu$Y@cP_W`e*v(KHeiO<;? z`|zKxpp*y*bTb#1Jpq9k%+2M@fWWydjmud9f!WN%WnVyGE=%X~vVg#qEQ8DbfWSgF zlgq^cf$Lc&mjeNTC2S6tO9KMSvO3(O%cmS2Z3~E>!OwpFqt28!vi18ObX`;4$TshP zPC1WN9u>F??YgJQcG~p>ea{T3*RCsPC->hI6b2|6*~R_$ zMg>-*UH5$1PP-nV@5zyR?YeY!bN@(GV0|%;zvvg@_}cXfeb*AH*RES<5BG113T#EY z?&Y$bc0EJivr6i<>)N@U`yYx5JiH#q&;LLiU%TF+?<$gd?Yeg=xW5w>coglr*U5HT zK0x2ILF%=(K8|+X*U5HTK0)8J zUFx;Gaw@t1Q&EA>mf-j|&B=)^^78wWcX9gYN@wLUUONUm9N4>VnUe!~)o@3=zCho7 zoA8TLuEB4Wv+5Z1J$KH@QMINN@k5JqfL&pM-E{(c4lK?ogZ=K+9J>w*?7mlE&*xX? zl*4{xGsmuD0=pj+*z?8BIlEwgY$wOACj@psEwJaSJ97eT(YF>Yc8|7AOU$?4=PBQK z9T4|t}e`!Eq8SKmUEoXxi()Hmq#d=oXhT-I)0--c1&(9X!W5!5&6 z^?VaGevhp6yuJ;hzM-9wZ$qeW(Chgo<^+4?+<@1&VbnLYGxBW&^$mJG-^3guBIgpk zz73+jp`DR$L#S`i>-i?;98GfW!Ry;F>Kob_`8I<32ECqdVvcf1&Q*AQ8$^9WJ0stQ zP~V`}^G(cY967h)^=%mS4egA48$o@8Ue7l%2Rb3=LcG2WqQ0S>QQwA8-=Np?P0X22 z%DEG-Z^Ni>XlLZx2Cq>#p~PPMD>m4?Z~SkRo`g-uIJnE1odsisBeR^ zz75IxHZ1Ggh^%jevc3(;`Zg@<+lZ`hgR;I2$@(@d>)VK|Z-cVF4axd8EbH5dR^Luf zP0Y7W)HiJ%jk?u^`ev-B<$RLK`6RDz-KcM9XXINC>KpW0zRCHdN6sgCed|PhLpvki zx=`Ps*YZuyCo|=IlGnFx)Hk#<@~sE;4SFr#qLD+J0st^P~V`}@=eYs zeR4j@>svSK8`>H9)`R*6y_RosKDkWJCwYDAM14a$Bj37E-=Np>P0lC%az4rHTQ}+( z+8O!QgZc)&mTz)CSuE$1yuNj!zM-9wZ(XQw&};c7=aT_BpXBwe8}$wCjC|`seS==h zH#wgymGenn-#RC%Z)!fN>YF*AoS?q-81=1F*0(NM-@0Xe>yh=XQ`WaGS>L*4ee03+ zty9*wE?M8YWqs?B^{rFZw=P-Vx@CRq(dyeT(h~En7xm4YPolor=ac1fKFRA_Kk6IW z8TmGV`Ubt0Z*o4lN6sgCed|SiLpvki`cU7X*YZuyC-=(vB(HD%sBdUz!wIiEZ!=aam?^`pL_osn+?sBh3~`6lO+Ps#ZtuW!8*)i*VtRQ1iAPfk$Z28{aF zE9+aItZ)6Yz75Fw)+_5sz0!Z~d~q4QTZ( z&B(Vw)Hgn-(B_k?p$-nCzM-9wZzHI0(Chgo=96*m2Dg)XXM)m>KpWWz8TyU@@)|H4egA48$x}9Ue7m!%Yy$f>Kob_`8I<3 z2ECqd2KR;a*C6T}+8Oyag!%@(o^J+M2LEByH?%YIZ3OiVdOhC^ZVl_NLDV<2GxBW+ z^$mJG-^6?}&c(rh81)V9jC>nGeS==lH!+`#b9b084o+0x)O=FaH*-EYL46xB>f4~K zZ$q-a4a@pABJ10rtZzfIz75OzHX`fWpsa60vc3(=`Zgl#+n}s(L$baN%lbB=)wdPI 
zPyXaj9p~gIZCs>{vqa9>I7*X?L^~U2iJY-ZzLzm20bxk$9L zahAxrb8(dVnoZ{_;b-G06*mbxxXMopN4ec?%qdah_`E{YI2#vf<1CR!HjdKdBGJyq zSt1W?9Hotmv~ia3w{esv7m0Q@&JuZG<0x%hq>ZzLzm20bxk$9LahAvf8%Js5B5j-{ z{B0bi$wi`_jk81^*f>fX7ir@x;cw$8O)e7cY@8+X;#?f1o^Lu=2|pW0splKGO61$8 zfup=TH8I~zj#55HEZ4b68)u0;vT>C1xnrD*L^~U2i9E1zl=3-c0vBoHEa7kCDCKj_ zI2Va_HqH`xVB;v|bI=4X(#BcB-^NkO=caKk676i9CGx<=QOf7630$O&vxL8mqm<8O z<6I=#**Ht&fsLb-&v6sDNE>Gfe;Y?BpZmtSNVKzYmdKlPag=(#>0BlJY#gPYZ{RAC zZ=VK^@(EXBzL^}Qjf=E#mdGO;M`?1AXlLUrkq0)8(#A#FI7|53I7*X?L^~U2i9E1z zlr}EX##zGO#!;GFB-+_HOXPu#qqK36HqH|MHjdKdBGJyqSt1W?9Hotmv~ia3w{esv z7m0Q@&JuZZE{;;qw=T7oBd!vD-Ig_+o^RkP`F!$Iz)^lXB{APjj#ADi6S&Ae3ulQu zvT>9q7m0Q@&JuZG<0x%hq>ZzLzm20bxk$9LahAw~UJFNQ<05UGCH!q1rO8F2osF|Z z9@sca8y9KgEa7kCC`~RB?QEPS^1#MX+PFv?X9<5BM`?1AXlLUrkvHe!DD`~n)3{3b z^;S4p^7-VafTMh$zIUvDr_1Ch<$N-MiyX3WmdGO;M`?bK4ee~4CGx<=QJUX- zLpvL13B8S@G`}Z@b~erud0^ux&F|HrosF}E-o{ay-@`*Y8)u0;uyK^;_x8}v##usd z<0#GV`JtVSvqT=)I7;(-foNyrETOk?l;-yc(ay$MB5%&cQR?|Nr0N@SmGB!rPkjSd z$>)=w0**3;@(mm%-mUoA)MKM~C!>?T(zDi^KAMtxjK1fd-mcym!}~(3s2<{dfS6}@ zW(l>!dm(t=;wav&xOH~~?|@{|vjgw$-I0#nyK_5u+X~(m?~J(btcYaceXWX$NJm9Q zE@R^Rn~a@)?+QN!oK1+ADHjtK6BZDzCoCZh5Z*$F1J@9i65c~tMtFj-obYMFUG)AT zy#s~swlY?KCXX?HKw!QjaCKB*0h`V3g#m#@j=(ihfos`fZZ8f9T;~Y9Dk|`5R>1A+ z0|GZV0yjnlUc*YbJrEGM$q~3YDsT(Ch1*L40=GH>w?zeB%WAm2G$8OgN8t5Qfj6*w zxV;$)$2L#^a2;3PJcr$yN+jj*7-r@+{9Tj*h#xaY&8=OYh3EnN6 z>k;qtLGSZ=M*k;9?~lS)UE#i9pH~KEm+!_8i zHqs^duTwktJ*$iES?Ye(q1~@eP#bl>!h3}D{+F*Sy&{uv=@hzO;ax*|-!qfnyYQZu zaqn8sO5gT}T=Xb0_KtsEnu{K3zT>(00|tG5mC)yto*rp({T}w)ccx5xgxgn#g?+W* zS5PbT1)AS&nqOg^uooJBMf-)mNb|d0^ILO3*w+|-Ya4}rt>$-!=2v{TuooMC>so|< zo#t1e`CWBT*sn7DuD(a;uh#tT)cn@pE9~nHzYTvS^cyt4yEMOzpBMIxhTk>!3H>#i zUr_T491-?_;kW62q2HwW?bZA?9~1V?hToP4gno1J z`=YRKGyJZ7ROqkO{Hip+(#M3o)bP9ROG1C0=2xxxUH^ozUvK!`@MWRDLGuf1eq~P! 
zdzs<4{VPJhUGuBa{C0d**moFyH$EryH)?*hnqT?z!d`Cp-SmRc-=z6PG{2oM3j0pO z@8*|;{$|auPV?LKvas(m{BC(g=x@>d_Gx~*UlsP3Jl7*(Ox+>3OOsdy)r7WO3sa{<=iOjT+Qt@QGvB`ZWNJo zqdKRI+xJBU?*GVhj}vO{QTo1nk5&sDl5aEQ+YR{+L#{C7I}Q0RLk=49UPBHUa-|_x z8FIBDhYh*LkZTP&V#sxdyw8yLYqG4JUizMps-4+sc-J*?4e-h*yarg?;eB-N{l4Q_ zzVy)}j_;V)x7NYCYM-qQcs~5USsN@`>mI#?t{roYkb?J%$TmyawDp3rd6Z3CJ1AR< zvT5rGW&0DYTln#`HHEVMUfHzug|hud*|fEWvi(Zgv~`EF{Y=@kHHfnPMA@|Uh_bz{ zY}(pH*?y>O+B!wq-cUAe&7y4IQ8sP;qHM#;rmba^?Nw#d)-}r3qiot5N7hhw)RoB$COQ52PxY_%BHP}laZY}#5(*)}Mfwk}gP{Eir&584_{ z*{)PJZM~*!%al!9yD8fuWz*Ji%C;T2Y%!&MqL}M!F=_3sV&>ao zO8Z1Hb8Rtc{jOqW+hR)lL@}{Ofi2-$A`U>soN9|H?Gwes8f9Ee;t^EL56C|5`jqyG zVq%RlE+%miDkh$@kBceo6UD?DWn4_+GgQnU*kVfiL@}{O85ff{5f$?rwwTgBQB15+ z#>FIFMaBG@EvB?j6ccNdaWRRTQ86F1#gz7mVq%RlE++9iDrT!KrnFBK6Kj-lF^MBm zF@v_4(mqj4tWn0rB%Vpdyxta5+9!&MHOjb{#8s)7MYfpIK2c1pQO3n2zDvdQ*GfEf;w4F1NFiVqfUB2+M5czxxx%y^znTaNHTUyB!?&LO!p; zac5SD<6g+;RygjgyToxXWH*>Ez?uC4gh2zey6vw@g&$DpcIo0C07xK9lj(bs! zIPQgfzJ=q?jfmr3(C1uq+>7_oy^DV*=m>qUgz?vVfu1b<&-iP#Kq>OqgdsmJ6?h#N zABX?X*dIQOHDb41+TBv$sscz^L0Me`{WZ|5eDpU$ePv}OzSaIb`sZ7AWqo5qZMZqu z6l#s&m+1#v8iTD7`epmgO$S+y)!w;8>3yj`vp=rlJHM9t?;HBxOZ}f)65D?$^@qd&#*NeX1vHga8~V9Y z|4Snd{knGg(DiU{JOtlaA@yereUa3ka1z@Gr2fx_{_ml8d6@cp#>(pk(%b%S!-J%^ z{oMwZ*U;D+W_i{3Hqc+9V0pC-2lFDKmI#yoL*LrW@|qi~L#-i}*BU<5%3FsH)K!r> zT;JSCDPCPo3C{AW8V}%)cQE`_i`GyjVT8QJo zlhm{L(TX;2OYZMoBbF8i`YXWCpnonW0XpR$K!@lPMAmCxu~Io4_;p{Uh4PI6T`< zHt76X8|*6o*9^V#SGb$(YV7o;XXNqyvSTyI=%Y>bUtK?UkxuoUcHTyJ$;H^_>%$i`A4B{HT& zj-(yC^W+n2DMlK^o?4{C79eioI9b~?9gwWqT8Qj0VEMz|?5uw{SOTa+2V_7Cr2BFH#l@ZEeeo`^k_PZyKWlSt%&7Axf9)5&YIne0?N^GaYo zK9~H?8~Zl?*&fBGpCZVlj-@y>l$lp>eS3kGQXsWyqt?CxR7Wf z^xD+f7*)YF70>39FQqdK=ImW(IKc;VnGAN%8M;k`KIs{b@fpe{gWYq6 zeI`Pm^bE)O41*?v-E)Q`CPJIekY&j$@k}li=fg%d4FXRIOs;!r(C68tMfsx^<&V`N ze|0KZtUBOC9SZagc-*4=mn_OZQHT6%g5_w-) z%$+d6osg?TiQWlCi}GhI%Ac!4o;hKnnhDpHI<)A$aQ%W+1sAO z&{E@wZ&_9FM^+WQP=^AR5o7BaaitC|HIBGyRl#dk6@0%A1>6zq8S%q9wA47_O{)st zwyNMKbtqtt7+=qbN*!8i9P!sy6})3r!Sy;6a7V0X#LYUi)Hvd2Ru$Z~s^I-P6kL5Y 
zk)Gq1&FgD*XsB_*FRUu~(5ix8)}i3)alI2hszXDK6aLMrg8#It;D72+aP@OKC)}`+ zI#lSraKk28)*xEe;H*Q#wL&hFE}rCz;)cythmu-XbXwNnwyeQZhlcN_3$g5}Vr*(4 zvhEJK4n?)@@LJZ;XIX>)el*m#t8#Tb9WSi=EvSunf15uyY$GOO_j;^<4K|}xOfnI}N1(sOjp;w9o06~_S!IZ4(zElzTN~&oa>Mr3w*zzW zGqIUWycl~bn?K2v(>^HhxHbjCT|Av$=`$h(VBt336~a|?R$a3 zY%U+0&Yqet=3WgH&dyC`Gw&U?GL^u|1nhiOwcfCkHv&(^PbX*cS#Tu!Bg%JnPbc$* zbasA)Z#%Qy;Eezowl=^nG2drrETaHBU}rqa_UWUHj{N+jTl6I$hytYz%K!Yv-#zhb zZv&9<AAa-{2${zM$HqzUA$Rhw#C zR#kY1(`2u#!QcwOj{sHx(!e#_D^~!I0;~X304e}UqrDOZ5CwPwaa&VGMrd!U1Q9kg zY3>l3Du|Lcd&LFrQUJ>6wie(67;OhVh&!4pCjdS}?D8{qK~@F9r-H9kyIq5XP`FH* zLvFMQx$5QEjB~A@v~&f$wR2(*k@H0w0CIM%vH(DtM-PbX4SrR0WejycOpEm+IK9d=LzZFDXmCgQ)JOm z?gBX{*>;OaHhAl{^se&|As2`!Ulv8Du+yD-;0o||!|8UXc2odf(ApKZ1~1dTE{)GE zEh6uZio7CUkXIyGS}YNdSMqpL5-FLEb8)kAN}B#F_Zp|9jk5%E?uIc1ut69jr2T*a6kmYT{9WU1=2|58h3#U5Qy?Bj4w!g*u?{nH3Cc2c_GP26zL&=Emp(LK*TdEK4i9q_-DCi!UyQB~l)dcP+`7OJPa#C)h&$ z36S~|AoV9e&L>@#H2Z)??E`pdv8ntp*Sx+KCD;CJS+%22RMpo4F|s=rus-@)9X_Gqv4zWy6JFlGN7uiH z_J0xjANTe(1Q#A|7+l!f5L)<5Lu8?^VFd64fFA^WbU}pNRO~?*cM#4HoDn!j;5-25 zK{!Vjl)yQKxW2H~PHd11(1tug+{JOcfCApZalEJkz7=r4ASg;>W2wo%3_6}z>y%3r zZugdyeo<7EmX?yE{~1NuryB#?`Y{mKkAe8z#^_^X_zhzObz@*#KL+CZF%aKmj9ug< zShL?*#kIFn`qAP!h!b2b%3-(kw8Uy_XWJO9t%Th9hi%}Fs0IaMr??o^pwnFv%4yq$ z@@b;!aFd_^y=T*>y*ol1J7RINu1?j#vM%US-Y^Cz!b(_ACA+8n<9c!T#Ruc@N^-L#1kLk?!58$yLBslv||wCYsg8KJ1&+xG`=B> zuOU}aK7#Qz?Zx;S zY6;6{!t$BM*NgE*K0|zc7+<98;>+?`Wcf_vD{mHGp3gkK#(dr+zCM%q{-a_2_F;UD z`Hb;3&u3Ru%V!#2KgJjN4E6>vzDU>E%ktSB)$*Cfw{Nrf@_gp;HRkgs@eP>7_nN`p zAja32&lq3xd?vgWVjhGrzGnH1@iouqYVE{)4Q>`+p3gkK#(dr+z7do7J_+`I`%dYu zHQKBeo7HTz*{ntzYccYf)oir+ZZ%rJX7#li<;-d{ug%zxwVK!HenX6nHQKBeo7HTz z!>mRdYccYf)oh&C-D6v)oirGtVSDaG4h$!Y@GMqYP8HPWDV=>#ahknJ!`PnSfg1!*VN*@CN&%FFssqVT8w;VH5=!Bw;HXtcdxEi zqnzKQM(gdxTFvb}WU$v*qgg)J)Z#&tnvHgt)o7N_)mn^v5tEvY^S)b+*4sPC;|sMK zquMO2YB&0Oci~uNw{Cd z=V5%#ynxS?dweN;_I1Itf#mb0NrM0x#`7oU}det*i1`x<_Ks^s^33A=YEWc9sm zZcx|(5WfG#eFJ;(6U-a; zBRWNG-v;TjB$oeMg*^utryJ(9r2(E7mCy%*+Xau~px 
z?>69)4u`qme7C@TU}szKuSQ`Ct@aZ2`Agt)?(dS$-xq<0`y2N@;JICX$>Dba^fiLN zaeoB%eq7Y|_4l9)sNO-_Yp0b#lErx$Z++f44zSI;_zpo!o;= zj*vgTy-kKtiU4~zx>zcJ_ZaYA1OA{6XR*Ho z_YXYw&)RU0p>_@a{wKT!hfC6>;jhcfZrM>@ROBT|9#-%f{r}q9>=_nd-wW2OVU61G z*%xc1w1@pnS}vEF{tYyVy`|NojmYk}pOfO+{C=a?yM553s5I` zKKvH+E71Q_lYW%rbbd`jop_J3K_}E2t}|?=ll5<=6W7QFolvj1PT5Q+>)%W#u8|Eo zq2_U&t!6q|znMHRWI9UGzga3X<&0_Ne>xcUSdK9?= zhI4Q%JrBP&Iv~SukmO^>Cgs6k0RH5@eVOe1Y%(9q#fvF?SsyE8W5pD_%+KfM3bE<< ze5`OPm&@jhz%hL?KAWDOh5lqLk(@d;8%xj6WMebgd@P>H#Il80E}xxAXOe||h5Yos zOy=}lKTw`bP8TU62NQ|-P&_dcjtoa4$zVJ_9ZUp=28M^GlZm0>@c!fg{6J}XYWiV5 z>x~e%wh8>I>fC2)`<#s4>93_f!049^bhy=kqnUhUp!YNS3kLcKqrYjOPcZtLfqtCP zZw6?i`tQq(er{itL>mc4f5AYfL56wWrZ0TgWxmGvud?6U@t!3{Cj*?K!OM((^k6Oh zPZ(Wzq?Ue-(bw=kTX*~!Z8wu&!241C@e@Xm8t8w+=zatJT}F2u(Q6^(14ch*823X) zPZ{X{%IIrjwR-L_`uovZdJ9dUX7Y}K-pT0(J@7ghj%IRM3v^Z4UPeD>poba#my=ai z+K4i`Z1D3tNVki`&3-S*r9KCAv+qJK0^RJp5E7Wr7Lz29I6DuYx{$!^{HZ`HUPux4 zAL+$B3FNbhcri`_#pIbH<&Mv#r-7Wz8E6A4)0Bru(w!;gaqel}B#PXSDUwARuf z$;79UnOHCwK2mLtgsaW5P_;QeQEiS5uQw;F&G8Z592lyK4Xo=Oh*ZS}#v`0}XmCGo z4on1jGqit@HzN}Rycr&@4iP$16$?$Si%pDk-r-<%4&n7Vgu`Q;A$+8&EIeMFS$MoU zaQH~I8HrStMb?!?#;c=6CPplMP-96Lzs8A8e);&=qrYj$VxEQf4>6sa;k0v6-e8db zK9+a$>lVjR7O#a|j9(u)JjxjH`iNt}YkGR{%U(NR|Dmc`F3+!z9C{f)@8|N2Oz?9n z)Xn?(^^`-IG4g&se-0(+Z_vr}>o^D`1etY57}_X{HsHbQ691ibP9scFv#{r){hd3Pf9p^zfGOS@iHnBE9WjOi#dH(z^ i(D>Y;Sv^t*v?jNockppJo`Qy%{B0(G%OGGt$^QaNrG_m4 literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..432c63e0d0886c12442a17222121c239a7dd85e0 GIT binary patch literal 18512 zcmeHPeQaCTb-$!2`H7+|%9d?fR?H`xmSsn#sBgMU9Lka%C$Swj$O?DJ!cZb5vZh3i zq#e8U=qJ`vgv^LNb?AnzK%B&JvbJeEz-_WM5ZPhC@`pXxS^r3u25^B5D6kDMwr&{q z;kR?{yZ7<(`YDdcpy_};cJ2QX*YPX zyOJLB89b+=9**_cK34y1B~2SF`2n9i>(37IPou~6`TU;xhg=lr_i6j%-F^}Du)oz_ z*cZhU2aK0Q?`$x{oYJ-+S_Ah-P^SVNJu?32vEL-5@V&@PG@gz4PsB2_v1Io5FTC!L 
zWTvw}BZn8h=Z{S0PDYa9$mC=!oBehs7Kw(R&m?j&sEI~1APZ;Hk;&NA>11jml6>7i z8<~l{GX91?H8quu<;d&)Y~rgitr(_(Wnh`j*i&JaKYrDwdfyR9;I*qFhO_ zP_Q0BOK$Mxh*=o?+85>N}u%zNBIaO( zLjTG{EN236tO^4n08iJbf4oNhXR6SDGn>mKqOtJ+_k!oDFd~8wtx-Q-qdvq;43Vqy zVh%zd2O(XB5fOx3jryl+)IVE=J_BLQ^n~kL6;?zPuAi^fz+$Zi;MrTPH;kFyaQ#vh zRw{t_gIW#zQLP5PRfPfO5u;^~xLk#m3Lvi3YT(UU4Scr>100C5NBp1)D-}RotJT0e zwHkQ03Ihy?{bi3RRbiz9h`*}UznM@10sRAHe4gkRQb;6G|L@ZVJ!xblPm!VMd# z!i0#z4V$B83rfuvTvb@OnoTDYxf6U-+_1T;Fj5IcTg?_cHCyPY!os%`*>LJ)EEj5YqoHxW($W`V?kcEPnoWJE1k=Pv$2^-I-bhJ%IkN`=zF#6cicj4#r6Ah z|8ypGG7a5N&%|bP;i<&wSTu~k0{smprhi0}l8}uw8$^4Q?~$X*4cl|?_-7)g z!&AvfF8q8dbAlPCbr9e&Ir-s0__vM@MIeQl8Q429)i*pHo0$nur3d@T{fs$+fOzd|cEn3mvH{gk4vwg>(ola-Mlc|%l zx%3N*Z}O zlqzOTs?Y(X_z)~89Z6ZU8pto(q-U7KubV1pc+t8Z!d*_gn%AH-qui3k=h$d0i<;eCG-p8Xec2e z&GwQT;MRbu2zw*60~%@pJIFg5O2>e{NbKs1c897v99=qKt=sJeC4|xi(x4WhY$xAb zt|NcATvs2+pDSG`-Xu%QH_1zIzX@FEIKBrLB4^#h_Vl zrnFISq_keIr*wzDgHmYs0={X1c9cRpN}(O4(2mmfqF3`ScrR}9?pxZoai6xYuLo>SIW~LZn=V7!t#Erh@r@;*=Z$j7W6(>quH9(o@y=n} zjU{zSJ+Cf#RqtGZbojg-9dR!ySoX8HQa=U5{^k4iQ!x6OhcR!3J{>Rz$0#8$6K7Y! 
z>GV08pl|3OG1>@o)yT`sxTcSI8;W&g-o)%b^+HjzAIod@r{I1X?l0vLpL)e(~L94b}k1Q&1FN3yxIS+FXe&^Pw+Gg^LtSdYHZT`#>jsK#TY+NRbwF{Q3X0Qrk=k~t^DdXKp~)Hl zI`F#Uaw|57oxm5IT|STQT~ZyeN4$Pb_xEZVY#_ZiK3 zQJi=6b5A;2lzctR!K>Ke-R{EsjwU;CG%0ZhlvOH&`6UQZool}`r?}#SYrpxsgKM2h zwBcx0$eO0LWKMBXxeaB&*8yEqAIc+D!Ow?O@6s0UJ9C3`FV78mNinEynOD(whrGu6 z!My4E0j28)l&&98-i~4cJjDVi#R4eB0w~1-D8&LO#R4eB0_c7H03MKr9|Wl%K-ePQ zU@%U70v}QEc|2eMzYh2wheOlq>kAD%3xJbh zr;XS(9m~wl2F^1(8#&MHY~noZLA2}PJbrWl{ASMMOM2kjIlso>w*bFKQ8cZwvEbbE zqNeo<^TDz>ALPaPAivUl`VBjDK3Ep#gS|NMSS)QdAu)p*)zMXd6Yi03Ay{xdRXd^K^??Sc_m~}m)3XpqYgsUl0ki* zf}^{P~=BBpYC4;tMJ6|JfC*X|ou4X0|@YdE* zeNV}t9X{;4b*V$|^isUN+nmK7t!G}`E7)e+252Mg8NRp3Q*?gaA%pf{d*5c(URX1f zcRj%5;@Y8IaqW;7*A97c?T{DNP8|ClZO=U4Ya!D4cIP%n@gM6dvF`2!tc5*@Snb{c zSV?=4bh33dr~}*e>|*UCto?hKT)>)aXMSK%7tX)C&jM@QTd;r7@3GD9A!vg#!T&kd zT888QD%j;v0uB{zAgJTwwJJ`evNVwYve_&kr%N> zUc?&tRbV|NVNLy7cvhNzEj=??3?VFrl>HjpS@|`}1;1u7L}4*xWxvLKDqv0h8f6w( zvlybX7*h6YY%gI={Tk&W*2s%kBQIi&yofdOtHAp2tm3s4L#SU{#}IeOh#_Ua#�c zG+v`zz?#JnPsoTNEBiI>Qvqx0*C?~Vn#GXrk`Y77evR!Vtf^n4T*MlA5o_c{tdSS7 zMt&7opOdhrel5ii8n3Nm2;t9RGhSQ75E`$oV~F|OUNK(dJ{7R0evL9Ktfgna8LzRu zgf;bRl#5s+FJg_nh&A#e*2u2{Yad|!@&6P)@hoC>=CC@OAl_DI5&6tv#&9XFAU5|q zi}2^5)tSTUjDj|Qoz5cf**^I!Vs+-QI-8ijEyWS}%z^E!&L%7RHttpC+Z=1FGl$g~ z#q@2oVddK>7kwLf(YKKoeH(evw~=3kZ_h|r%V!a*Gl$jL#Pn+^j>u;Y)32qtvZ7z( zUX`)7ICEH?QB1!^8xq!^$5~{ZgtdGYu{v{5zn0D>re8~E5&6tv`n5$2u{>kr^IpQ* z>dawvMlt;wZAe&u9%qraA)j;4?~WJ;V1D>JIEx__XAY~giRstYF~mG`n0_stO;+@4 z+@})OR%Z^YGm7cgXhXvK^EitWpIgwM7j1JkBCEz#7jYm>=F_pD$j9eE*2oS^PQViZ#eR^7k>A+y8B7i@8i) z>2i-l?gjJcm@ht$x#68%G(YNw^O(1*D_-d8@)c;?AZ?2|cE?`Gq2l`x$fXv7K_6kc z7($jmT(5$`R-hoooDb%H2uXf`Ih>F|J2iuDD;abtbSK*JK;EbU z#*ObW+7#n`56V@qQv5F+-c>+9TVYIV>mcW<4SwuU-SBuEyybWV#)EvP#BY=M?Go>k z_#G18CGk529>#*>T?^;Kw&uWJ?t>vT*$Wi&R{?X5cR|2=5!!IP@tq0e-_=*0K{wE@ zdcYgstH3)WH^m=O15kwZ7tQ9Hax)o-br8te{2ZYdl7g`m81%#e429F7Q7-cA1b zR(<13w0_$GqkfyDxBXE=zFpAUCFuDcGwOYk-i{{>`3^yEx1iT`#HjC*^maaJ$ae~Q z4>CPM{`iCS-LRAjP<`BpdctzDWzeQ9^epbvg(-&y3obLeA7x3HEP@Vg{_x5Pgv 
z@NDfb!ZV$({RJE5=_~i(*FNJtI8yLl9QnGs;8C5$IZd7Ssv{caVSnBBX6J~bI1F=_ z_oxl?jZoLi>xBKv>$-WJuy=V~C$AIsF|WgSy|h1JPxHFA;1vSS6^vkEf7k)zi0^V)>c=-#qyUPUQ z0>laK5068=2K7I&s7F06=g%ax`LLBuh&9~ikd;kVzmiSdBlnCG;uW{4TG?dvE7`<7 za*s`jdE91`l}%P}Ws~obgP_M_n)iU$rk2caDDN zfc^eTpL5V}^KO_sJyXrWna8;9Y%*vQ<2{Vxul#tOWdf(c5_p+UuYbF%r?3!Y^>8kr z*E8Gz<2gB;n1#Pm{(=gByR05QJg)W!{P0ih?oFmCLI+BayM}FaKDx8bMM}V31 zOg20jnGI)8rqiiR4q8l}h)gGDr=dO;j>aZVPKOh-Q>pM&Die+*li^f0oX(`C63JM$ zH=CL4O(svx>;cJFVv{+_sDWrSG7yPQ4GoVB561$L$YdZI80Z@rn2bdSMuzsr`rr?n zCnqL%@loGPF`Y;t;c@oC$~qU5xAj!Y4S3g(OOkvWYkx{D z`DID|uT1WFyt4gWCU1MJQr<`%2oh$Jyp78xJMbA9Tx-aCPgL63&E(f4`4E%)Bs@b* zeq56O2<3K#9DBCX{u3a#`p)zm$gRFJCH~n|E=K&(GqdomEb&jzp7h5f**IZ8QJ%{X ze_$N~{_)#&z?{0IE37~P%HcAF1$;d=3 z84d)74w}W_kXalZG>iMk%;M-sxj1eX_Xl~gZ)m?++-s`!m5ucen$p1$(?b8yC|4UE z4e(-c%pBQBaF|O+0_GTo#}9Dn@c3R{+}qdBizCBk7bC;wAcKLju`zSTBV*-`1Lfj~ zX)IVa792Bs3yu$0^QCSzI{0;Oc>K#pM~^&a1g%PkTP25X3%iJ>qo^w8rSM{u|Nj|= zkkFJiFC#@=eCBaA{v70#haG$lVqXf}c=5+u3zYv|*Hc`dKLI+cQGb z>Rg{cH#t=$eg1sq)C={Pw&(i%In8O3>2rc}5U-uUpp5I&0bv8_`*9TE6{0`OKk%`F z`urSWevjU6v@!o*7p@`=0KZp9U7Q16JpS)Ja~~+<`Xv^>*vCJ_ zhdI$?mswOK6WUY d*X8|2k4~On%9d@~mc=JUwq;wQDT<?R-8n3oFMIz&MQNSl&F{z zIg)nl&Xb>5OEJd7Bu7GT3(@O0k!y!Zazx%WKY_ndQwMuyEMQ@}#>qlzpM6T3zdVAq#} z`o$yFDDx8q{;nl9QUhZ24e06>G|z%@sxUhg^ER2NglJoImQ!NadGv!W5VBS0$(~ZG zwCOyjq968E8DRP^m(rxplHc;dv-Zp&zZ*a9&&T)JKV~90zPoLYH~T2~VSD#_U|SSV z>@Xe@wQSPGMXIx;7U*v_5~2bfJQ951*cS-Nem^`FiKWA?6VcRkG@kzY^RKzWsfqN@ z$;kZoUE#6J$#6Ur9vh3M)89))!;#Q4smV+fG?7ROWTA92JQlq=5l@VUuVPktq;yy-d>j-QN%PE1Zm-quY+qbJA5qp3OF<<(?3!krWYr#D8= zglGsJDRU+neRZqU{1H4gbfVavBz=2rKhp=v2KcDb06#Ayz?CDB$thko zuPv1kp#+3qRvO?}l?M2A83C>w6+!s4j0hzl{A;BFep_jP|0*NEm4^iouA4|1Aw(3e zn=BPaP%4gKDOsn6-Vf}mk8zUs$3bG45ycIbL%bM+vd-8li$GYPCx74fX%4TXi4jVdl|36 zF4T2Xu;K`ZD~>R7FA?Osc9+|AZznUUP&zslPR0_c=<@wLrnh~s_wNYMivdi@yUs4%a2yb(dD}7sdrsd;nSh< zcsLV!CXqV9f>RwV@VGhoaewGbM|;Asg_-jDN5{K-6Va)u(0H=jgG?wh9rF55`@Esi zbPP5q<5Ae(#MLo07LSIf5BRa>yvjz5|QRaB__g??q$Burj z3a;_wThyp4IVu5xRN@Q>mUQH9n9|b56}U@4UF#Ij}IW 
zX+Rsu4!qE#)#S|NeD1?_ru+a5w?JN9G*{Ks7R>8sfGZTtjY!WG%$sHk=Ll)u;#_dT zn0&L=+&1vDkqtMA(*$qb=0339Psn+qsFxJQX4&G1ZGH!Ao8fdgVw>_n&+FHDhfXh1 z-PV|MYm;v4aL%Irro6hKo>v!~s&h6=n%kVs%`uf^4coe~T3cDYjmvk_R#tCo4*K2< z?OI?gmO(;ZBG#Q*tF_Itra6XvK&-aHm^Jd!BHr5r&YD~mnJZ%VABAjAGasAL%#XqO zIGkUcL7ZyAhuoN1vb9?@r1gxh1zH2+A~#Mk-6^I!wS>ARiuF9?%Ug>Y+#Bp)R#DpQ z*uVYQKV>K7>(tnVCg(x}@!qtwz``-2ikW&_eHHkEVzXZ`!MjP|v(}C_ zN5Q$E!iz)A>Cy_WeVXQgOxowBbo*AgKQJc>L^5iZ4)# zFHnjvQ10ivnm}@Y9Ag5Kh z*33^q9*)is`{PS4cb%48_k6BkMvMvtcOBd_vvmg7S@#<2PD|DJ%q452jqAGveHYa~ z4fu@A_v(uZy26jZ7|Jq%~3h_PkCz(9XLB>{^`kUN~E=xel#kj^clSaQQeWpnQ-HMmxTV#SOI4NP=kEgi zFsE2<;Q0G`VEiZ(a)flz{*!=CEoj%jpV>+H4cyP<0)C{M`YQpQ+HidSK^VW3-ze`p z#N+~gq=)+313GO)yZ#4P!4Kvk>IaT6{9r!vxr%uBA2r6W?+c~)^*^cOH}Lz*__a#- z1u*|`J_z^?Vg8{^#81z02g`Mue?iPYw3F}~#{5IMh@UktsGZFvnt#F7@(yC>QW!^H*W>m*$@v^AGJL{5+U{C>QW!^ViPiFU>#i zYWc_KFV8=D{;raL9x4C4EdP8${$(Zn{Fs08{Kfn;p1%$@e`)^pWB#F?gx>(>AIb&% z*!)%5{H6Kl$NWP(3BNwfKa>mjvH9E1<}b~^{?+o2&tIN@^88&T|N5l->u32lAmpD% z!Y_dNhjT@oznFh0TRDFTuT_|*LCin2lkgkH{6o2jpH$m$eg!c9&`!c{2=fo+B7RaW zME~Gw`N!uk&p&zou9AO4QvL;5{tXNH_xDhLowu^Lt-(gM)~F^UhDJ45uC-`qRFe?{ zqZ({fYmI6$`robw^ED6c#9E9s#Ha>~wHIsgYBl)l(ld-)gN8{ zqneDk7}a38)}ozJO~&yX)nKDqYgCib|8_N4#LvgqLa4=vi`!rg6!F7ajCMvf_+<${ zxdyZOTT*NL3~DmsVpN0WT8nl@H5tcmRD+Fbtx-)z|J&7I5x)VU79%cxgBmR2hqV~( zjB4d&2Ft%A(axwQ89d?Tl(N%5PVLMf`?&{y{B9 zTmpBRo#FevxM%nb*y6rULFupu z;rAl$x8Xk4dEB$=?10_M%Zs} zLKZ*X)C2(=fT}=Sfr1wIGx7I6A@Psk^$Gm$q;$Kc)8@QRouq-v)qqZ0G@Z8Qb$TCZ zrTVsjPIqWJ-I>?vE^NmE`w2BLKHNubRP=pkl&el9_n!sWD}{bEL*Lg`!9JDd{lwB_ zhes{%P0Io3FY?!AnV@|8@96UFAcx0PDsLyhxmjKJ0@XL)uj`v7Kj#5m z?iBoX3x4XLu2&_$mV>&yMey4r__ZF=^{tZMeGlmJ`vkwef?wN%y1q^F+wrh2-y!&Q z2!1<{==zye=`7Xh)llc+y#~*EKhtyMm;ycF%pdBmbOgo!1yETQ*fjxPh?qzeP zgUy-FyPg9V_#8O=iKSi0DGm7D62C{{_ey++#CHlj%h47gN1Gu>x4Cq>-KS-u-8<^0(IV@?>wgteN-3>n2_`qg6MnzEvwu-@~S4;jT|`qg6M znzAA$)C?Z8$0#P#uND*6loc_d?(mqZQB0;^EherhD`GLRt&&kN- zH2mx0J{A7Wu{tsmR6TAN{HM0>izlWhqNz|aoQdJbd7*S7l!?K|d#U79Iy4rZ4y8{f 
zlZjLYEXGcRCnl#SKpzc7qN67#LX*?uiO_f=6$;1Wp+q{AOeMxAr-H}MRI~*DB@%8$AQFk~z=8m|#yL!9FqLJ?2p8e4-_y@^jqhq^yuUAss z{wDCRsL$>#)!CTb0ne`Dv6;zVl;o{Uu5^mlgy?K7xgy#7nf-f`{2-HmD#?#Bxno~x z{3n_GLrETCatr;CYuSh1WEk(|u4NfseTA~MmesG{-{CbvlK{))*z4Hix5#lJAQ zdbm`6i^<;_E|u5O1ga%3O7cb~SEM-b`4=3ua>bRrWauE?2b__UR{CZtu%XQ*{}YH|$3(ReBWQ#=xZ2~J#Ni7EUTnBZr!neZsk7qn=!>)L5lz=7+Rct|Wc z0!oxgWsq?HABfJ(7DYVH7vY*?hjM W&lS8~&d1=w$p3xje^YXhSn}V)yyi9l literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..191f0841aea84f87a1f224218765002d966b2c3a GIT binary patch literal 20544 zcmeHPeQaCTb-$!2iJ~Q&vSpi=W%0?8ZP}J6N}?nxa}-LJ9e>AhlPn3cGL%S(iYbvJ zX~)St`H8g@!wX_hDY{}UkT2Uw+oa8aw9VE`WQPGMhIp{E{*fRJ=3*Vt0UHqP3&B3j zopV1PFR!2Ch|Gd?7!CUO&bjBD`*H3$@9;iSXZ9T0YqeMcHo{&NWRzfQ`BV)_e=*xG{ltE{ANFmU?j=a@C)9b70Az`C-lYqj{SW)OxNWz_9xc-KG4JdmixlK=z8LS z@ulF$YZ+UhSo>;Ve+X?F;LzcbdyYOpNap*YschEf}ydoNGkQ+WF!<0K9QVEN1!DfP698ON`%HDS0-Zd(NOHNXF4<$d2!?w zPkekl6-krJp48-15&QQ%CquClk>K&k>G11JG&p)aPaWZI(V5evv)hi)Wuo+Xk8ntgP&4W5wj*paQTn_` zI4nl!HR0EiD*c48(=Q4;x*K*IyW!o7C0Hq1 zh_9Dx;G5+d_;v{f_>LGV?1&diuu`-Tm&!HpYPkkpE5U$Rh{BF|qXa8O3-M;T2Hq;y zz}qDl;0rNa*b%uBtQ0N8UzTg&XXP4rw*&)XAqqR=E`j>wDfaS0ZR7U5sZHSkHf2L7uA1D6g;i*VIK zN-!a>!c~i{Yzy|XE!37^;YunIn@k@UL2=dMEWt?eQZ$rp!Bw_}rV=cCcQO@>pGXHs zyZQ>tp_O2$csW|jw$N6#h4$rGDD70`((q&`RaiGK>v4I9KUXb&6SdoYtbY{_qk^MF zy$6;vUz1a)tCo?nE$l1X!lC6@P`&mUkLzAfq?5r^WGa-1#*>kP|BkZ0m+QZWOw^Y2 z-=BIWlJOG>=zd}Pd5_5^mNeY zKh@`hV@*to!Le8*G`;61oXaB zxVDmSELM;|TCAw_&Ya0Tlf6b37O#mJ&jLVK#n5FQ3N@PAV;w=>(<@#?hEVPgA0S}2KB+r z;IqAYRn|(*W`DlQk{g8S=E=*8){3g?ymj>qQ2D&I0r5=Ux^5fjq3U&d~4b>jF2 zaD4Wyl&;aD=bGFL^~87G2?=2}U_Ia(z}0{afNKEP0yY3rzqnv8G-(S?;5lKgs9O(+ zx~lM!M=4xwOt5!IoUjsUCuXUcYz_Ur9GmiFFSKhs# zL7LXx9zE~r)O8o^q)sozO^x6mh>1LTy=ZRhac;j=nA`dqPxRa-_ks)hUk_v70O$g2 
z1l$0)5wH=k32-CeCcq}ZX24B=bl>4zH)#v&fwvy!fVNzKXlnx?+G+$uTN?q62AYa8b@-0!_^=1-V6^(P?pCm{7FAoV98^(Uav zqiX{+T^m5UHh^?(0O{HQ(zOAkYXeBv2Jm+K0;bOQ#UR}mkp75!a4<^u2;8{x?k-*5 z57^bM>l$Ep55>D?tFj#sKfOB1?gY%xeRE|I`#Wb3*V6b7hatZ}&VV@9;99pvA@&}h zA*vBE8gnd!S9s7q8{{ zA}!AsX?ea#%kxEAo-fk!e34#az8lF)klSBe#QV(#_m5{!-}_7!Dyb~qJ9}O3r`^2v zXs8>awFe zW?S|hTe0u_UEn*6YaTPW_ILJXn)1k#Vuy6mwIczBHk8{nz~vO*4c^81lJ7_l)wc&2 z*5bPQhhSZm9E1Ej_i(=CJJL(_odJdoD7S0BiSIBs@VG>o!NVYfJSjef@AUJ0Z!`4a zn)~mAHAi{HcXvL(`HJs$JRUM@)h6FxI@0|JEUdbA-xRW{X>OG+)1^F5qC;W!gD6!In!pu z9m*-bqj86P$#*^}U!*@?9zN2xc4gmySEkLz3@k+#$~_?pUt$=NJ~^4&@c!(YQmt z;yW65$X9$v;|}?|a(YQmNneUWZ&bZ&9yy80=cgR< znxZaIYf98Lt3lC!U*=k~UHP1#)|zHDrCIH1#GaBj)tVCJ%xX_uKeJlXtfn-pJyHL5 zwWjc$T2rE~Sq*B$9_BN%*h9Wq>>=MQ_K+{f9@28`A-#;)`*TZiy`t8d{BvYcO=(tp z8gZxOP5!yls40!OQ}QUTAB{UDZ}QKnMKvXjJ0*{zzG>WHF5-F9@SRdqqOMsDYQ!Dd zP<;1gs5Or$zN2x6bsfLwnbnju?odX$2O4pQeCeL2)|5uvp`3IN#Py?bhkWUtXI4|1 z)t;zt8h2Q~DZb-5)2Jy?*Q^FL;tp*nzWXxNntu!DueY47VBX`t<2kdarZlTPjkq(< znMO@%#2v~hzN2x6e93oaHKkeYiR*6~cbJD1-|?Jj)Rd@eR)ZRGhc*=7eHm)agyK83 z*5o;p$Tg)|?P>WsKE$(c0n%yXu3wukF%R(m4fH13wpna24b>YCM{M%(ZcpFMT7)8{Qt$fNG|_GqTPy){E+2dOMR!?WED&t&j9Ej*ja3=XyuewIqe;s@(A zFjx-=T(TLk4ZeE-XSsN$OGxZ}IE&Q+3|n-Dn{y2BAoaApHNbF-&Twmv;Wo0FwzmZs zZr2&^$T8fB{kY(~xC-Wn&tDtt?0GfvHMc$cpLux33gc*ov9GFtXFa;_Bb(-g^x(U0 zyBo%fbc;f7R_Hqv8lJQk>2Fc!tqQ$Oq1zOCyF%|!=$#S`bHVxk71XH>b>6=igekmZ z%~0RI1ilr1%}9Pd2QtF1_-q)SBWf=>2AzOgE5Wb${1~2>JE@QHd7m0=gj~0;{bz#wmfeis0(>atD8HTj)Ah=lXKDN9J*<7RqIbtVjDLru*CFY( z?q}_-ir$ukjK4+F^GJGI53}~Iir%(+8GoCk*D2|>-N)M76us?VVf^iqo>$V_@c?Vz zq3G@WZN}dz>2+~ELjL&ub?vbA?0_C{PCX!e2}OySb{;bwx}C;=C&#dp$BdW9OxI_Q zfpa1T_I+e)m-dtnbcaHF6uMKPy$anW(R`1#NPDyy_NbbZUi_SMQh&yMq5o^zyi0Rr zXLW7Pt@Z2poar|;SGDxpvd3VJ)SP9(=OWN{P_#+;OSFxMHYt~hw!1`|l-ERCpJ$~yoqgi zAHdxCd+=F9R%|58rHcsvdJ?x*C>;Mw+17Vos&TqKxRA9+J9=&j(UPGY74YEVP+F@wXhj9v&q|+vWYq6mT^M97d8)=+2rj@*~FZ3i%qy^ z2%EiTHhKF}HZiB%ViWE=!lq_sleaHr6LZQfHsRhSY_2e~$=l6rikvbIdLpNc!FPO& z@Ao-R->B7W@cHrdjk;EbE8A@PBOvod=K$hN)9({vzh7j1h5gofY&!P)h1OZn?@BMM 
zIsL3M3!f3#>*MtdS8&?J5sxDAb(%9Aj3n?SJo@^#oB23i_<1|j!Swa0Fu-X6hbE`t zuaexU!QU#;4jmfNy1gFwr?qv);?on6WH1p*NAXMj!Bjk$j>4DzlZmNRa4a+(Or1z1 z;>k2fj2#b6OioWgdn6c+jGmYXPEL=+kD}ctfEvZ`j+@)!#D~3HS8(4n(@(51EXOj%^pCz7_F! zjle7B)GwVdyQT2_9s<*g>)RQQ{@yuCw~CWMjI z+kcqze@4IPU1$q){wE5bIvM8sNl!r*ub$%aui=L-=`}~Mt4Y8oG2!we=eywhi}LGR zod4SWGMA7moS!>f%>NV4Z#r1af1C6D2a5UcaDJ=8e^1CiSSad=gei^mKT^kN6+pH}$2od0!YJp-J7 zP2v9t`Bpm#j2GKK2z;~Om7D>-+3!k-XF8sa5Ks8YY4~jn@k~sg@I*tYDB=GjKb<6= zWIP;7hlnQ~IhCf;p{dC+U`JxfID~jO3;|9&WAQ2c5fQ=P*`!0GfKkw9R}4ukG#ZHo zz24rv#-+c{xEv~6?(Qyd2MU)3wc*`@+toK>Tn-y*UBi0>x4YlC?Cl;lxC35s*}vQ9 zcxcEF@b5JQ{Jwr6>-QSQ{9a>Z!xDG5F-ZUJA)z)fV01CiW2g=E_zD6>Z$sS%cc{xS zHZ;^LWQT@*rTj9d6dn9EPjKYnBSVMxv!qqL;g(6^w#CO`c#KS6;)0|2ds(7c+@A!E zx_HkMYT`a9a32@M`ylpZ=d`%JtpfVr^ZB^Y7xzJdtz2HT3w_R}Kg&YfqFvlK1!{`E z7{5R`N2JI7LSNjc1;)6(0GxyT+6DyjggzY*3Mju9Ll(XQ^#8(tcw0ezQAZfRKWk(% z#@}%hD#`%x_i3n$k1X&d@{jmCZwbbxk3{iOte=zf@wS6Jq5lTYUvKPX0!M4OAP1#; zf@vIi-eNH#^nal6g}y+%Z5V4O$v$=uBfNZIbwFzH7ULKD=c~XIV~0!Qb&=nv#Qa5r U=vUBDxG>Wv{Jxk~6cm#D50GU0_5c6? literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..0d4f08a2f2ff12d9aaa1ecb3035e80261bb200fc GIT binary patch literal 20496 zcmeG^e{7rAagP)wQ?g`Jwrtb3Vt!I&M|LEN`c0*dOv$pWD6*ociWX;SXiB6+)s)DR zv||UZ{E4v?FAEY!4Z7|Jmeg^atWBD%Yu9WoM0P094zLG1D~1GX01e2nVpxG-TZdpj z!ri?e)Z^19TPCyoG4wM&-@AKv_uhB+?%wfzq|P51=rx;69t&Yt0a+#{-bUcz?Mnl! 
z@lYYs+{6n1Zz83n2*@bwVcn}>iYpV)%K4Bi#^j_dpzma;fW*ddl!GY{vWwy5KsNm3 zUWN-O%i&xm5(n{9%y8@$U4(v8X+9 z!gwjrvYnALkfCG)z+Y`6L<87$tmo*^BZMTr)4_uuLg~xoM*PJuH>A*L8 zUU!BkCSrj&dCeJ{dMaT3w)3Qr-+I>kSsQw-09&fao{hs9tvgTZb)L%V^{XFbC%F+D8=AhwfEE1ZEpAP6R+kLEm0}i9SV_Bnn z?&Nt5PNA-wdh*uLm$!z2JJFzeZJoz;Z$;u!Z!9qFiv+{bz>5Ed*w}aKzef$k*7V*_fAZm3HZJE5$JF5VEWJLp(LC}W*B^-smU43_gB$TUCp!c!CRv#Q;MqAut6OCs?2p@47Z$Pb;7=+tRn9LTF?Oupxg2EGF4qgexg4Yn4O>D$iO zWF+bx51*QeM_zZv&Q6boLmwQ~t(T z;F;RyN|KczS4y_tBn}h2b(?>$^nOAv5UX~{YAvF49B~;)_4gv&MWI3FTuu} zhiBLWV_V>vEM0`WOiK18N=j-h8?_+L18ueko?Ry|FQJbQIf{}6WIkiV|4b*6y7|Pc zZXSc%5BJ0@+N&63$c@?MQkO+XSjOQd4mSg|!ZVT^r#a7Q&U1R1@tj^po=WP21z1OK zF6rPioNvNvttrR(cHn%i`zT&q9lTiSSlmKdZk9u!Zw0soUe>Pjb!`QRx+(you5AEO*LHxYYX?BoWe14sB4I75 zd4bNYRGTj?ttr}A5_}r)w6)Y`g~W?^qNJ(Do^~v1kc@PPQ%^e^bX|iL*x;h@u3a!E zh@~`nYY7w91=3QJ20y^^C%a*+UHCjO?@M;-OWkpowzv!FFbC}44X|ZyW3m(cwR=H( zp{3M1tGB#7XE|zxb*?AVR%^+p>GKa+Hd|*4!JZD7SRq+>&$7`>EE}ys3$!xlK zqLpm=@|?9a*s|&CUv1e`5<(r8GAr4zaTA%dmQcD0X@D01+)(J5MX160Zq*!%yB+V& zwamRd*Xkfix3+s;!!_ILVD^S*q4owy?G2FH8z8keKx%J*)ZPH8y#We6j!O>q{6M46 z5AgoEqGThi#a5jp8vrJ#O|LGYk1kpxr8Hh55r|ocL5K+r+PXOo>-h96v5hRd?<>|L zo1aUj&1kc9`o3cDp}Axh^YwijOZHg`o|?T>a;8S~e?jX10_~rL{&O`43p}&;7j(~d z7WB*>D(Ia(SkMP}Ki~s^4+1^}_%PrjfR6(HDBzC){y5-I%-SGUbh8KM)eUzK+`Vx3 z!QBt{0NjIc55YYQ_Xymha6bz7V{ktX_Y4=?SR)=EV^D;m?)}S(24Ew&3;1lKN+Ly+N8Iw&gh@F3%Bhd5(z7b3|O8BjWNL5x><{5trwPxI9P10(H(T+ee%0Yeas4hv? z>2>pTT@NL6?C;)txf1c+BZU<990&cDFLv z)4^cx{ok$hkg6Y4Ra=sSmTZ6Yb%8%{P4l&h_Uu0b_C%VrMoAZa9^zrpf_wuHa6ZK! 
zgU2{s@&`Fg<*gnDOVMt94};xs9K|2~k8rx=4|0Uc+dK?bp!~pb&Zqce@JpO7`GXv# z@^%k{m1w^{AG0eyXO=%8&tQLlfcpdTjmSkvADCitsy_y&v;EQc6=whbQw$EAWpMDT zEB^TBf^2^bVSHfxO8ywe_&}QM4;GUek4+jMBN!jZr}$$O;{)lkKUfUc@>r(vF@*7f ze2PDYF+Pwk`-8>2i^o2Vj}eRyAg=pvj1Ofm~!&2hUelp08+pcrZSYPw_`L#s|_RfAD-|^xu5 z`0%V1A0l6g_)zoJ8u8Jq#D|B+M>mg;9v&aPQhZb^{^-N_Q1cbWhjG5rc)p_XF@W)b ze2PB?F+Pwk`Ge=HTAr_HeDq;_AfMuoevA*KOa9>b%Ej{)jgJA059Cw)F^KVjbjcq) zU%7d{qVdtUR(yzjCE`QPS8K$_pb{T_JU;q)d<^jT7?k2;N!ibbFg`H0<$Q(lfwa~6 ziilbQ<7foq1NjtxjADEsUG@jdSDD%a^ZyXW2l6TY7{>TOy6g{@uQIg?%8y`tAfMuo zQH&3y%l=^bDpT8FJ{nppK199}@uB9cHR5AbiH{*3AHzI8MtFRTO7Za+_~Z5eN!+r= zGODGFYA*DNQH`b6QpjgibD>R*YAmB#%Bbc-d7~Ojt)-C9sOCbS+^WWs_YAq#LOG)v zOVn1_kG0l%Yph?JviEeg#xkm6o4RCA%7jA|^STFR*ALV2SaORc4l z}?yWOhBlKmmqS}12!W6A!&S_}QLo*L^##UE;o#Xsk0axG<4bD^D$YApV_C{s%z zpHa<)b~385_~)psTFR*ALV2Sai+}FQ)KbW2RCA$SZ&hQ-{*Y@elryTaWPf0-h5lGi zja9GsL#?rlYAK_d3+-%FW2v%~$KFu>y)e)EbND ztE^gT*r4V@I~&zl>NyVb8P!~9C!-omJ@-LAqnZopMm3gtPK10$H5c0TRyCIFk6{rX zP-~&w$Zg^SYb~0u)=^`bz#mv+;ko$l7tAM9aGu`oC`nG4=HYy`WLCU~!gm-i0xzBu zr>#x49yqVT^L#ufzkp}r%}qf(ySL$aLsL^Q(bQCvpuBF%i)a0o4tLOw?}6OzV8ZRL zA^hx&kfjfHYM`(jAW%s)z*>OK0PEl#CA??Bb38&q?^hB6dl`jQI)l|I2Ja^2lwRXu zaIenbz7&J^kZRgr>tS%e&ftL*gZGkV+F$2kuwG}-nPRX3=V6C)#3HZ*zW=GPviD|4 z*BsX5f2H9a56p89Jfp1u&d2qZ4=tJvO8AzWmP7D7h*v3iwSwQR;57=qSHbrw_&o|< ztKj<;{D6YrtKfACUa#Oz1#ggWum#!+&mAhtTz}OGCV09aLD$e_SVLm{B&79o5qQM< z!FO5k9!0xc;?~KO0?3_`XD0O9x5c4Xw)zuZPk%wH-xf3n4eH+y0dx zz3MQdR{%5zMv>e5AfxY<Ivt9_XD*D7-RzsTtOCAk(!?!Y6g|9~QQ?{6{sy^>t3Bv*Hw z_17tK^}o&N^^#nhBvt-04PHiXkmTCA93g-3{!SN!iWOiJEQWG`Er9O?*tRMc zxOgsT)U7mjn^Fum^IXuvb3v=VllHfz7;Im6EMF9{-1niyC9Q28@J0o1Qt)O4Z&C18 z1#eUEb_wUP^k?vySj5shW_$;houhyH8RzKsgyV(wr?myUR+604wRwluuH!qF|L?iG zs@;;D0b8m$+=TBMpl?$2N%>s#oe+IeZWn#WMW2-CMc=UKlXAZ3drBKmfTK50)8eOpDJw7-bH zQqd>vHKMOj^hx^;_TfH^d(S_?=P?m4ADRAd_MY3t$Zz(f^v&PwN#rx%lQ0imh4ueq z@|W=5`-e;SnLb@2xG%vzCF0|cpIb1yGt$ZX*V2hOc$H45L4?kYMml-Fkxr3=7eG$r;HTkxJDKmp z+0HGOY8H6Ea&Eb-hQUp>3~sKo=x2cU^Uj^b7N_68!*M^)x*o?ZYqID#?&n$GsK$QzD%Bhm20R45Q@ 
zh(*U6LZQ>s^+5S@U_4F{&E@y|ntlF>*0%Pxwt&m$8+Z9#&5iBN;{ktjduvCa5&i(l z_}KVIL_&H@R?7({G1+A=+w#Zyg~eIC|ys{_6AaMP$EKep3~n^_%Ct# znIqZ!f5_<{eKDJUmD61hX4C&v@E^^l|A^C5z1j4ia{8MJ{XI_q82;Xke0|92&-Kex zLVn5VXB7Isary~n@Ql!>oPKT~L!ymhx`4o*3cZ5U`xHI!b#J&f5bfb?JqI|wRH3(W z`ZdL#9!|H7Wb64E(#=-#Ng$j4Akd9|_i+*EM!)+Y&Y5sLK%D-wGw@Ru;+&j0dOS*;(Xiha_Yr42a3)T$HUY3;~#=Qi;4Tj00u#t zwHPEJ-&i2zb-7x5Gfj7UrrG7rG!L&dySg*f?#4{>uuC)>TQj_k?hJ2ZN2b}-*d=&d z+^wS7)O1)h+j@?OW>x_UFb?xyC9hQ^HEj>gO+-5r^S zcXwoF*3sA`#5z_UzTE#J0B?X#bH;hlRY@9|f%8@}l3$ zaRKlhX2X1^WA)onu7fuW= z5B-1TKNZLA2IWP4k@?-&E+&)tyKD?)%%Tmj6>gNpXTVG3CGmIEB8;4=PlWw!oZbl# zX+l0Y!8(FG|9_*HGjf1)m9JxfAWhi@iI9I=X6C#|5_(3VEo_H$aoLZumWxlU3{VYj nF@Ldso&cJdJ2W#_AK!Px^NS8KuE2xPFp__l%imN46qNigBU^uX literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..888bd5897b51c4e16e671e3d10a24761abe1596d GIT binary patch literal 22576 zcmeHPeQ+C9l7EtATUPAEPMpMXoM@7Af)i|6mcIi8*;axHP6%w^fMZc)OSVj8ODicM z*~&U&5t}SWk*xP>zwU!<2!UH*m%C$kx!WyEOHsA=2f0ePtvV={s$3OUw}0GLQTJ6_ zcT{xU?_>1zJR-5gQE;`Bn*65wb@#mK>FJ)H8Tr(nk-b)n#cLyMDI?d2g>NHx`Sxpv z*v7r($m=6^_HD`50p)xgrjp@6QVSo6!^c;`r|W=VTC%?`HQWc6%S_;r^Dp;J&Co zal*JL_|aO%7ASmQ3GDY_OamO)@4M^Z1B9f07?=u%69LzeP<%QRNqpm(7hHk(MB>+E zWd4V)z*zEVAmR^ZWHJOJ!C)MC{zNP=7P>SMiH-&$FSw=yQ=#ws zUUWsr#}lC>dBK&K{A$Slvg=qNax~;WGC3W5m5KUCkB*Or;&V*p`B)$*l!?8KJ6I}h#6`oJGW>tme{nRF zG#qfS2m`VM4wb0?P>K4F7NP%AA{n0yhJ0=j1&50;B0C{iqJFqUeaM-ZB9|4#!U-M1 z39%xK$WBO>sDHdf{gXxLb0-WLk?^)&gcaEfZ=WvJz?o7FK7!tR`MM2 z$E6zhPN@d|vpOc7S{9Pw7E2L7&818)~$KsX{75m$<^lIMtDm1^MKQVsmN2m=@Q2PdaQ z*?jw25f<{C@L{P2ep9M}e=Wkm#RIYvJ}SaOo)i9~R0F>&)xgI^7`S+kQiKWF z3zsdnk}cRvwoqM!g-eN8WHNa~TojirwM7`ocSU{478**n&{TwlA513v(W6QKXh%=Z z9a<5F^4-x~vW1qCEwnDjLUB)3E)Grx5;?!Q*&dho@aM9n&qVD;ZT(B|Fv>rgw|DPy zj@RTV)MbmWWDCP3TNqi61@)}m``#u|2VX|44wvy=jnX38?X+afQ#qxoR9Ki#Ygd8c;0M*hswws{>T28pE~$rstmSq 
z6_28EEN>~|s7Vnz;HkXb<`c@S>hecr>P{d|l~t3|a8TkozdZg$+_sS)FIuve`D_-> zVTCnw4GuyF{0J}uNKa^`HFF+lFJJ~x2h;&cxi#Yj@B$u0+ES6x5Umv%H)2^urWVmw zkwG9e)=Vw9TL;w8?NuNL=v@VNkakpL4gx++tlHC7o0hfNwr9cDS*w*HAr$9Hg_eeG zEBV%98Ts~NS-E@WROXrV6|%5+g**%E)3DCNdTB9BzPX4Im?dX9{t?HgIKD`+E?Y-^ zP?N2pSe31!Se`AXxGlSlBFH_3Z(2Z(BFIq$If@`hu`%t`o%7DK>z%t7cCXv5?@sN0 zwp*`ATgmD4uU1(yyWw!NfF}v(0eqI(cpp&+(nkigX#7Gko~p*;HD$hG%r^v1=&PM9;xI#L+8@I-E0{ zZ^~|OuEY8E<9zMgDP6TSe5T2{u$pvTt%F3d25>cCJ>VL^wSe`2>j2jRt_NHP*Z{a5 zX*_2Q5Hn5MLLKnxz!zw1H6Yqr1BkZj0nyf4K(w_E5N)joL|YAj7#AtKqxmeITeUV< zUEN%6F9M+>muHXpqcwr%TVe+I3xn2yAy#+|&qjf@>*DUd=o9 zMm+Z0a>w3S)fGOo$+@r*_TLD{(ge5>a1&q?;AX&0fKI^8fEu6^a0{RYNaGcceUrA( z2)sr(2DG&i5N$O9qODDUXlpYd+HwM-Ee#NDZ2|0>U6H;U;$!2ycDAe9KBIR%H*34g z4%d~Q&f4vc>)BKH+E&?T%HbHC;D~ZE|Bh{imDpC;!#3E~C=ZTr5^OtGK0a%&4tK5m z*4MgLIwEMpR%0iXD^`+OyMyvA$OF0zu(I4cgQ&sv)UAO(oNvr_%|18V?Ih_wZR4DV z*GRXMonJU^dVT@v`30os7m%J`Kze=w>G=hu=NC}ep}qo|`U;Tx3Xu8=kopRc`U;Tx z3Xu8=@aE&hN8<$2C~^9HVG%P~XNRr_AYM8=EN(oV6z`a=NO!>X*R7LuJ79{&(WOP) z-x>SyYMLXCABQ{zc?a@<2HU#zBwV{QGo<#=wcB@B>9JMMq_b8$Z`thbDu`RFV+L!~ z?iG&hwzAaBIfvdX#_yEI@1*1NFn+q(TQ)p1P_}Pouxw;zsO-Rux9lL$_XB+h=m&v* z2p@uWhxHJw55oEotdGF@D6Egc zdKlJGSc9;R&*-j`I;p+yS{<>#^#$7&NFqS57Bu)2={17J8z!LZ1#M&WT0!H-4=}z? 
z(D<;{4TWvO7FE3A3w9e~#2b)JDvFcge=JVJnXg-f7LGyWR5;UL3W?6C0&#y$tI~1*IFj44d@8 z_4l`edWNB|A!SRiDLb8xdD(uO$v^#r*Rs$gU&qKBEMI6+uV-DOwjupfY(_a>6_=CF zBl7pwaK3at^{rH|CBv|_1@~Qdf$c{+&&VHK%lXpz)|;umJHxOG<$N2soD>Jh-@A$P zrFhV{Q~kaS!`oYN|6Bg5DZ9l<q-&Q9P+t-+LmSGC+|A{bSU9ks^QBlI zLsZ}HWmt{pZ1}zlY(GjY?0bOorC1<)sD7=NVLj@PJj~^kSUB)y&X;0=+(q>pybPQ0 z{0s-+{HU>jxsB%yl-U;o8RSX%L&So9fnRG3eJDRN$>o(;I55TeN-PY2mIj>_c)lv84X<^|--v0$a= zqJ`(QTwXvqB^GF2K)xIcEcdy24$S2Rlv84X<^|--vA}X=AJ3VE^8)5|DHdp6K%QAF zuzWkr^KLFLpu7?bG%p}ui3OS$kgvo7%?rp^Vu9ub(2=SfF_U z`BE(K8qvUOMB%)E^+1XRnioE~MtFIR$mIo;S7L$Y1>`HSK=T6fl~|y80r^TS(7b?r zB^GF2K>jjfp;3tinirHBLGyxnjnH_F$mIo;Q(}SU1>{Syz-vScuMxStfO1MK(7b?r zDHeE*aPu0G%L^!{!~)F=$d_V)*N8q|BMRpQrAE-a@X0k|IKyg0E-#?G5(_jhAYX|E znir6-!~)F=$X8;4<^|*{u|V?z@|O_{-&XD&>HPxcQ>jMK`vv4Jtr0}r#~SwwD5t~% z%?rqvV}aEOH4 z>?O>42D9D*dqy@bt1pI4am z45mE>&gl!(OPsTs+%sTK<+;JEXE5t6jNE{_Qf^Rt29z`FE$|$f^%7=1gIRBZ`ewa^ z+B2Y>S#MD=H(;(6xq-)mS=K)pnb5({cC!Jlhra?fDaTNrr(^RXNYO3z^A z1(cKO2A(6D7mzR23I5z9uV*mpEl{841=N@71b@zA^b9Cx)>{;jnir5S#e!MS zVAfmUIi`65^`%%)dj^y<>n#fA1-u_uVu9C)yq>|d$G|yZUO*d4EPNh&iAp6F)Lw$u zh`gS`thX@of_aTFdIlpepqvs5G%p}uiUqTt!K}A1@&f8hv7q)0MqWTU>3*_cUMO56 z@_Gi-9z)~>w4ubp=dqVK4e!q6=g?{|!D|GOdj_-K!pIBeHNxl_jJ#k{BWPYQuMv4Y zgIRB3^N!52=C zC!F$lnp0HPM`iIF0oxvU$BW;Oz`Nd*&(}-{dly2;;(Hr_B6WZ*fLj4u0XqTVgCBfH zgYRkxiM+d&kY+E#Z92p48HTrzIy&CsWw=9UxHH4>Rg-Lc0{YU7_6y-J#GPh3=GS@CEv79{Sb# z8uy#K!39s2rDzOYfEW_-lak`+49JN1!Ea#TdkF1s2kL(3KqI+@t7qPB4C_q<0(F zBjkJUZs>wb#SYj5!B7X-2lNKOT}yI77taOVx}9FTJsF0*JQwuwT+pv?pyRtT3~#&X zYx#`0mWMyEbxE``ZkH?*V11@eOky_m*B;dK!GG=5PzXzlE_0F(&16F?LvtNx5B& zJuJqgJTJ!X7h_V+7i0H`F)9CxF`pQdYJnKLLySpvL5%f?F{ws~G5o%Y9=lX8#8`_M zlWK<;+akuKIwHmz#h6r6#Ml}!Ce;@)246 zvWYo(iA|{6!sb3Rn|!>GP0YbdY(ni9HoMJi^6^48F$XWP3HJ}ere}Q=j zNo|sTw~zb%Ebr@azcoEJ9rycL-rwVXSN5^}ex~>Nu-{d?;9S#Z0JG3jv*n}p3|DZv zfg`?7#pNVtI2cLb<9WLL=k;t9H~RQE^nG-BR2bm2fRV{*_$y?)H2B+O+Q^7c^SE8` zPitw9M5iZ0aepk34C9aZ{E4VP8HOME#bZ+m|5#w!pEw$eMdL}37&{V}n4F$~@sK|l 
z8a+DUpPU|#`p2Vje;^X^M-%>7JUTuZ2_@PS@v-(ua`b*CLP~pGB`7MtLNroSA{#*Cw^MAwnuPOY0=lt(Hm@j{w z^It!d&#$5Z1o5Eo>p5Rn?7+{&V5uYzu@h$)+sXMph2PEjHpQP_&L39zKS93LPTc(O zD2g521AMdJk)8s++3!e+YdV??5m)fUH2fBpxF)8Ly261(nDGCRpNtb%JQ@rn1H_dK z9Zyo}z|`azutSk}6fW^#5H4`y8jDWhkA?~Ut~MDM1q_2W+hRx}fzeRJ?{;_ZH8%Tt zjm?2RV{<6CIWTB&2Rn?-A-CA<=rxRW&Y?x4rm+rWTNSQyA1@u0_;{<~Sy5}{TUV%XxV1$_NO7qP)n{C!L@Eb5b>Q5S2TP!n}f z;4l}&I*9wSb6S+QRY3n&HX9ZCq7Djd=JH}(=yNvx88+G$)zla|!J7VszX)HRgFXH&cfY`5~!?0nd{}I1MOr(rhmggT$yr&TZ(yWAQqE|!gG(S}BcsXAY_v2o zTA*x)*J?e&bBk|+ju*W+x5{SjV9}Rf@p2wnN@c>Bt{wVM~u}SYp;PPlV;`4@Ic1(E31K;a> z#SsYxV}Uq%*%2H2M!@=-GcaEtAWw0zEd9R zLU@Xwnhd-&*05W>#6PE(7?}v4y5SXeh{nnd8oTKgRvH-nuCK67T%l&r*iEmn#=z)z zeTD7f3Y`Xx-Si4w21ZL>A)h5Ld&83z2vo9k@o$jpZKF6Q7R zYapJ@*THx4b@1IB9Pk;@wlE`J$iYk2KwQe#!K?W?_(2X1#6T>}h`-3eOV&WVk*|X{ z^L6lc4i5N0v@gtvR1RLU2I6P=I(Ro<2k+(JKn%pfjJTGAm#l&KdA<%l$k)L~IXJj< z*grNdw$1nEbMTNg2*1qN!6*4T_~#rPTsk5T!e=>n$Qp!S=j-4%`8xP*4h}BeCk?_? z6Uo7aJPKD$mb^V!^Y&1dgNMtp$?#bGm`IANrt%z|WDi9}-X1FR_E43BhwqQYJdxvZ zkFUOYVK}rL9AyuOJ#P;+d3&h66%V=HRk_qY=8Y|ko0GM8Yd3$cnp_5EH~U)uD%_0n z__A7e-%5WC?m}HPb>{7%D{l{nZpDKdwJ&(P?)Axd)DsJgdnZGYXka0J-K_0fjbFck z*^=@58^>rga(oh+9~}=C{WlL5a6zXJU|-kAPtdQ%eaMrIVe;jz&P%a0fF(d4RW z_)W*S_oOEn_QpMrMWV;Jb1H)c9&b*5xX&|iq{$1nFr$qwU$DM;G%!By2~IZbM#2-H z@VJ~On;Sj8SO{)Vh68Ya6IVyiNI2k~IQW)hGCFp`8wd8HD<*F^qJgi&{n4z3zYSZ9 z$@H2dHaZ#gj6{x4#3x^I#7>R-BH<4Yh~}b{6S1I(4Tq&Dg(O*SPT@q*0W}=963_#5uLM6xmlmbE0WT1lc+Y-UIZ#dD-cOTxLCJU3TB{&23K&^dJ`^M5YoX>$eQ znY4M`RQe1dTQ=BcZP2IY>C&VP?8?ZR>%?Y)r*1Rvm+dFyEU{`At=2NjhRV?Tv~3n+ zVA}#Vm6f4&DZsPrJXOi?B9(0n*)~>%Y*kxAwk_I%U8QXrZP%r=S?#PgYtwAg2~ush zRab}XBw^V0`5U#JU~Rr|t8FJ(+cVI&EpQzc=&PlTkmpJ1_C#r^-LkAYgx7>)Sq)>N zljr9!_6KZ5$pSKy8QcF%Cz853G^Lv#g(D2d#1xL>GO!`nrsm5UEjr?Ijw?8>*cIbWhBqZWWEyIhb!hnA0H+c>M{h)xH|9e-B>2bvva??4k2j zw%IkL@%n055GnxI0ImhB09*&S7H~b_I>1W6^?(}yD*-nGZUEc_xDhZD_ez*=Rod)o z;ID@9LjP+3(SHRX`dRuy`Y#D0SH@VGe4gkl!|ko_5sfdJW9xIw!@=8(}^`UZ%)3Q>dnXwq!6n{01RH%>o4-DD$4 zm$qp}!};E1W8)8fq2muo#~+Z6KlTy&f^nte4@k!!kd8kf9e+SN{(!=-?V^p12k_{4 
z0PdeIN;biKZqiAz0Wd+w`0^aa?7a1K8O^=Zr(rFCoQAy8;9NKBkPC?^QrMqeI zS3a3cn{f=&X?F?4)Lc46_kv}m+bsp>rY@GAwF`NhByXeg3m`vlZ!d69briHubry6^ z9W1zOs=c5K=zD=a1oRQ0yMgWjx)!uO=p# zn{aM|Bn5hnpkW>(T_I>JP=H=5Xqcx+uM;%BgaGtO5~aH?=z5*h2RxVifVA8Pq~$&! zE%yOwxerLoeL!071JZIIkiJnL`gtD)l|DGZ4#t7T0ng<=AT9R+X}J$b%Y8sv?gP?t zACQ*&fVA8Pq?hQ!Ch{Vz+1&FPo_`a|6~%XlVnq+?Fk#-%PW?*(Y>6I&;PLs zLh5E{A!bTj-3-f!h0^72h83iU(v@z8Rm4hZ&CSqWnXn|Uo6jas6V|3ie(?|8rbLxI zkCE3?n5a_cvtC1e)ub5KqF!$a*OSH<`F+bdUm9n9C$)E`7&h9m?Nzg^9clcL-?NtU zr8wxd)ZUe1xDWMuH*h^EKFIIe%=uEBbO*I}rx)io*TWxf{*{VyEwR-Ig&9&hKY*TO@yI!n9dW%1i$;Vo3xBd#XOF?YWZkN7m z2KhZrSE@vABHg>^N)=+?{wq~Zv}=2ZV$VT_z4tKeJN! z5a4_%zT_clU+!jDf%b!ATu+Jb&^YHy@g)ya`${*%Djfg*uS0yWeI>pF$2nh$FF8u> zH8(>$+7F%rJ)`)-T8i?aZ}RxUnkv>?j935k6~p)r%w)%R@LMdtL%+}B+y4xU=fEE@ z9Q-4OLw~vu-z%2v_zq+K;+iAH*N6FwJUPB>4XNR43C&+W<}d0g@eN}BB43U#TXURz z?V5ozsQ&4%ho6tU#n>T`Z0e|Pl<03^B4JYeA$}j=4%_x-(k#O)KlW? z!~8|Q9ACBucJZ~4=C6OL{Dpl6;~NzD3;T^6-{B?l*QeyKpXYCo=kGAjUmwq3KhNKw zl)rnG_;zFdN_7V2We?`BQg86}neg?Q=5H_NFX}1r?Zf;gA-m&o59C4YN){`T?w?dJL0!}GV7=Wm~szwZ=e=Wjpeuev^C z{u-~(HGF-h`8$aDi+W0YhcJJUFU6Oy&rZHR)BNqn{6#$_z5|%Q$d}^F*Jl@BpK1OM zV*a9@65k=rU*t>i3k>M8N{Vg4dtj;~TnqP-vU7xk3*1~Gq;FUMD@J#qaU#{5M+ zCB8n)U*ya2W$SaMRz-XNQu!;^XOX|^`n*K``jq_j^ZX6+{2k``>*M+B=lL6y^7nnH zpX>fRal;zgs8%+rsWHYzHMCkQqn=SsjWIH+p^a)~qnaA+jcRDMRz^Leni^wdR6`rp z%0@Lc+TW;#7JDx0W!2Kyu0amVwKLYzxz^A}mG4Mu4Q*5_8`abpH=`O_t(8&FsHVno zH>#nHYGtFE8tsj0Xth>GJ)@c$$K9xgHma44YHGB;M$b`5e^jxW~I7~foK z=wFz!_h7Y#=HI)sYGtFE8slbEL-X(HnOYh3jB08eccU7bf3MG~m5pj@v^T1u`F(&) zt&Dm`H8sY^sD|eE4YF!wqnaA+Z&X9e@y)8GvE7^GFV@l+-&|_wF(tlg4Q*5_8`abp zH=`O_t(8&FsHVnoH>#nHYGtFE8tsj0Xth>GJ)@c$<6~4q8`a82H8tAbsD_r~+b8y7 ztfjHto8&Ln(sX^!p@#ky+?ST`VW~AVU!SvTKWD4IPOL@v{9{W zR8yn9Q4Ouu%BW{lQ)7ILYG|Wc*{G&Q`y18Ja(sv6S{mEEN&aFjP1olfYUoo+eAOD7 zug_Vvvd^HV#<&^P(CU3u)HAB7aomk+X!X7;>KWD4$TzB?)%&!lXH-*Te2i*n^}a6Z z8P(Ltzflb>$2TbQ7i(#3_a^y^wKVGGQbU^{zF0%!v$t-`OcI~j;dAB|TWRuj(+oVP 
zDV-9}&hh!`YoL$MpVC%Wc_--Mb3=R{cNU+$?Qw2Yac#oct@CPYJ|!`sLzD!02CZqpfVPcgimR8x74o8bve{^Qw$sMIxFEBLJ^D?KEtlCvUdWIui320|4PGiYUsxn7>iW}@O(~h{Me$E z!`H*c>y`u1U!=DxbhSd?rqFhU-lovo75a9Cu2JY63cXXI?@;Jkh2Evm4u!5$Xs1Hg zEA(!KZjfjgOC0ZZxMx?f-1$x`4B-Jwg3i?!VXlgKnvmw{dC(E_6rbP2b7Aep($;do z?S(K;@fkn715r-rEI$8L=IlPwbxmtC!&mC`b!~l-*+R&5>)L-M$luz*_*;PwU+pQs zk$if+u;eK!uRh4+)r#G1cQgKNl3kNzXTOKZ?TX#D`xt+lWY;X&Z9mN9+ZDUp?`Qnm zCA${MuI2$IuTkuFJjnPvBs-U6xAP$;->KN$@i61xA=&Mb>}roPd97l%>uZd^OS0Q5 z**P9za))A9=VAOh$?i_c&go@xr(#z>!ua))-9E`~cYw)vD|QW|jNc&H?dNuc{P71> zjgZ<_z-E|ys{mal1n0ZT{ccyKk1eU+wAX>tF5Z*|psN zr#&6l-)oSEKP7()??1gYx7YOf9Kn4Z_IZ(~?}A(h`A-exXeapMy9)Yz%g86x3c}}K z8TsV$rF>pC@(J~Z@cCy(KDm4;pU)WigqlV8{FaeVE?>$g?pY|y8c(QugwMEgc?rx#P?Sh%_S~h$|vqw zi+n;ID12&0KDm4;pSWi&@(H!1@QLq@FS<@HH}Wa=ES#fa&w2^oYt6iaUVeJMOtZkV z=F{`b?F?7cFkD&7u-s|Up97mOd(R=|ar(|Xw)QFtbNc<>2CqNZ)ZEhC9B_KQBTm1w zp}wVIB;apoY1$L0hd)R*;v3l^u6i-zYas+)t~h;1wyccvYw+Hr^jS~Oi^-b`znb&A zc1g-`%E)5!GezIU^*>Vh9h|Qze%PPAD<*H(W&1zM^}hr6PUVlE^FLAeG|153t_E3~ zkZ*APU*e55=`%ymi^&U${zcBOxmQw#(?8+-$**PeFLVB{yR-R!$@%1PHvetT|L%R+ z{J-P;^9uic&j0Nr+4>)IzEf%U6V9(v`2WWFZ}(>V`JD4ZJ=y#cI)REwufnh3{7Z@- z`1u!niixE^+s{tU|LoyxeiP^SE92=F{uTba$TwSw`1^>$+kL<{`W@(b;2Zr8lsG0L z@c?o7Pffs2TZvEuVXrR`_BfqQcV$jpZJE=ygPGIL-I>$&`i0Y$%xQ;9 zoYpsIoYl8AXSnSdzq^|<&YBwb2!-9v84oUJMz^D>Ln!QN?-Zx54rk`HIdcuJ=8WHt zyE50X$F<$x?#lGIy(QzUqbbw3j^;gi{eU(vWBm24 zr}OBMw!`R*v(*b|0zTT}>y!8q z>yr4ps*{XO$B4=~UCt4=BOFsBd{Kfnk0-m^bIL&CVzBiGHu%FbaznC zxgSqg*UurA(oD7!F^}`kIrp4<-+8}x&O1-fI@P&vkJV!F*a-XNkn6<4_mOz`exi@< zd_5OsZeoZ3my<%02V%7KF!el|;LZfK^KmE^b8=A{@su)FP*TKrw1XuOvYzo|TRPpj zlktMecDPp0A*T0Cq|-6Rl0-h=;qzI6{VaCEUL4=Se~tNRTt4G`BIXCc4(D6!2j`;k z#0BF=p&zedVnIdhmw@;^&}l$h_ji1)`yN83f9M^t*Ar*9xO?hAQ+0|S9*^!t&3&+mOOG8_wl#_x}S%o`o^4FoO^g~I*5(DTkw-$>wl z9WOY;gM-mPj6Cm*4nG{QzvP_og~kKkJU3idt1j^`=@P@Ep%b6F!geuPk;!B?U15od(dWIwRSkLB?4}miAp)+V9K2{!)J+ zW(1%+0|#;dda|_d&CH; zjAh_N4nizT`-if$Kbe6&4???vgg5mJyvR{_^K`Zj&SvWXYHy}!XgARC=7kKrqy^%; 
z**bVCTL*uZfdh_+))XS1&A>}qATDL=;Kghm{2&7dA`mG=ypn;Jv_SkMTL-UZ>);m| zIN*V3OCchWftR#E{3=@qzs}adn;AF|fk+|ZY6f1?0`YdX4&KSu!TT9FxU}CtJR-{G zo7Xe&kQRi0$=1O~**f^Q3>;j#OAf-v8F)ww!YA1}`0s2T{4N6rm+q8;aK%D0a3M$G zip7?-2Yc2Y3N!FOK7?5WD7wqaj16*m`~ad8iSu2|eAW;bi=UxbHI-u|@NJ&QSB zlc!KuEFD>U=*rr|zQuS@&)R2vT=()=EaHs@Mtozza3qjAf8A{E#h$-j6SD=+-`_Zg zBH{5dn0{y^FdFj?4nGv|d$9%jM|?2-Gx|^xo<oBzSx{`EH%}$XyLSc!U|LYwjPcX&efSjCcpf>S~ek#zwtv*F%l< z@K_U*qIVz^@Qrr9?i`B@PxxX1V;!l5lAkyufd}BRXxe<=fr?_WyyT1yjYYfz;qlSf z*bC0+iIM(r=-tK)R^UADhv!^|vny8eBj z!D;`aYE=$A4HnPSxwZl5N_+w?p2u@Jju#s~ioamd;D09Dv*F2}zy0k$Y{>y0HykG( zN5gLFlfhAw3~V4$sX6EE;p(K%N@-u&-3|PyoI-LM4oW=dSBCuFu{QG4IZM)VCYglq zM)ItQ>u>}*(2s#8fYOt<#F}^(cn{D7P#vfal;m0y9v~i|-N;+=5*ku#Uc!YmCofTi z)Rvb(LY7$*MG$TssD@$BhjBnXE5Hx(1$l{XppOx&_L$YCC2h7_lMw5q)ygO#l%64Z zS{(MRKx?(p!?ZPzvK7#cx_*9HlUhQW!@mjH7gY+@U+pIL@zgw9K}w zZP8n%TOM!F^Ws)=I{x+wOQHo1H%XqGv*zS2Nm^G;0+&o$i;vTaug>7kh5Z8pUEQgrrul`AbsSqp&(#^g+Yjb+I1jk~X}i5*6|TP-*Kfa-^7$3Pvt^Fi)ujH~Do7~B zKvx4@1GE_ETA*uyt^>LjXbI4DKudv^09_BX6zB$^>wy~Qy#(-9rp>Mb`6`GP`dOnB4^PZ-nD>0Nn&s1L^>}8K?%d9O!1CTY#1WtpK_OD8)U-Vxu;@5o813m}nl3dRuQVp)h2F)?M!SU);{nRs44Gt1_Ya3=XyuKP7 z?0my<)AJ3Ko^PP^d}BP|SP)0_d;_KD8z?>BKB;#@pw z#q*d9vM}a>Ed_VAh;0J*Z0)Id7A>c=W9|JxDd<^)B$zsSgy0shDc@VZ9 z*bc$g3)>Oc`d~W>+cDVsVe`W_2wM=g`(X>gHVWGqY!TREuuV+r&XYPRI&ggzvB32R z`xZz{z^@iO#2ooz!Q;&u@M{EbWBgjd zc~3K!OSnb(p(xrf>pc^EB5`@Ug5PQljE5iXaoMee2cB_2l0@Vxgv z@CKV-!Pb#+E|;)H4pV#0!)OKC_nm-oQrs!FFhBF$hGUL?6ULxS%5ef)`fL2!XS4?~ z(L41P8?RvN$PAY&*y{Thmn+yh`iESuV5|3u51D@jTSxwg%N1<({RzmWV-(mr`seep z^-*p*wrJkMIv`<-<}H-T*kZY~lIPS^-okMTwrJi$xr{BAdtE#Sr}7q#Q?Nzz7RqI8 zv0UxuIXjiNaGZiInzv9cV~gc>56|(byoKWwY|*@hav58!7Ig8NuyEeOS|(wO<}H+& zVT;w1US3yHc?-uY*rIt0ng7ye4YACN7+}qutoD0%4KXReTi|uh2s=# z(Y%Fn8CyypW881yI0aiYZ=qbqmeTha_ggql!4}P1D3`ItYNF96EqK3$bw#R)^nMFv z3)e)W*D~(6aJ+&onzvA{V2kE0lq=Yxc?;zVwrJi$xq>a4w@|KNYcYB29Q29fznlKl zUeT;4H0vD^UuM0c+7se9v)&Q$Vb&{}^@L`+pLGdr{hh@mZ((1=@43u+LYlX5 zjC2oXl`TWU{;&w(D%Gi?gR<(jHnzyj7acuFL 
znAQ`T^^Qi~GOvk7PiW*V9H(H5<}H*<*fQ%0&3Z>8Z=t<}Ewv{!@)nMh?(gyZ(!7Oo z30r18p;_;U_NI9Y>xzOcUK5R;5N*wRNF#5d4+UFare0CTmYlbqg744e?|sx>k=H~b z_k?D>qmj3;rbu~9{jSQ$TR2X^7R_5Im$7Bi6PopoM&3eu8CyzEXyh#%r(lcbEtJdH zGU*A;dPl^EY2I46CK^2<+M4x{M&2^1iC?B(QO1^>w=94y>=p4Hnr_=nJPGe<;rnS# zj)M5hmKk{0pkEujVwqL(}Z0?}ZdWUU#_N!D+X< zVw#TKOUL57NVYrS-4T4>4c;M{-n+Mg5cYj0A#?ARRf5BnK&yb31I3pU>wwk+?EvbA z_d4PGSNtB9kkC7~5CZpsly21--IieVHnNh+t2~Tu*BNyt7_BDdRPXXITB9>sn_#q# z)KPuChtUR|(Z&R$O{9bB-5y4pbw+n27`+|WSpwe?=Rv&i-M3;pdzTR9n!_IdZ4%yZ zgX7o)u~?Y{@3`poAJ{63;IgW}X4?(Ni~LrF-=^@lDSVZ}Z&!Gy!dEN2OW|u2zE|8#kcPVx?1592c*?A>D`K5bCk)OCA$vEZpS#&?@;V+e~`&< zm+U&Z9U)J@Q??T>Z97l|=t`g-;7ftF0bdRjlB$y9cJdszOSjXU(~@9xH_ve%p5t2e zQmSuDFxt*@TnEo_ou4@;ofSE$>jT?P=^E34-=**^3cp+7Jqq8d@NEj;uJ9cS-zo9@ zn(u_?o#L8r%fa{a(realKIfX%H0?Ow^oVw*L@S6->Dr7#Ytr$3p|5Hg!_DRwpPTIi(uEOdoJC)H}9%N07QZlezGgRpkr4bT3>b=+B(zf z)%5gxx&Kc!9rN#1xDJ0wUW0dBU!U7yxiLquu0x#{*Xgf8uY>-VCVI3Ja?z`x&)3a- z!o7m<`PXJXxqcy^m(6^_{f6-Q7iK=Wej%SvnE8Zz7UA<-WYP?r`@xbG1@ zV`e_Nej%S&v*!7Pdne)3Yvz;d7xIZUYo1TI{}Mj;n)&4Vg?wVon&%Vl;e=288<2Te z;`)VrV$GW86Yc|rPtD9H*DvG~Yt}rUa4#u*;@_LiTPN3>`4lw^F)C`-bMUtz#^0_L zoxEOHX@h5$C$BH7VsyES(G~TK7P)QulVI~j*Bnw5qkk)h^L~;2B+k36#irxDUu0j2 z^Dg(WdB4d166amf#^(J(`%IX(r~@!YpRG;7b0$I`w<7&`9sWFKN zE+}9ufj@wwpZ~m(4C9WQ>)}}k{qzY3oEK!@@F@H$=?)Ekt5n;!uS2VKIpLpHSsemM5H8uuZ zKHq@L@2ab5sv8LS>zW#x12ynNrUU&0Tg9s9Q~ZjGz(eShx25X}xxBI}UCzS0guJTA z%lY`O?djtgyM+8o8Sm!f-&f@AT&^j8?&9)Zd}baJhGX}KR8nd<68tZLuGuGSTHlov!icV)^y_l`DsVS3R=E%eb zf6c?&army*{dcgWC7_lOMcCq3f%vs0{fHe-#qVVcwWv>mM_a6U!c5dbLA&@stb;h0 zo%5o+tpNVNl1X@)%s!$H3R=PCLNDyOnEosa`WAXoHwD!cd$E2&9iY#V>_wdxG{o%% z!DEm=e*g^1#QbzYIG_~&Bg8}hPyMIz)u2Lq{Mwd&wsE?iQR8>qxtI;=VXjMzzIkKM}{7hl*iU^Upp`UQTDf=sL(c8$*kUKhmi3x$|h@IlxyvnStRBjSpI H!jk_5-x5jv literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co new file mode 100755 index 
0000000000000000000000000000000000000000..ad1c2180b7a4360ff368fb1e928d8f7131620aa8 GIT binary patch literal 24592 zcmeHPdvKe_aX)|{1yQ6#SuaSU1V{tPv`j&u_?9IKq(GV0%e3o<>?#HU0w73%AOe7} zfP*r$Tb5)&^YczF5zF;=*{ z5_uiO3jZ%9HKYpIsOw?kd6?tI1X_7LWQ#UAs|bv>j1)-f7>#;x3xup=G}%^+zrKUf z0%bkC*WcXF31CuC1Rmy^64ioy8MZuSh7P5ZbmG(!+L%!p~vEa+BsxL4e4u=v`Oy$dQe^4k{HA*i9PWq?` zVJUSo9{Tcd^G@{^ze;a$EEYX^%{$yD8mrSZcHKL)X_RjE9d?U5bZ8p8?j1I0ly3GN z_J})lYZ|-m9d>Ax7Q935B^UkCc*HN7^)L$-;S%Uv*Rvq|nYT>+zB2U>m7sqi5K0vW zpuYqI@&MddrvA|~^^cXH|57rQ7!HQKZs7$3B^Z$hAy}qJio zmtaI5gjAXOC(6`6Rf0Yrgnb21c*`ikiaZK$ohjGAxpED_+FPnO>??S~Tjxu#QZx{s zE7!m?aTOZ`l_5%C>OmMl7ha_5qLUUXG^{zGP_BACHVBLUZ%i&)UAx`8%pnTX6oq z>l#XojmM$+q0vw* z_`*?t%J=A4;y5=>Wf0);aPqy~zK4&r`r!yO(b5qJH?<9gMn`?&c=Jvqe5shPv+YD% zi!YFjzyW161m~L=9esn*kU!S@Jy$$2oc5=HJ@1amE3QQ7<8VG&)bM9vX)&3;?n(~D z6TZQ*@mMPU6<6})XkaY*-rjO<#dSOgCtn4tm(1i_u1EdpP&hFL1Bv6|5huIpP$D@z z7Mm8w&ipiZK467gD&Uk@oU`*PU4YBsWZc8g={=l}^~Hrp^j6})lju6+ojN&Q``%Z7 z^KaoX5lsrz+V970O=$xGUuKL+5?yaGyn}iQfbb406c*GNSms1 z4n%WR&W%`6#fHODl|z!$nsev~2A~6nZ86jX^eh8ANLN?o`T-v&X2;`Ziz9EbY|X=1 z<;`Y>git(7svKD;o5|;AD#)MBR8+brPUoJ;UMABsm&ucGJr37dxL%yelh4kem*vS* z9RGylr#OCv;`01*ItI1*T8fMFiz!y-D=BWtZ=neFK84@3Ks}03k0R8g2=yqg%i0b5 zS^HCK>|N7ctGkS@OxKgGMpf2K&Sc+NX3BLzck|@SGvlJV&axW~!^5mL<*z-WQJPRu~(D zJU4?ru-9Idtsql{G5&2nlQqmICJggqaGio{Vgkpr7IetviP@Un76Y-C<5G^xIM#7o z4p;+YKrW}bPMYhaXPHiVmg%HtnNE5Zb?WIHo&`Vr#*6{;gZ;@^ts{w6*YXH{*+5p!9t_55VxDK!$a6RA#zc}U7&Rc8htngeTovH3}+Q4@mH4qo?96g6BC;UY^0Ad4_nLd6;W>1j#07%K?2uUq5f7{$kiSfbSw7?g8bS z09z)PWcPzltUv2`s-?y{VYECqX}Qx1&zh0VTdmbs^QZ5&EVE8j!kpNxX83;j?Aw+l zW@1@ljaZ=UpgfqbVL}|$OGhTHHIbI3pZ`S5(&{MMu+&<~q9seoq_vvzO~?bf0&r2K zX9Cdy&s3|!KE2-l+GNY*bCa!hlI?J;pK{3jgv`2ZB{*w4dQ^Kql&1F}7nRoPB>9$F2OZ3oQIF@JFe=kT2M zd<}hm&!2~w0%Hr$r31=_`E&3reSU(}-8b8JV6hQj_Cz*s#xc$34=jc`HdjyJnsQ)C z^;S#87bebEKjReggOdCpm46ZB&pHoP^i3SDI52Uf;?Tt16-OoxRrCXWFVOb^{Q%HM zfqoe1V?aLw^Z?KSpo2h%fsOz@0(2DU7|?N`6F{ecPEXk2`8CXaaNh^uIt14dxccF` z7q0u@dH}AYa6JsyF}NOqYXGhQTtT?Pa7Ex6fh!7E46Zm_3Aj>lr6&y6DTCDAGrNMA 
zz?Y$Hf`=aHm4c?bp?hXm2^yXQl&=;v_%+gN1dT5{0BsXAz8nDbT0t*j^g2P~#scK) z1-+Qj^@7It8$iB6&`TJ-5$HyX#V{%>GgXae4N`Y()-jNwxG9)1wp*=+v3PN&di&#s zu|w)3mgPPoE%y;=xsOQ8eMDOBBhqpok(T?2wA@Fe(h_u{Cq!;Mpdh!CSJ71a+@qexT zJCmpGej*FItZb{z{wX`(ldP=mrh5`XuKtG|T*AZ9Ld=x5dKlIa3#IEk3|EpWO4~dP z>xq@p4i7^osit&;hhd{FW69oCaV9%K*gZJOfBx%$X}Vs1ZjcwKoUT`&AHz*;xpNF# zuwHO6uP4n1^24>9FU^awlj?Wm818PswwJ%a+L7i9`GHlOFU_0LLiIg4hF+`}T+8c8 z^NIZM2F{n})o7#oeL04Q8nOLNf6dyL<{SBeEu1gSyU{`Q`*RHM!+OEncs6#;>TILYI7OfRcm|eXxR2I1>XF`*$`s-Q@`IgB zF9*JYdfScdQ^*hRd81y$iWaor{dPTKpqF9rE{5U5Z`XIA{GK0C4BXE!_z=VJ$9`1r zA)CIlX|p9;RaNXS5kL3~KG*#D#Q7cxP1omzT_`ti0a!s4C`@T zBa<+%*uLT~BU79&`3o7Q`VJ36C+bH(&Fd-t68jA2Oa4NRQ~d@H!$zFn$kT7={RP$v zl#l!=_ZL_*#Cn4K=$|t^)n8(NS?n*7FERfZxyUg3Rfe&bnZHE7&HQ8JRff^;GK_tH z&R-s@EcTZ;#uto{lD{M{z93Kb7Z&r}JoeG}lEV0c^%Q?eV|+or>@O@vcJNq9<4YXl z3)WNoC4un;`Le&TnCjuNmByD8#uu!o_)8k&3-V=uVKKOm$6^{^;uv4Bp5iYFj4#NS z{e{KsejdANd`V$^!Fr0nq%poAU-lOkzV|+orry zuSYb#L@~Z#J;h&Q7+;Vt`3ql<`uKW8<4XkN3)WNoWd!33@+E)a>rp>nk7#^}Vtm1R zioe7#z93)n7rq`H#Fgi7yczUq*O*iSqao#Fui7#;;UlKgNq3I zONz&rv=mGn~V1_ zYL2AKg>*R+`hhM-Qgb1!r^}ge9(6gAE*H|}OsKERk*;bPoJU=bq|1eLITPyZ zawIhu!g{)#3Fq}%Ig;!za;}7Wx*SRN7tEE=Uv5T@)TH=}nj`UZprTwzmouRs=yD`} zZdAyHu%0ew!gPS@OsKERk@&e(As51Wx||8;QI{j}bF89VNS8CAzAi`N z=U#$hi{g>2f65Uoclff4Lbs(nV8od{J{GT`r`{na~e(Ig*+S zVLe^Wg!8D&k#xC`E@wi0U5=#YLRe3iGvPeyawJ_Yq|2F5Uza1Pxe(UVoXX-m0R!+|M7zB@``@N1_@1|VLcG(1?|b|s)W`3$ z^VY69FVw^L67c)}Gx**9-mVC~gHQ*#x4o+?lIiMlW@ue6t&8t0SbDt?8@^BD^+qyY zuaofauLzlW&kht>4%h&=8L$zs84zEB@&dL1-VfLT?|;END)`+SA<=g?6XNtR++r}? 
znqzn?Sx)5*9)`CW47cSN-cB}Cd83Enc7vfS$8ZN}rgFE3VUxjdXO3Yr@ltt#j+R11L-Xay;Y%aRpVy?_SOMzt%T&&F=jciDND8IF4LSDb6A z%y8s%#k%U31o>MIGX56e!;ur`|NKhj;wPzm>tQC}s_5N%H{;(b>FptNdJXq5d4r;N z+aEFhZIWJ>q_^#UCf}y$-Tols-!AFxmGl}PV)8~sZ~GrJ{&q>vBk8$5#^f$VZ-FtyBT85dtMbT?L&iJj8o>$Uq z8)fn~MX!B~@!KW6UP-Ux<4oS6=yfI;zf;oNFX`9bmBz}h9K*dlc6)g2?l#s^c~6ev zJ|4ThJa+dQ+o*hhj^UjjIJTY>v9<4g%WmmeH-O%w&|M0>SD`%$-L23)3cXLEy$ao{ z(EAnoPKoBR>EGepQ^clsD)3#c;x&8)a+P%EXYBV^LrFJw;(nY0fOvd4u?+7k%bCxuMf z9|+kgA(QqBLN+dB(!N2+VnQbEA%tv5$fW&*kPQf#w6_qlhlEVpX9(H7LMH7wgzTV@ zN&62W^9q@?7ZI{OLMH7?gse@-q&Jt7S23He0kmchuQmFFQ$C7C(2*>XirqU7yf8Z zL~ij^#ysWbc`!!+nO`T0HU&!Y7bT(nnCv5(W&L)>HWE0nTlogF9>>Gv6FY0V^ z`9d~vjh|-|_MXD#r*$^Dd?A~-#?P|}`&VHzrL)Q93)#dqex6O(0}GozolP!Z$R@7w z^K8OCTG%|Gv&rQP*~B$|o=w<`3!AMvn_Rw-O=4m9{%2=@V8!dr)F!M7I^38)NE}7!=;T3m$@0%wJ==XVKH6- z-4A=MCUq(Lw`JJwhk1^H?bdc#3~cwqJkP*(mwH&cALh9Rw!5r{wfkY7Z(zH1Ue@l1 zdd>muF5eHnP0y?+;arx`*S8R@HvFH^*QO9TNnZm)`g5FtuUp|~f&uxTtNAf3_H(&~ z^HahArv)4uj=`^q?sC9yj5-b-@;Y|9UGUG*up>Ga8ww?Saepd;e~j--j`>m%_+$M< zd^G7B^v8V3@pybJk%Ah7$NfXYu_2I$e8Es)e8@K(3y=B2V+o%>8ug7OeeuLtcsLqL z?novEcSNJ<(e1z)2@R$w;cy3o{$_tL+}hUO))sR6{e$kHySb^oc`y`gZg1@jHNg*% z4h9Bq6L&oy@%20cFS(t%y;xSm`Hgt?FMZa~@*uS-tnl}7zFV<#g!3)*53vj8$wQp~3xyx#{Qjo7x>%x<4EI@w-(g6f$7p#GdA3_( zLUD@oPbl@zbAFE9SV8G8IDa-!V9~;hod4_t#r$t`KDoD;|2pR%>o4a2Gw1gy{I@y( zXQATy?{ofZ{C9$d^rxJELE-<3^MC)asLAju=XV?}<}aoWw1_wr{z}gOv0?}Q+#5dN z504bv*~a;=Dg0K>e^VJx59ikp7Tb9f`DQB-zlSWO2Z68q-G_6)*ZuATamB_`A>s<2 zjKNP?5Z6#_+!gUBBZU8t{8WOt5@SJs%1>OW(1{eS?H?T;1a>Hz7=uSV7=#C$xCX~Y z@y9<1{wzky9{`MiHY+hCQGXy5^|{@x`wPX6o?jmld)o`#&R$XMY%Vn1+gxb4XJ?^qx4TfGv$MeM?A$3V^zJY8qPw?X zx4XA+zx$dCJ?`l!ROsoDDin&|=0cCXI}3Gto99|?77kflkgBa-q-J@ORyWuDwDo)>DBFMPJ;%Ks(6s6>y<1)^CAg{pMz9A4D1PR=BWV zLf<2P-$fu+l0F`(jKv*rp?(z-_}Rul{9b$EciA~xy@6KXqb~LVKO!!P-%*bt9h66q5WO3gF>C literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co new file mode 100755 index 
0000000000000000000000000000000000000000..c435c53f18002088249b75743e54dcc2c2350cb6 GIT binary patch literal 26672 zcmeHQdvH@%dOwn7TQ&wvco{J0LJVP3BjmSDz{n2>0YjWD*(F&LS+-=$Aj^&{97tJn z?O=zcX-L@ZcDJvPgph1DuT7KPY?E5rwB7C>&;gp6CX-~w-O0}EAKMw4b~`(p7WMni z!V{%p+c$Y9zASq=u+QB6dvXs%})^x0FXS6`s z4zJbo0F!%0({YHAYdw?FS5 z?Cl*6#>jK-;r@q%&L6r*{NV#Z-~Rqc;AN)j>psxi8;njclV^wg0b%6S7`@Pa$VXiW zPq9No!Dstxs?|&U0lmb*Nchkdudq!FR;n57s#jR1G5Vsfuti*Ul@)>`vbvtkBEYCD= zW#L4ggjkOD2XnMPoP|A~gf=S@&YD?xk!RuT(OexI%hds_z1c=Xn-vXbpUA>X+C+RO zR|ntA)xlq6;eba(ODZBxW#J`lB2MS(;DuZryp)9lF%hYVcr6PrX%q2At`6SH)xkSi zIN%e}nu>@-7GBaO;_q{H@QYj>oXx_4n21zFoX^5b+C;pUtAqD*b?{LZ4o>e3^bd$- z^XzmM9?~Y^lUyDAI#&n3$-=?uUGgMc%ECk1B>ZQt4t|%bga6LL!RZ~+B%HI6EL_O5 zaL(q)*@H7@4<%W6I5RvH?vL#kABuCf(kz^$PsM_qJ(T6_p*#x@FZK`n1`ou1-BtCe z=`gZzls+95IeS=}vxmxS@sM3t<#cPme>gR79ya2&HGj_88Z~BD+t$AT%_v`Y+UVYE zxnE5e>YS}TXAj$Q_Rw)H9@Ka3bGGYV9*RYM!@&XnP-rk3Ouc`dZ0u`&|MqCiGQNL5 zb@xRF4-CQZeFMQr%-7q0Fc|RR7U-X_G5urOl!R`irQi?u_eC6U&fufTIa}9T?g9Tn zUvJnS^F1^e-Ors<86z|IwtFbrKjM!Ct?Ni#D0#yj4L%5M(X{ct4J(Sx_Cxn@-%!-oGk72p z8+zV7d}yG1F#JJ%7FXcjAAmlWG6~<| z6xb8fa0e#fCBOtAed7x4iBmv(0TX~Gpb1Fw?Flb{7qAm)TS3A=v=<~ihr12(!~XdPQc^DZXCBejHJV{CJA$$wA&dHLUEE57;!ka zlkZODkw2Zv%lC{PNjw%mPfkvqCy&E%9F9pi&P*lAx2G`jlH>%(KjQc($LA@|OU|Qn zFgrP$Vo|b)Vtz88;_BpTilFx>e$xVa6hV(7=urecic8}z(>3WjvDnpovUyRn**xC- zc%4}gx09ps_vYFX&2YO(^6Zp7ub?n#ziAYxWYWF>@kr9XXf%0*kQGZ@CtYw)^<%|x z1MEu3{0qcogKym?E|zQ}W<@fw?isQ&SiNn_UI*JThU; z@$Zx4anpWm)U+Rm<1siMAI15c4L0Qb=yXZD!$h3TaSq4197{RQ11y0#Am>N8%?P&{ znPxU4)68aMn%Rs@qfI$|4<{iGUz#%E{lWVgcRDNPt;aoRRM^;Rs*81HGt^r7QouES@F9d2A545 zCu&Qaqh{?>V~*RM@Yy!wNvE^;a`MQXj=9d!e7Faf(+=NfPhND)vJ=NFXUGBP2Iay1 z^%G(g&)GNTED6=l`R+Gs=M;z0hhw&r6waDM#+=2JZ$lo?d4Pra-cdvYKEri}>*O-m zTVu6jPmR^NNW9TlHeujrvd+ccf4Fb@{sYqYALjw?0ppCm|A6%U2c+*mAbtM<>H7~z z-+w^*{sYqYA5e@#=Mrc-mwwReL#nSjsQIb zbQI_q&?7(}1o~m14+A|4v9caUN|0rqYI91I0A6=!V!XF z9~@ygB5(}B5rrcL#|Rt;;dmI1!*GnkVZt#E$5A*YM@{!(la$^yJ)hX%^9$!TNOV9i z5H!3eNG}vLeD;xEBxnbt7YiC6@Bn$4pz(nSpqB``kkLy8jl~$qmkGLv(dB|JX7qAF 
z&tminper2?)6CBwFQ}X}%_?>u2CeZQqcT9RtcKl$IXJ~_fa8eejlp^&F^E4 zp!t2=0`xkk(=>~U#*5b-H_i3ZeB!x0pGeE|iL^YQNXzqyv^<|k%kzn}JfBF*^NF-P zpGeE|iFBsFZ`9xZtPo(AfL^{)adO!x(LE{t8<@rQf zo=>FZ`9xZtPo(AfL|UFtq~-ZUTAoj&<@rQfo=>DR&1VhB#C+npJfBF*^NF-PpGeE| ziL^YQNXzqybcXp{MxKGS_~|K8Q!H`)WbELH$Kp`A$FT;hD|0>S;SV4*@z1GXH(tPsrhieJxN-zwR zjXUCbj-&CImt9vS`PYB$vb|m|=PKlN=f7UA<}A}geR~oNYf&#y#Py_DL4NOS&X;1w ztfuyj35FXzIQG1E**H=xA-{Ve=SwkV)>3nX7s z8Q^>=R!I-FcX}C?U`&M`e3y-*#Ol5SoG-;H2~zt~FT(|BA3ntOlvs^?lk=rmC4JPs z%*(JGV=6Ruk&Um!>b?oimtvLlQ+vb9umbJF-{N{otVaHr^QBlN`>Fj}FT+ZVpU{)P zoEfXQzVO_OI{W?vbdV?IU=ge4MV^Zo22elzeXg&>YUD3DUy0Sw)1NZ`O04dCj`NjR z4gY}il~|3u%=t>JhJK`v)qQWK$7=YeELJ0bn~K%f3esbh=2cv`q*$eS6?t;3vfS_C zYd|WmqMj0~G_N9Gj#ai+H1ahgl~++uiB+0ckuS$8TU)$*jY;KI)Kg-W=2hg&vC7t> zZG25iM5~G^D6S?SY>P79=_&f&a1d~OR-AxD)Mx( z%GS#eUpG^E74?-^rFj+kO03enihLzjX4rtH_gLmDd1-*8r)!ih4?{(!7d%DOPz6;N&$xDzBoR600%6JWs%B!fS#463J$d_W3*8m2u0aAGt^^{npc@_CmtnwOQEw2G8 zGv-xX@1D75Pf6(!7d%C01!(MZOZNG_N9GiB+0ck*~yR zE_wC7d`(_eY5{ z8X%QdQBR3gnpcr8#VW4>w(%Mul~++uiB+0ckuSw6uK_xF4Uo#KsHem#&8x_lVwKkb zdw30yIj<@;0L`mct^q;`Rs)1F{%O66`bwEz7nf6uOeTGRhn0kuf!_N ztH@VkHJ7~lsQ&TJVwL7q@%%;74?)@rFj+ka;z%*QmbA?JtbCYUPZnftI9sss#j4@iB+0ckuS%pvhU59S8*Lx z^D6Q(*8tYu*{WAjUx`(kSCOy8D$T3NS7MdsRpcwNO7kl6l~|>D75Pf6=8{((5Uc<5 z+4vQE0lGcEt~Y@3s_O-)dw$f@^#(BBbiDxGo?q7+Kzm&;K;84Bo~}25@uuqq==S`& z-T>O`dI9R5AN6#-0gSDy^#a5`O1J0N^$4uoitDiwuQyUJK<)uxPasn-U{79p9ia9C zbbEeXZ@|i}xZX**Ro(NWo~}25F{JAS==S`&-T>O`dI9R5AN6#-0gNGCFF?2F*YyU_ zUe^my_xz}*>kVYgt+J`K`Q)dQx40F+}q!@})X}_m$K3{JP!%+S9y>_EH_d`_R^& zAN6#-0gNGEt7tFP0ld#`?fFqp*Bi)~S8**>V^y=~*Yyaj zyox@QSiO;Y0dlNfDX-d!+Doyj?)j~}ih9z13uB1pRpd*ts@wDHdIM-r^D5d)v8wL* zQBT(!$e33%*8pjIeqE2i%Bz|h;6~~N$gz5*y!sM6KO=w7s`dhOdw!Z%ajlbj16E!| zzLZzh?~1Lwih4?{(!7d%IaW1$eqC?C%ByHE$EvdDxAH3LDX~iPD)Qx6)$I9oy#b6t znpe?Yj#XvPk9xY^K*qe9xduqv^V7VFx>Aq8%Bz|h;6~~N$gz5*ylR73#a;kDqgRkW z5l_N%qh0V^LW8R~{_nO4cvh--R6Gle&x!s6=;JeCNoR9uJLuu_jQHH%QG7;kb2EL; ztQ6+U)!ZB!Z*Hy_r@Ea~7oT->+y&2~;d8j~Oxk#7X9Xebc?3eHK5$in!+C)Cu=Fax 
zTEH5>M!&ZMS_jnmrnGCBF3~R_L zDzEi2tTP$bCm1%68Y*w}GHfy#Zb&e^jkHtwMlZunCd1|g!_DL#D))LBwwMfC6Aat% zKFZ+PkOG)jd}eonlRXQKe8c68f0l&ja^XH#z?|Kb2hY}-wI4ewOX1^Od%>|8<^$<< z3hh?t^$P7#=qiP-R_Gdqu2twdg|1iV28C`^=q82UpwPD|^hSl=q|nU@y;-5X3f-d6 ztqR>H(ePg2`*j(5XbWb0-ff1r;4{ZK{Y*Rqp9%4K7?(Z|$3RDX9`N~Qcuv)Lrns{d za7{jZ9`G4xcy7Fuem3xVROPd=k!(9(d9NK(X!1hk;sQIQ@(a#|pAzJ+`v&8$13t7A zDZick+lBn1$EnG^f7*o zWOut{SKH6zwTfNce#Wnp?6ygE^#e>^uh=yVGJb<(cZX!x_#l%vDt1l7jNc^LZI|pe z9ANSdirsAwG5&3m-44la;~^&BsMu}#L&o1E*>y;E&EI75X2ovv5ysyv+3l3F?vie1~IjNc~N-O23;`Lp+3uo#$5z^xGM^8mL2y##PO(5nDD zfW8%Q*NoiV&U5z`(@AsX)&#@bdG6lEbN3zQ5-Q)GV7PK>F~ z9g4M29QuYLzr17rzo~t$mQ%j0iINw-tclWV;V)|<@`cw#xbFNO^1;u@-@w0dczbGt z?eY}Cx(MndkuN?4xe4;0Y2;`p_+rnBK7Xb23AL8+`LWI?muK>MUgs0)HR1CYI-gvg z$>-ZTpHTA&pMS0M$>o`R;u?>#wE2X(QTY5Tolh>$n9lgl&t#5I0~PpE-~PoK^wmuK>cYy1qKP)7@&+jTyno;S=@*!sjfVPcGN_6l?s?!A`94e+U0&()u^z zrH7|WDjo2>%i-zS9)@#j8P08FSh|tnJg>ui18lGNT1`r0^xvf8xYx6ff#c5J>M(KK z>)Fr1ap!Df<6h6c297&-I~(_U_BU|c(hfH6b?tM&xbt>FY}20g81!Wc{aOU^(qjBe z=%a~j4v!411J1r+PmB_V zClK)0_yfIl^$qp)L66_x;|X|bsv2r~f`OWbx~5%q z!};$$kk0=R=QrM$&VPsVzq2Qu|Bsw+D*TI_UlB>y|CsYH-Jj0?HRpe%@c)DJYkSi5 zFLVA-S319l22df{tMC_ae!1cYekKN9g~ZX9?&ns{zto$~uj728%%_+0cPsoKBj4^M z;{T%{yzKzK?sue)0AKezQp6n@j0K51a3})5g+<(bkpu3Ke>g<=U*yN4#2p-lD;pd9^)6rgy}Md=-ocWV2(_#b!xq1?$HxokOPp{N|9=c2 z7VDFs(RLOb!c44#0^w@_`oeV($8s{#S|8^E{a?u>d|Sg_VjUD%!TCZi>^U2LhlQ=N zLN3-#frer)u3w-F Date: Sat, 20 Dec 2025 15:57:40 +0800 Subject: [PATCH 3/5] update --- aiter/ops/gemm_op_a16w16.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index 3f36cde219..645f49c9fd 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -43,8 +43,7 @@ def gemm_a16w16_asm( splitK: Optional[int] = None, kernelName: Optional[str] = None, bpreshuffle: bool = False, -) -> Tensor: - ... +) -> Tensor: ... 
@functools.lru_cache(maxsize=1) From 52fd47b4f58e9ad53ad188ece1e68480911913eb Mon Sep 17 00:00:00 2001 From: amd-ruitang3 Date: Sat, 20 Dec 2025 16:41:08 +0800 Subject: [PATCH 4/5] update --- aiter/ops/gemm_op_a16w16.py | 8 ++++---- aiter/tuned_gemm.py | 2 +- csrc/py_itfs_cu/asm_gemm_a16w16.cu | 2 +- op_tests/test_gemm_a16w16.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index 645f49c9fd..ceea0ae73a 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -46,9 +46,9 @@ def gemm_a16w16_asm( ) -> Tensor: ... -@functools.lru_cache(maxsize=1) -def get_semaphore_workspace() -> Tensor: - return torch.zeros((16, 64), dtype=torch.uint32, device="cuda") +@functools.lru_cache(maxsize=None) +def get_semaphore_workspace(device: torch.device) -> Tensor: + return torch.zeros((16, 64), dtype=torch.uint32, device=device) def gemm_a16w16( @@ -59,5 +59,5 @@ def gemm_a16w16( splitK: Optional[int] = None, kernelName: Optional[str] = None, ): - sema = get_semaphore_workspace() + sema = get_semaphore_workspace(out.device) return gemm_a16w16_asm(A, B, out, bias, sema, splitK, kernelName) diff --git a/aiter/tuned_gemm.py b/aiter/tuned_gemm.py index cb81a650ab..4465facd34 100644 --- a/aiter/tuned_gemm.py +++ b/aiter/tuned_gemm.py @@ -399,7 +399,7 @@ def asm_gemm( out_asm = torch.empty( inp.shape[0], weights.shape[0], dtype=otype, device=inp.device ) - sema = get_semaphore_workspace() + sema = get_semaphore_workspace(out_asm.device) return gemm_a16w16_asm( inp, weights, out_asm, sema, bias, splitK, KernelName, bpreshuffle ) diff --git a/csrc/py_itfs_cu/asm_gemm_a16w16.cu b/csrc/py_itfs_cu/asm_gemm_a16w16.cu index 2f57864511..4d6b723e4a 100644 --- a/csrc/py_itfs_cu/asm_gemm_a16w16.cu +++ b/csrc/py_itfs_cu/asm_gemm_a16w16.cu @@ -113,7 +113,7 @@ get_heuristic_kernel(int M, { int split_K = 1; if(splitk.has_value()) - split_K = splitk.value(); + split_K = std::min(splitk.value(), 16); else 
if(cfg.splitK == 1) { pure_tg_num = ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index 06a6c25efa..d2a04692dd 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -59,7 +59,7 @@ def run_gemm_b(x, weight, bias=None, otype=None, scaleA=None, scaleB=None): def run_bf16gemm_asm( x, weight, out_asm, bias=None, splitK=None, kernelName=None, bpreshuffle=False ): - sema = aiter.get_semaphore_workspace() + sema = aiter.get_semaphore_workspace(out_asm.device) return aiter.gemm_a16w16_asm( x, weight, out_asm, sema, bias, splitK, kernelName, bpreshuffle ) From ce812e878d8d6094bdb97f95d82ddd7d000625cc Mon Sep 17 00:00:00 2001 From: amd-ruitang3 Date: Sat, 20 Dec 2025 16:53:45 +0800 Subject: [PATCH 5/5] update --- aiter/ops/gemm_op_a16w16.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index ceea0ae73a..d83ffd0309 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -46,7 +46,7 @@ def gemm_a16w16_asm( ) -> Tensor: ... -@functools.lru_cache(maxsize=None) +@functools.lru_cache(maxsize=1) def get_semaphore_workspace(device: torch.device) -> Tensor: return torch.zeros((16, 64), dtype=torch.uint32, device=device)