diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index e9f86a5cf9..d83ffd0309 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -20,6 +20,7 @@ def gen_gemm_a16w16_asm_fake_tensors( A: Tensor, B: Tensor, out: Tensor, + semaphore: Tensor, bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, @@ -37,6 +38,7 @@ def gemm_a16w16_asm( A: Tensor, B: Tensor, out: Tensor, + semaphore: Tensor, bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, @@ -44,6 +46,11 @@ def gemm_a16w16_asm( ) -> Tensor: ... +@functools.lru_cache(maxsize=None) +def get_semaphore_workspace(device: torch.device) -> Tensor: + return torch.zeros((16, 64), dtype=torch.uint32, device=device) + + def gemm_a16w16( A: Tensor, B: Tensor, @@ -52,4 +59,5 @@ def gemm_a16w16( splitK: Optional[int] = None, kernelName: Optional[str] = None, ): - return gemm_a16w16_asm(A, B, out, bias, splitK, kernelName) + sema = get_semaphore_workspace(out.device) + return gemm_a16w16_asm(A, B, out, sema, bias, splitK, kernelName) diff --git a/aiter/tuned_gemm.py b/aiter/tuned_gemm.py index c7c8b5994f..4465facd34 100644 --- a/aiter/tuned_gemm.py +++ b/aiter/tuned_gemm.py @@ -24,7 +24,14 @@ import torch.nn.functional as F from torch import Tensor -from aiter import dtypes, gemm_a16w16_asm, hipb_create_extension, hipb_mm, logger +from aiter import ( + dtypes, + gemm_a16w16_asm, + get_semaphore_workspace, + hipb_create_extension, + hipb_mm, + logger, +) from aiter.jit.core import AITER_CONFIGS, AITER_LOG_TUNED_CONFIG from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.jit.utils.torch_guard import torch_compile_guard @@ -392,7 +399,10 @@ def asm_gemm( out_asm = torch.empty( inp.shape[0], weights.shape[0], dtype=otype, device=inp.device ) - return gemm_a16w16_asm(inp, weights, out_asm, bias, splitK, KernelName, bpreshuffle) + sema = get_semaphore_workspace(out_asm.device) + return 
gemm_a16w16_asm( + inp, weights, out_asm, sema, bias, splitK, KernelName, bpreshuffle + ) def triton_gemm( diff --git a/csrc/include/asm_gemm_a16w16.h b/csrc/include/asm_gemm_a16w16.h index c7788bb3ec..26a207882c 100644 --- a/csrc/include/asm_gemm_a16w16.h +++ b/csrc/include/asm_gemm_a16w16.h @@ -6,6 +6,7 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 torch::Tensor& B, // B:[N, K] bf16 torch::Tensor& out, // Out:[M, N] f32 + torch::Tensor& semaphore, std::optional bias, std::optional splitK, std::optional kernelName, diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 135e8ae03b..5530e1f3a2 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -456,6 +456,7 @@ namespace py = pybind11; py::arg("A"), \ py::arg("B"), \ py::arg("out"), \ + py::arg("semaphore"), \ py::arg("bias") = std::nullopt, \ py::arg("splitK") = std::nullopt, \ py::arg("kernelName") = std::nullopt, \ @@ -1537,34 +1538,34 @@ namespace py = pybind11; #define GEMM_COMMON_PYBIND \ m.def("get_padded_m", &getPaddedM, py::arg("M"), py::arg("N"), py::arg("K"), py::arg("gl")); -#define TOP_K_PER_ROW_PYBIND \ - m.def("top_k_per_row_prefill", \ - &top_k_per_row_prefill, \ - py::arg("logits"), \ - py::arg("rowStarts"), \ - py::arg("rowEnds"), \ - py::arg("indices"), \ - py::arg("values"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ - py::arg("stride1")); \ - m.def("top_k_per_row_decode", \ - &top_k_per_row_decode, \ - py::arg("logits"), \ - py::arg("next_n"), \ - py::arg("seqLens"), \ - py::arg("indices"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ - py::arg("stride1")); \ - m.def("top_k_per_row_decode_fast", \ - &top_k_per_row_decode_fast, \ - py::arg("logits"), \ - py::arg("next_n"), \ - py::arg("seqLens"), \ - py::arg("indices"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ +#define TOP_K_PER_ROW_PYBIND \ + m.def("top_k_per_row_prefill", \ + &top_k_per_row_prefill, \ + py::arg("logits"), \ + py::arg("rowStarts"), \ + 
py::arg("rowEnds"), \ + py::arg("indices"), \ + py::arg("values"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode", \ + &top_k_per_row_decode, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode_fast", \ + &top_k_per_row_decode_fast, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ py::arg("stride1")); #define MLA_METADATA_PYBIND \ diff --git a/csrc/py_itfs_cu/asm_gemm_a16w16.cu b/csrc/py_itfs_cu/asm_gemm_a16w16.cu index b627f06af9..4d6b723e4a 100644 --- a/csrc/py_itfs_cu/asm_gemm_a16w16.cu +++ b/csrc/py_itfs_cu/asm_gemm_a16w16.cu @@ -9,16 +9,15 @@ #include #include -// start to prepare the input and output buffer struct __attribute__((packed)) KernelArgs { - void *ptr_D; + void* ptr_D; p2 _p0; - void *ptr_C; + void* ptr_C; p2 _p1; - void *ptr_A; + void* ptr_A; p2 _p2; - void *ptr_B; + void* ptr_B; p2 _p3; float alpha; p3 _p4; @@ -50,10 +49,12 @@ struct __attribute__((packed)) KernelArgs p3 _p17; unsigned int is_out_b16; p3 _p18; - void *ptr_Bias; + void* ptr_Bias; p2 _p19; unsigned int add_bias; p3 _p20; + void* ptr_semaphore; + p2 _p21; }; std::tuple @@ -64,6 +65,7 @@ get_heuristic_kernel(int M, std::string arch_id, bool bpreshuffle, int add_bias, + int clean = 1, std::optional splitk = std::nullopt, std::optional kernelName = std::nullopt) { @@ -72,7 +74,7 @@ get_heuristic_kernel(int M, HIP_CALL(hipGetDevice(&dev)); HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); uint32_t num_cu = dev_prop.multiProcessorCount; - // printf("num_cu: %d\n", num_cu); + uint32_t empty_cu = num_cu; uint32_t pure_tg_num = 0; uint32_t round = 0xffffffff; @@ -84,41 +86,44 @@ get_heuristic_kernel(int M, for(const auto& el : *cfgs) { - if (el.first.find(arch_id) != 0) + 
if(el.first.find(arch_id) != 0) continue; const auto& cfg = el.second; if(kernelName.has_value() && el.first != (arch_id + kernelName.value())) continue; if(kernelName.has_value()) { - TORCH_CHECK( - N % cfg.tileN == 0 && - cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && - (add_bias == 0 || cfg.bias == 1), - __func__, - " the specified kernel name ", el.first, - " cannot support the input shape (N=", N, ", tileN=", cfg.tileN, - ") or bias/preshuffle setting (preshuffle=", bpreshuffle, - ", bias=", add_bias, ")." - ); + TORCH_CHECK(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && + (add_bias == 0 || cfg.bias == 1), + __func__, + " the specified kernel name ", + el.first, + " cannot support the input shape (N=", + N, + ", tileN=", + cfg.tileN, + ") or bias/preshuffle setting (preshuffle=", + bpreshuffle, + ", bias=", + add_bias, + ")."); } - if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && (add_bias == 0 || cfg.bias == 1)) + if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && + (add_bias == 0 || cfg.bias == 1) && clean == cfg.clean) { - // 1. select splitK int split_K = 1; if(splitk.has_value()) - split_K = splitk.value(); - else if (cfg.splitK == 1)// auto select + split_K = std::min(splitk.value(), 16); + else if(cfg.splitK == 1) { - pure_tg_num = - ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); // M-orient support OOB + pure_tg_num = ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); if(pure_tg_num < num_cu) { - TORCH_CHECK(cfg.subK > 0, __func__, " cfg.subK must be greater than 0 to avoid division by zero."); - int max_split = std::min( - std::min(static_cast(num_cu / pure_tg_num), 16), - static_cast(K / cfg.subK) // “K-dim must satisfy min 128 bytes. 
BF16 are 2 bytes each, this means min ele of K is 64.” - ); + TORCH_CHECK(cfg.subK > 0, + __func__, + " cfg.subK must be greater than 0 to avoid division by zero."); + int max_split = std::min(std::min(static_cast(num_cu / pure_tg_num), 16), + static_cast(K / cfg.subK)); for(int i = max_split; i >= 1; i--) { if(K % 64 == 0) @@ -132,16 +137,15 @@ get_heuristic_kernel(int M, } } - uint32_t tg_num = pure_tg_num * split_K; - // 2. better or not - uint32_t local_round = (tg_num + num_cu - 1) / num_cu; - float local_compute2mem_effi = cfg.tileM * cfg.tileN / (cfg.tileM + cfg.tileN); + uint32_t tg_num = pure_tg_num * split_K; + uint32_t local_round = (tg_num + num_cu - 1) / num_cu; + float local_compute2mem_effi = + static_cast(cfg.tileM * cfg.tileN) / (cfg.tileM + cfg.tileN); bool is_earlier_round = (local_round < round); bool is_same_round = (local_round == round); bool has_sufficient_empty_cu = (empty_cu > (local_round * num_cu - tg_num)); - bool has_same_empty_cu = empty_cu == (local_round * num_cu - tg_num); + bool has_same_empty_cu = (empty_cu == (local_round * num_cu - tg_num)); bool has_better_efficiency = (local_compute2mem_effi > compute2mem_effi); - // printf("oob %d, tielM: %d\n", oob, cfg.tileM); bool less_oob = (M % cfg.tileM == 0) ? (oob > 0) : (cfg.tileM - M % cfg.tileM < oob); bool has_same_oob = (cfg.tileM - (M % cfg.tileM)) == oob; @@ -153,7 +157,6 @@ get_heuristic_kernel(int M, compute2mem_effi = local_compute2mem_effi; oob = (M % cfg.tileM == 0) ? 
0 : cfg.tileM - (M % cfg.tileM); selectedKernelName = el.first; - // printf("Selected Kernel: %s\n", selectedKernelName.c_str()); selectedsplitK = split_K; } } @@ -163,175 +166,101 @@ get_heuristic_kernel(int M, return std::make_tuple(selectedKernelName, selectedsplitK); } -torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 - torch::Tensor& B, // B:[N, K] bf16 - torch::Tensor& out, // Out:[M, N] f32 +AiterAsmKernel* get_or_load_kernel(const std::string& selectedKernelName, + CFG* config_map, + unsigned int& SUBM, + unsigned int& SUBN) +{ + static std::unordered_map> impl_ptr_map; + + auto it_kl = config_map->find(selectedKernelName); + TORCH_CHECK(it_kl != config_map->end(), __func__, " not find kernel~ " + selectedKernelName); + + const auto& cfg = it_kl->second; + const char* name = cfg.knl_name.c_str(); + const char* co_name = cfg.co_name.c_str(); + SUBM = cfg.tileM; + SUBN = cfg.tileN; + + auto result = impl_ptr_map.emplace(name, nullptr); + if(result.second) + result.first->second = std::make_unique(name, co_name); + + return result.first->second.get(); +} + +torch::Tensor gemm_a16w16_asm(torch::Tensor& A, + torch::Tensor& B, + torch::Tensor& out, + torch::Tensor& semaphore, std::optional bias, std::optional splitK, std::optional kernelName, bool bpreshuffle = false) { - TORCH_CHECK(out.dtype() == torch::ScalarType::Float || out.dtype() == torch::ScalarType::BFloat16, + TORCH_CHECK(out.dtype() == torch::ScalarType::Float || + out.dtype() == torch::ScalarType::BFloat16, "GEMM A16W16 asm only support Float32 or Bf16 output now!"); - + std::string arch_id = get_gpu_arch(); - // 1. 
prepare args - int Mdim = A.size(0); - int Ndim = B.size(0); - int Kdim = A.size(1); + int Mdim = A.size(0); + int Ndim = B.size(0); + int Kdim = A.size(1); - unsigned int SUBM = 64; + unsigned int SUBM = 32; unsigned int SUBN = 64; - float alpha = 1.0; - float beta = 0.0; - int szA = Mdim * Kdim; - int szB = Kdim * Ndim; - int szC = Mdim * Ndim; - int szBias = 1 * Ndim; - int sz_A_pad = 0; - int sz_B_pad = 0; - int sz_C_pad = 0; - int strideD0 = 0; - int strideD1 = 0; - int strideC0 = 0; - int strideC1 = 0; - int strideA0 = 0; - int strideA1 = 0; - int strideB0 = 0; - int strideB1 = 0; - int is_out_b16 = 0; - int add_bias = bias.has_value() ? 1 : 0; - // A row major, B col major, C row major - strideA0 = strideA1 = A.stride(0) * A.element_size(); // in bytes - strideB0 = strideB1 = B.stride(0) * B.element_size(); - const auto elem_bytes = out.element_size(); - strideC0 = strideC1 = strideD0 = strideD1 = Ndim * elem_bytes; // inbytes - if (out.dtype() == torch::ScalarType::BFloat16) - is_out_b16 = 1; - - szA += sz_A_pad; - szB += sz_B_pad; - szC += sz_C_pad; - KernelArgs args; - size_t arg_size = sizeof(args); + KernelArgs args = {}; args.ptr_D = (void*)out.data_ptr(); - // args.ptr_C = bias.has_value() ? (void*)bias.value().data_ptr() : nullptr; - args.ptr_C = (void*)NULL; - args.ptr_A = (void*)A.data_ptr(); - args.ptr_B = (void*)B.data_ptr(); - args.ptr_Bias = bias.has_value() ? (void*)bias.value().data_ptr() : nullptr; - args.alpha = alpha; - args.beta = beta; - args.stride_C0 = strideC0; - args.stride_A0 = strideA0; - args.stride_B0 = strideB0; - args.M = Mdim; - args.N = Ndim; - args.K = Kdim; - args.is_out_b16 = is_out_b16; - args.add_bias = add_bias; + args.ptr_C = nullptr; + args.ptr_A = (void*)A.data_ptr(); + args.ptr_B = (void*)B.data_ptr(); + args.ptr_Bias = bias.has_value() ? 
(void*)bias.value().data_ptr() : nullptr; + args.alpha = 1.0f; + args.beta = 0.0f; + args.stride_A0 = A.stride(0) * A.element_size(); + args.stride_B0 = B.stride(0) * B.element_size(); + args.stride_C0 = args.stride_D0 = Ndim * out.element_size(); + args.M = Mdim; + args.N = Ndim; + args.K = Kdim; + args.is_out_b16 = (out.dtype() == torch::ScalarType::BFloat16) ? 1 : 0; + args.add_bias = bias.has_value() ? 1 : 0; - // args.stride_D0 = 25; - // args.stride_D1 = 80; - // args.stride_C1 = 3; - // args.stride_A1 = 124; - - // 2. select kl - static std::unordered_map> impl_ptr_map; - AiterAsmKernel* impl_ptr = nullptr; - CFG* config_map = &cfg_bf16gemm_fp32bf16; - - // 2.1 static dict + CFG* config_map = &cfg_bf16gemm_fp32bf16; std::string selectedKernelName = kernelName.has_value() ? arch_id + kernelName.value() : ""; int selectedksplit = splitK.value_or(0) ?: 1; - if(!kernelName.has_value() || kernelName == "" || !splitK.has_value()) + if(!kernelName.has_value() || kernelName.value_or("").empty() || !splitK.has_value()) { - - auto it_sel = get_heuristic_kernel(Mdim, - Ndim, - Kdim, - config_map, - arch_id, - bpreshuffle, - add_bias, - splitK.has_value() ? splitK : std::nullopt, - kernelName.has_value() ? 
kernelName : std::nullopt); - selectedKernelName = std::get<0>(it_sel); - selectedksplit = std::get<1>(it_sel); + auto [name, split] = get_heuristic_kernel(Mdim, + Ndim, + Kdim, + config_map, + arch_id, + bpreshuffle, + args.add_bias, + 1, + splitK, + kernelName); + selectedKernelName = name; + selectedksplit = split; } - - args.splitk = selectedksplit; - // printf("=== KernelArgs Important Parameters ===\n"); - // printf("ptr_D: %p\n", args.ptr_D); - // printf("ptr_A: %p\n", args.ptr_A); - // printf("ptr_B: %p\n", args.ptr_B); - // printf("alpha: %f\n", args.alpha); - // printf("beta: %f\n", args.beta); - // printf("stride_D0: %u\n", args.stride_D0); - // printf("stride_D1: %u\n", args.stride_D1); - // printf("stride_C0: %u\n", args.stride_C0); - // printf("stride_C1: %u\n", args.stride_C1); - // printf("stride_A0: %u\n", args.stride_A0); - // printf("stride_A1: %u\n", args.stride_A1); - // printf("stride_B0: %u\n", args.stride_B0); - // printf("stride_B1: %u\n", args.stride_B1); - // printf("M: %u\n", args.M); - // printf("N: %u\n", args.N); - // printf("K: %u\n", args.K); - // printf("splitk: %u\n", args.splitk); - // printf("is_out_b16: %u\n", args.is_out_b16); - // printf("add_bias: %u\n", args.add_bias); - // printf("=======================================\n"); - - auto it_kl = config_map->find(selectedKernelName); - if(it_kl != config_map->end()) - { - const auto& cfg = it_kl->second; - const char* name = cfg.knl_name.c_str(); - const char* co_name = cfg.co_name.c_str(); - SUBM = cfg.tileM; - SUBN = cfg.tileN; - auto result = impl_ptr_map.emplace(name, nullptr); // insert new kl. - if(result.second) // emplace successfully - result.first->second = std::make_unique(name, co_name); - impl_ptr = result.first->second.get(); - } - else - TORCH_CHECK(false, __func__, " not find kernel~ " + selectedKernelName); - - // 3. 
launch kl + args.splitk = selectedksplit; + AiterAsmKernel* impl_ptr = get_or_load_kernel(selectedKernelName, config_map, SUBM, SUBN); const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(A)); const hipStream_t stream = at::hip::getCurrentHIPStream(); - int bdx = 256; int gdx = (Ndim + SUBN - 1) / SUBN; - int gdy = ((Mdim + SUBM - 1) / SUBM); - int gdz = 1; + int gdy = (Mdim + SUBM - 1) / SUBM; + int gdz = selectedksplit; - if(selectedksplit > 1) - { - out.zero_(); - // HIP_CALL(hipMemsetAsync(out.data_ptr(), 0, elem_bytes * szC, stream)) - int k_per_tg = Kdim / selectedksplit; - gdz = selectedksplit; - } + TORCH_CHECK(gdx <= 16, __func__, " gdx (", gdx, ") must be <= 16"); // 16 = 512/32 - // printf("argsize: %zu\n", arg_size); - // printf("gdx: %d\n", gdx); - // printf("gdy: %d\n", gdy); - // printf("gdz: %d\n", gdz); + // semaphore.fill_(selectedksplit); + args.ptr_semaphore = (void*)semaphore.data_ptr(); - impl_ptr->launch_kernel({&args, - &arg_size, - gdx, // gdx - gdy, // gdy - gdz, // gdz - 256, // bdx: 4 wv64 - 1, // bdy - 1, // bdz - stream}); + size_t arg_size = sizeof(args); + impl_ptr->launch_kernel({&args, &arg_size, gdx, gdy, gdz, 256, 1, 1, stream}); - // 4. 
return out return out; } diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv index 3e58803dd2..63698c39bc 100755 --- a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv +++ b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv @@ -1,11 +1,23 @@ -knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias -_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1 +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias,clean +_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1,0 
+_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1,0 +_ZN5aiter49bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co,1,128,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co,1,32,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co,1,48,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co,1,64,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co,1,80,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co,1,96,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_splitk_clean.co,1,96,64,0,0,1,64,1,1 +_ZN5aiter49bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co,1,160,64,0,1,1,64,1,1 
+_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_splitk_clean.co,1,32,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_splitk_clean.co,1,48,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_splitk_clean.co,1,64,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_splitk_clean.co,1,80,64,0,0,1,64,1,1 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..4fef26f0ea Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..4399cc9c78 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..7d039667e4 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co new file mode 100755 index 0000000000..dd39d15d4b Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..fcba584883 Binary files /dev/null and 
b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co new file mode 100755 index 0000000000..9be88b91c4 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..f4eb170061 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co new file mode 100755 index 0000000000..36eb76a83c Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..7676e8b634 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co new file mode 100755 index 0000000000..c59edd0bd1 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..697bf1870a Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co 
b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co new file mode 100755 index 0000000000..333832ab10 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv index 9f2183e46a..bbce04c538 100644 --- a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv +++ b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv @@ -1,13 +1,25 @@ -knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias -_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1,0,64,0 -_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0,0,64,0 -_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1 +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias,clean 
+_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1,0,64,0,0 +_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0,0,64,0,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1,0 +_ZN5aiter49bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co,1,128,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co,1,32,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co,1,48,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co,1,64,64,0,1,1,64,1,1 
+_ZN5aiter48bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co,1,80,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co,1,96,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_splitk_clean.co,1,96,64,0,0,1,64,1,1 +_ZN5aiter49bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co,1,160,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_splitk_clean.co,1,32,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_splitk_clean.co,1,48,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_splitk_clean.co,1,64,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_splitk_clean.co,1,80,64,0,0,1,64,1,1 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..7594ad577d Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..b7a2b37260 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..8231d55a8c Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co differ diff --git 
a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co new file mode 100755 index 0000000000..432c63e0d0 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..c671e30964 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co new file mode 100755 index 0000000000..191f0841ae Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..0d4f08a2f2 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co new file mode 100755 index 0000000000..888bd5897b Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..daffdec4e2 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co new file mode 100755 index 
0000000000..6bc2b0212c Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000..ad1c2180b7 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co new file mode 100755 index 0000000000..c435c53f18 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co differ diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index c5e297b0ff..d2a04692dd 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -59,8 +59,9 @@ def run_gemm_b(x, weight, bias=None, otype=None, scaleA=None, scaleB=None): def run_bf16gemm_asm( x, weight, out_asm, bias=None, splitK=None, kernelName=None, bpreshuffle=False ): + sema = aiter.get_semaphore_workspace(out_asm.device) return aiter.gemm_a16w16_asm( - x, weight, out_asm, bias, splitK, kernelName, bpreshuffle + x, weight, out_asm, sema, bias, splitK, kernelName, bpreshuffle )