diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 504b506513..ce731d73cf 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -20,7 +20,7 @@ this_dir = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, f"{this_dir}/utils/") -from chip_info import get_gfx +from chip_info import get_gfx, get_gfx_list from cpp_extension import _jit_compile, get_hip_version from file_baton import FileBaton from torch_guard import torch_compile_guard # noqa: E402 @@ -264,9 +264,16 @@ def get_config_file(env_name, default_file, tuned_file_name): sys.path.insert(0, AITER_META_DIR) AITER_CSRC_DIR = f"{AITER_META_DIR}/csrc" AITER_GRADLIB_DIR = f"{AITER_META_DIR}/gradlib" -gfx = get_gfx() -AITER_ASM_DIR = f"{AITER_META_DIR}/hsa/{gfx}/" -os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR +gfx = get_gfx_list() +if len(gfx) == 1: + # single GPU arch + AITER_ASM_DIR = f"{AITER_META_DIR}/hsa/{gfx[0]}/" + os.environ["AITER_ASM_DIR"] = AITER_ASM_DIR +else: + # multiple GPU archs + AITER_ASM_DIR = [f"{AITER_META_DIR}/hsa/{g}/" for g in gfx] + os.environ["AITER_ASM_DIR"] = ":".join(AITER_ASM_DIR) + CK_3RDPARTY_DIR = os.environ.get( "CK_DIR", f"{AITER_META_DIR}/3rdparty/composable_kernel" ) diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index 1d1e2f3e9f..0ef416b8e9 100755 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -84,7 +84,7 @@ "extra_ldflags": "None", "extra_include": [], "verbose": "False", - "blob_gen_cmd": "f'{get_asm_dir()}/pa/codegen.py --output_dir {{}}'" + "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m pa --output_dir {{}}'" }, "module_pa": { "srcs": [ @@ -317,7 +317,7 @@ "extra_ldflags": "None", "extra_include": [], "verbose": "False", - "blob_gen_cmd": "f'{get_asm_dir()}/i8gemm/codegen.py --output_dir {{}}'" + "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m i8gemm --output_dir {{}}'" }, "module_gemm_a16w16_asm": { "srcs": [ @@ -329,7 +329,7 @@ "extra_ldflags": "None", "extra_include": [], "verbose": "False", - "blob_gen_cmd": "f'{get_asm_dir()}/bf16gemm/codegen.py --output_dir {{}}'" + "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m bf16gemm --output_dir {{}}'" }, "module_gemm_a4w4_asm": { "srcs": [ @@ -341,7 +341,7 @@ "extra_ldflags": "None", "extra_include": [], "verbose": "False", - "blob_gen_cmd": "f'{get_asm_dir()}/f4gemm/codegen.py --output_dir {{}}'" + "blob_gen_cmd": "f'{AITER_META_DIR}/hsa/codegen.py -m f4gemm --output_dir {{}}'" }, "module_gemm_a8w8_blockscale_asm": { "srcs": [ @@ -386,8 +386,8 @@ ], "verbose": "False", "blob_gen_cmd": [ - "f'{get_asm_dir()}/fmoe_2stages/codegen.py --output_dir {{}}'", - "f'{get_asm_dir()}/fmoe/codegen.py --output_dir {{}}'" + "f'{AITER_META_DIR}/hsa/codegen.py -m fmoe_2stages --output_dir {{}}'", + "f'{AITER_META_DIR}/hsa/codegen.py -m fmoe --output_dir {{}}'" ] }, "module_moe_ck2stages": { diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index bf63634b68..e9f86a5cf9 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -23,6 +23,7 @@ def gen_gemm_a16w16_asm_fake_tensors( bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, + bpreshuffle: bool = False, ) -> Tensor: return out @@ -39,6 +40,7 @@ def gemm_a16w16_asm( bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, + bpreshuffle: bool = False, ) -> Tensor: ... diff --git a/csrc/include/asm_gemm_a16w16.h b/csrc/include/asm_gemm_a16w16.h index a5c9fd285a..c7788bb3ec 100644 --- a/csrc/include/asm_gemm_a16w16.h +++ b/csrc/include/asm_gemm_a16w16.h @@ -8,4 +8,5 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 torch::Tensor& out, // Out:[M, N] f32 std::optional bias, std::optional splitK, - std::optional kernelName); + std::optional kernelName, + bool bpreshuffle = false); diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index cb7c92f6c9..2e0bb4a842 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -396,7 +396,8 @@ namespace py = pybind11; py::arg("out"), \ py::arg("bias") = std::nullopt, \ py::arg("splitK") = std::nullopt, \ - py::arg("kernelName") = std::nullopt); + py::arg("kernelName") = std::nullopt, \ + py::arg("bpreshuffle") = false); #define GEMM_A4W4_ASM_PYBIND \ m.def("gemm_a4w4_asm", \ diff --git a/csrc/py_itfs_cu/asm_fmoe.cu b/csrc/py_itfs_cu/asm_fmoe.cu index a31552ba08..d22daf3058 100755 --- a/csrc/py_itfs_cu/asm_fmoe.cu +++ b/csrc/py_itfs_cu/asm_fmoe.cu @@ -260,7 +260,8 @@ FMoeKernel* get_heuristic_kernel( uint32_t tg_num = 0; uint32_t num_persistent_tgs = 0; uint32_t round = 0xffffffff; - std::string selectedKl = kernel_name; + std::string arch_id = get_gpu_arch(); + std::string selectedKl = kernel_name.empty() ? "" : arch_id + kernel_name; int vskip = 1; static std::unordered_map> impl_ptr_map; @@ -271,6 +272,8 @@ FMoeKernel* get_heuristic_kernel( { for(const auto& el : *cfgs) { + if (el.first.find(arch_id) != 0) + continue; const auto& cfg = el.second; if(cfg.vskip == vskip && cfg.smf == smf) { @@ -312,7 +315,7 @@ FMoeKernel* get_heuristic_kernel( if(it != cfgs->end()) { const auto& cfg = it->second; - const char* name = cfg.name.c_str(); + const char* name = cfg.knl_name.c_str(); const char* co_name = cfg.co_name.c_str(); auto result = impl_ptr_map.emplace(name, nullptr); if(cfg.ps == 1) diff --git a/csrc/py_itfs_cu/asm_gemm_a16w16.cu b/csrc/py_itfs_cu/asm_gemm_a16w16.cu index b48695d2d1..18e94aa4ec 100644 --- a/csrc/py_itfs_cu/asm_gemm_a16w16.cu +++ b/csrc/py_itfs_cu/asm_gemm_a16w16.cu @@ -47,7 +47,9 @@ struct __attribute__((packed)) KernelArgs unsigned int K; p3 _p16; unsigned int splitk; - p2 _p17; + p3 _p17; + unsigned int is_out_b16; + p3 _p18; }; std::tuple @@ -55,6 +57,8 @@ get_heuristic_kernel(int M, int N, int K, CFG* cfgs, + std::string arch_id, + bool bpreshuffle, std::optional splitk = std::nullopt, std::optional kernelName = std::nullopt) { @@ -75,10 +79,12 @@ get_heuristic_kernel(int M, for(const auto& el : *cfgs) { + if (el.first.find(arch_id) != 0) + continue; const auto& cfg = el.second; - if(kernelName.has_value() && kernelName.value() != el.first) + if(kernelName.has_value() && el.first != (arch_id + kernelName.value())) continue; - if(N % cfg.tileN == 0) + if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0)) { // 1. select splitK int split_K = 1; @@ -125,6 +131,7 @@ get_heuristic_kernel(int M, compute2mem_effi = local_compute2mem_effi; oob = (M % cfg.tileM == 0) ? 0 : cfg.tileM - (M % cfg.tileM); selectedKernelName = el.first; + // printf("Selected Kernel: %s\n", selectedKernelName.c_str()); selectedsplitK = split_K; } } @@ -139,11 +146,13 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 torch::Tensor& out, // Out:[M, N] f32 std::optional bias, std::optional splitK, - std::optional kernelName) + std::optional kernelName, + bool bpreshuffle = false) { - TORCH_CHECK(out.dtype() == torch::ScalarType::Float, - "GEMM A16W16 asm only support Float32 output now!"); - + TORCH_CHECK(out.dtype() == torch::ScalarType::Float || out.dtype() == torch::ScalarType::BFloat16, + "GEMM A16W16 asm only support Float32 or Bf16 output now!"); + + std::string arch_id = get_gpu_arch(); // 1. prepare args int Mdim = A.size(0); int Ndim = B.size(0); @@ -167,10 +176,14 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 int strideA1 = 0; int strideB0 = 0; int strideB1 = 0; + int is_out_b16 = 0; // A row major, B col major, C row major strideA0 = strideA1 = Kdim * 2; // in bytes strideB0 = strideB1 = Kdim * 2; - strideC0 = strideC1 = strideD0 = strideD1 = Ndim * 4; // inbytes + const auto elem_bytes = out.element_size(); + strideC0 = strideC1 = strideD0 = strideD1 = Ndim * elem_bytes; // inbytes + if (out.dtype() == torch::ScalarType::BFloat16) + is_out_b16 = 1; szA += sz_A_pad; szB += sz_B_pad; @@ -191,6 +204,7 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 args.M = Mdim; args.N = Ndim; args.K = Kdim; + args.is_out_b16 = is_out_b16; // args.stride_D0 = 25; // args.stride_D1 = 80; @@ -200,18 +214,20 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 // 2. select kl static std::unordered_map> impl_ptr_map; AiterAsmKernel* impl_ptr = nullptr; - CFG* config_map = &cfg_bf16gemm_outf32; + CFG* config_map = &cfg_bf16gemm_fp32bf16; // 2.1 static dict - std::string selectedKernelName = kernelName.value_or(""); + std::string selectedKernelName = kernelName.has_value() ? arch_id + kernelName.value() : ""; int selectedksplit = splitK.value_or(0) ?: 1; - if(!kernelName.has_value() || kernelName == "") + if(!kernelName.has_value() || kernelName == "" || !splitK.has_value()) { auto it_sel = get_heuristic_kernel(Mdim, Ndim, Kdim, config_map, + arch_id, + bpreshuffle, splitK.has_value() ? splitK : std::nullopt, kernelName.has_value() ? kernelName : std::nullopt); selectedKernelName = std::get<0>(it_sel); @@ -237,13 +253,14 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 // printf("N: %u\n", args.N); // printf("K: %u\n", args.K); // printf("splitk: %u\n", args.splitk); + // printf("is_out_b16: %u\n", args.is_out_b16); // printf("=======================================\n"); auto it_kl = config_map->find(selectedKernelName); if(it_kl != config_map->end()) { const auto& cfg = it_kl->second; - const char* name = cfg.name.c_str(); + const char* name = cfg.knl_name.c_str(); const char* co_name = cfg.co_name.c_str(); SUBM = cfg.tileM; SUBN = cfg.tileN; diff --git a/csrc/py_itfs_cu/asm_gemm_a4w4.cu b/csrc/py_itfs_cu/asm_gemm_a4w4.cu index 5c5e279658..53fda03a93 100644 --- a/csrc/py_itfs_cu/asm_gemm_a4w4.cu +++ b/csrc/py_itfs_cu/asm_gemm_a4w4.cu @@ -65,8 +65,7 @@ static CFG* get_cfg(torch::Tensor& inp, torch::Tensor& out) { #if defined(__Float4_e2m1fn_x2) - if(inp.dtype() == torch_fp4x2 && - out.scalar_type() == at::ScalarType::BFloat16) + if(inp.dtype() == torch_fp4x2 && out.scalar_type() == at::ScalarType::BFloat16) #else if((inp.dtype() == torch::kUInt8) && out.scalar_type() == at::ScalarType::BFloat16) #endif @@ -87,6 +86,7 @@ static CFG* get_cfg(torch::Tensor& inp, torch::Tensor& out) std::tuple get_heuristic_kernel(int M, int N, int K, + std::string arch_id, std::optional log2_k_split, std::optional bpreshuffle, CFG* cfgs) @@ -107,6 +107,8 @@ std::tuple get_heuristic_kernel(int M, for(const auto& el : *cfgs) { + if(el.first.find(arch_id) != 0) + continue; const auto& cfg = el.second; if(cfg.bpreshuffle == bpreshuffle_en && ((cfg.splitK == log2_k_split_en) || !log2_k_split.has_value())) @@ -197,8 +199,8 @@ torch::Tensor gemm_a4w4_asm(torch::Tensor& A, // A:[M, K/2] f4x2 const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(A)); const hipStream_t stream = at::hip::getCurrentHIPStream(); - CFG* config_map = get_cfg(A, out); - using DictKey = std::tuple, std::optional>; + CFG* config_map = get_cfg(A, out); + using DictKey = std::tuple, std::optional>; struct SimpleHash { size_t operator()(const DictKey& key) const @@ -220,6 +222,9 @@ torch::Tensor gemm_a4w4_asm(torch::Tensor& A, // A:[M, K/2] f4x2 static std::unordered_map> impl_ptr_map; + std::string arch_id = get_gpu_arch(); + kernelName = kernelName.empty() ? "" : arch_id + kernelName; + int selectedksplit = log2_k_split.has_value() ? log2_k_split.value() : 0; if(kernelName.empty()) { @@ -232,7 +237,8 @@ torch::Tensor gemm_a4w4_asm(torch::Tensor& A, // A:[M, K/2] f4x2 } else { - auto it = get_heuristic_kernel(Mdim, Ndim, Kdim, log2_k_split, bpreshuffle, config_map); + auto it = get_heuristic_kernel( + Mdim, Ndim, Kdim, arch_id, log2_k_split, bpreshuffle, config_map); kernelName = std::get<0>(it); selectedksplit = std::get<1>(it); @@ -250,7 +256,7 @@ torch::Tensor gemm_a4w4_asm(torch::Tensor& A, // A:[M, K/2] f4x2 if(it != config_map->end()) { const auto& cfg = it->second; - const char* name = cfg.name.c_str(); + const char* name = cfg.knl_name.c_str(); const char* co_name = cfg.co_name.c_str(); SUBM = cfg.tile_M; SUBN = cfg.tile_N; @@ -260,7 +266,8 @@ torch::Tensor gemm_a4w4_asm(torch::Tensor& A, // A:[M, K/2] f4x2 args.log2_k_split = selectedksplit; int k_num = 1 << args.log2_k_split; TORCH_CHECK(Kdim % k_num == 0, __func__, " Kdim % (1 << args.log2_k_split) != 0 !"); - if(k_num>1)out.zero_(); + if(k_num > 1) + out.zero_(); int k_per_tg = Kdim / k_num; k_per_tg = ((k_per_tg + 256 - 1) / 256) * 256; gdz = (Kdim + k_per_tg - 1) / k_per_tg; diff --git a/csrc/py_itfs_cu/asm_gemm_a8w8.cu b/csrc/py_itfs_cu/asm_gemm_a8w8.cu index 04fa69488c..20520a034f 100644 --- a/csrc/py_itfs_cu/asm_gemm_a8w8.cu +++ b/csrc/py_itfs_cu/asm_gemm_a8w8.cu @@ -60,7 +60,7 @@ static CFG* get_cfg(torch::Tensor& inp, torch::Tensor& out) }; std::tuple get_heuristic_kernel( - int M, int N, int K, std::optional k_split, std::optional bpreshuffle, CFG* cfgs) + int M, int N, int K, std::string arch_id, std::optional k_split, std::optional bpreshuffle, CFG* cfgs) { k_split = k_split.value_or(0) ?: 1; hipDevice_t dev; @@ -80,11 +80,13 @@ std::tuple get_heuristic_kernel( for(const auto& el : *cfgs) { + if(el.first.find(arch_id) != 0) + continue; const auto& cfg = el.second; if(cfg.bpreshuffle == bpreshuffle_en && ((cfg.splitK == k_split_en) || !k_split.has_value())) { - if((N % cfg.tile_N) == 0) + if((N % cfg.tile_n) == 0) { std::vector splitK_list = (k_split.has_value() && cfg.splitK) @@ -96,13 +98,13 @@ std::tuple get_heuristic_kernel( for(auto& splitK : splitK_list) { - int tg_num_M = (M + cfg.tile_M - 1) / cfg.tile_M; - int tg_num_N = (N + cfg.tile_N - 1) / cfg.tile_N; + int tg_num_M = (M + cfg.tile_m - 1) / cfg.tile_m; + int tg_num_N = (N + cfg.tile_n - 1) / cfg.tile_n; tg_num = tg_num_M * tg_num_N * splitK; uint32_t local_round = (tg_num + num_cu - 1) / num_cu; float local_compute2mem_effi = - cfg.tile_M * cfg.tile_N / (cfg.tile_M + cfg.tile_N); + cfg.tile_m * cfg.tile_n / (cfg.tile_m + cfg.tile_n); bool is_earlier_round = (local_round < round); bool is_same_round = (local_round == round); @@ -190,7 +192,8 @@ torch::Tensor gemm_a8w8_asm(torch::Tensor& A, // A:[M, K] i8 TORCH_CHECK(false, __func__, " no kernel support a8w8 for this gpu arch"); } static std::unordered_map> impl_ptr_map; - + std::string arch_id = get_gpu_arch(); + kernelName = kernelName.empty() ? "" : arch_id + kernelName; int selectedksplit = splitK.value_or(0) ?: 1; if(kernelName.empty()) { @@ -203,7 +206,7 @@ torch::Tensor gemm_a8w8_asm(torch::Tensor& A, // A:[M, K] i8 } else { - auto it = get_heuristic_kernel(Mdim, Ndim, Kdim, splitK, bpreshuffle, config_map); + auto it = get_heuristic_kernel(Mdim, Ndim, Kdim, arch_id, splitK, bpreshuffle, config_map); kernelName = std::get<0>(it); selectedksplit = std::get<1>(it); @@ -223,10 +226,10 @@ torch::Tensor gemm_a8w8_asm(torch::Tensor& A, // A:[M, K] i8 if(it != config_map->end()) { const auto& cfg = it->second; - const char* name = cfg.name.c_str(); + const char* name = cfg.knl_name.c_str(); const char* co_name = cfg.co_name.c_str(); - SUBM = cfg.tile_M; - SUBN = cfg.tile_N; + SUBM = cfg.tile_m; + SUBN = cfg.tile_n; gdx = (Ndim / SUBN) * blockSizeX; gdy = (Mdim % SUBM == 0) ? Mdim / SUBM : Mdim / SUBM + 1; gdz = 1; diff --git a/csrc/py_itfs_cu/asm_moe_2stage.cu b/csrc/py_itfs_cu/asm_moe_2stage.cu index aa2e54d945..7111df37b7 100644 --- a/csrc/py_itfs_cu/asm_moe_2stage.cu +++ b/csrc/py_itfs_cu/asm_moe_2stage.cu @@ -7,7 +7,7 @@ #include #include "aiter_hip_common.h" #include "moe_op.h" -#include "asm_moe_2stage_configs.hpp" +#include "asm_fmoe_2stages_configs.hpp" #include "py_itfs_common.h" struct __attribute__((packed)) KernelArgs @@ -104,7 +104,7 @@ static CFG *get_cfg(torch::Tensor &inp, torch::Tensor &out, torch::Tensor &w1, Q } }; -std::string get_heuristic_kernel(int m_num, int N, int blockk_size, CFG *cfgs) +std::string get_heuristic_kernel(int m_num, int N, int blockk_size, CFG *cfgs, std::string arch_id) { hipDevice_t dev; hipDeviceProp_t dev_prop; @@ -118,13 +118,15 @@ std::string get_heuristic_kernel(int m_num, int N, int blockk_size, CFG *cfgs) for (const auto &el : *cfgs) { + if (el.first.find(arch_id) != 0) + continue; const auto &cfg = el.second; - if (cfg.tile_M != blockk_size || N % cfg.tile_N != 0) + if (cfg.tile_m != blockk_size || N % cfg.tile_n != 0) { continue; } - tg_num = (N + cfg.tile_N - 1) / cfg.tile_N * m_num; + tg_num = (N + cfg.tile_n - 1) / cfg.tile_n * m_num; uint32_t local_round = (tg_num + num_cu - 1) / num_cu; if (local_round < round) { @@ -171,9 +173,11 @@ void moe_stage1_g1u1( int model_dim = input.size(1); int hidden_dim = inter_dim; int sub_X_cnt = sorted_expert_ids.size(0); + std::string arch_id = get_gpu_arch(); + kernelName = !kernelName.empty() ? arch_id + kernelName : ""; if (kernelName.empty()) { - kernelName = get_heuristic_kernel(sub_X_cnt, inter_dim, block_m, config_map); + kernelName = get_heuristic_kernel(sub_X_cnt, inter_dim, block_m, config_map, arch_id); } AiterAsmKernel *impl_ptr = nullptr; @@ -181,10 +185,10 @@ void moe_stage1_g1u1( if (it != config_map->end()) { const auto &cfg = it->second; - const char *name = cfg.name.c_str(); + const char *name = cfg.knl_name.c_str(); const char *co_name = cfg.co_name.c_str(); - TORCH_CHECK(inter_dim % cfg.tile_N == 0, "ASM kernel " + std::string(name) + " is not supported for inter_dim = " + std::to_string(inter_dim)); + TORCH_CHECK(inter_dim % cfg.tile_n == 0, "ASM kernel " + std::string(name) + " is not supported for inter_dim = " + std::to_string(inter_dim)); auto result = impl_ptr_map.emplace(name, nullptr); if (result.second) @@ -204,8 +208,8 @@ void moe_stage1_g1u1( int dim = w2.size(1); int eprt = w1.size(0); const auto &cfg = it->second; - uint32_t sub_GU = cfg.tile_N; - TORCH_CHECK(block_m == cfg.tile_M, __func__, " kernel: ", cfg.name, " need block_m == ", cfg.tile_M); + uint32_t sub_GU = cfg.tile_n; + TORCH_CHECK(block_m == cfg.tile_m, __func__, " kernel: ", cfg.knl_name, " need block_m == ", cfg.tile_m); int stride_X = input.stride(0) * input.element_size(); int stride_GU = dim * w1.element_size(); diff --git a/csrc/py_itfs_cu/asm_pa.cu b/csrc/py_itfs_cu/asm_pa.cu index 9cee784308..73347aa5a5 100644 --- a/csrc/py_itfs_cu/asm_pa.cu +++ b/csrc/py_itfs_cu/asm_pa.cu @@ -52,6 +52,7 @@ std::string get_heuristic_kernel(std::string q_type, int msk, int hp, int block_size, + std::string arch_id, CFG* cfgs) { const std::vector mtp_flags = (mtp > 0) ? std::vector{mtp, 1} : std::vector{0}; @@ -63,11 +64,13 @@ std::string get_heuristic_kernel(std::string q_type, // find exact match for(const auto& el : *cfgs) { + if (el.first.find(arch_id) != 0) + continue; const auto& cfg = el.second; // hp is just distinct from uhp - if(cfg.q_type == q_type && cfg.kv_type == kv_type && cfg.gqa == gqa_ && - cfg.mtp == mtp_ && cfg.msk == msk && (cfg.hp == hp || hp == 1) && - cfg.block_size == block_size) + if(cfg.qType == q_type && cfg.kvType == kv_type && cfg.Gqa == gqa_ && + cfg.Mtp == mtp_ && cfg.Msk == msk && (cfg.Hp == hp || hp == 1) && + cfg.blkSz == block_size) return el.first; } @@ -111,6 +114,7 @@ torch::Tensor pa_fwd(torch::Tensor& Q, // [num_seqs, num_heads, head_size] { torch::Tensor output = out_.value_or(torch::empty_like(Q)); int batch = context_lens.size(0); + std::string arch_id = get_gpu_arch(); // int block_tables_stride0 = block_tables.size(1); int num_heads = Q.size(1); int head_size = Q.size(2); @@ -209,9 +213,9 @@ torch::Tensor pa_fwd(torch::Tensor& Q, // [num_seqs, num_heads, head_size] CFG* config_map = &cfg_pa_asm; // only one config csv in hsa//pa, now static std::unordered_map> impl_ptr_map; - - std::string kernelName = kernelName_.value_or( - get_heuristic_kernel(q_type, kv_type, gqa_ratio, mtp, msk, hp, block_size, config_map)); + std::string kernelName = kernelName_.has_value() ? arch_id + kernelName_.value() : ""; + if (kernelName.empty()) + kernelName = get_heuristic_kernel(q_type, kv_type, gqa_ratio, mtp, msk, hp, block_size, arch_id, config_map); if(kernelName.empty()) { TORCH_CHECK(false, __func__, "not supported this kernel now! "); @@ -223,7 +227,7 @@ torch::Tensor pa_fwd(torch::Tensor& Q, // [num_seqs, num_heads, head_size] if(it != config_map->end()) { const auto& cfg = it->second; - const char* name = cfg.name.c_str(); + const char* name = cfg.knl_name.c_str(); const char* co_name = cfg.co_name.c_str(); auto result = impl_ptr_map.emplace(name, nullptr); if(result.second) diff --git a/csrc/rocm_ops.cpp b/csrc/rocm_ops.cpp index ab2c0ce3f5..9b21b5b2cd 100644 --- a/csrc/rocm_ops.cpp +++ b/csrc/rocm_ops.cpp @@ -23,6 +23,7 @@ #include "gemm_a8w8.h" #include "gemm_a8w8_blockscale.h" #include "gemm_a8w8_bpreshuffle.h" +#include "gemm_a8w8_blockscale_bpreshuffle.h" #include "gemm_a8w8_bpreshuffle_cktile.h" #include "gemm_common.h" #include "hipbsolgemm.cuh" @@ -97,6 +98,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // GEMM_A8W8_BLOCKSCALE_TUNE_PYBIND; GEMM_A4W4_BLOCKSCALE_PYBIND; GEMM_A8W8_BLOCKSCALE_PYBIND; + GEMM_A8W8_BLOCKSCALE_BPRESHUFFLE_PYBIND; AITER_OPERATOR_PYBIND; AITER_UNARY_PYBIND; CUSTOM_ALL_REDUCE_PYBIND; diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py index f14f1db5d7..44ee280019 100644 --- a/gradlib/gradlib/GemmTuner.py +++ b/gradlib/gradlib/GemmTuner.py @@ -53,9 +53,17 @@ def call_hipb_mm(input, weight, bias, scale_a, scale_b, solidx, out_dtype): ) -def run_gemm_bf16_asm(inp, w, out, bias=None, splitK=None, kernelName=None): +def run_gemm_bf16_asm( + inp, w, out, bias=None, splitK=None, kernelName=None, bpreshuffle=False +): return aiter.gemm_a16w16_asm( - inp, w, out, bias=bias, splitK=splitK, kernelName=kernelName + inp, + w, + out, + bias=bias, + splitK=splitK, + kernelName=kernelName, + bpreshuffle=bpreshuffle, ) @@ -227,6 +235,8 @@ def get_asm_kernels(self, file): print(f"ASM kernel list file not exist: {file}") return {} df = pd.read_csv(file) + if "bPreshuffle" in df.columns: + df = df[df["bPreshuffle"] != 1] kernel_dict = ( df.groupby(["tileM", "tileN", "pf"])["knl_name"].apply(list).to_dict() ) diff --git a/hsa/codegen.py b/hsa/codegen.py new file mode 100644 index 0000000000..1b9a890d90 --- /dev/null +++ b/hsa/codegen.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: MIT +# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. + +import os +import sys +import argparse +import glob +import pandas as pd +import numpy as np +from collections import defaultdict + +this_dir = os.path.dirname(os.path.abspath(__file__)) +base_dir = os.path.basename(this_dir) +archs = [ + os.path.basename(os.path.normpath(path)) + for path in os.environ.get("AITER_ASM_DIR").split(":") +] + + +content = """// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +#pragma once +#include + +""" + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="generate", + description="gen API for asm Bf16_gemm kernel", + ) + parser.add_argument( + "-m", + "--module", + required=True, + help="""module of ASM kernel, + e.g.: -m bf16gemm +""", + ) + parser.add_argument( + "-o", + "--output_dir", + default="aiter/jit/build", + required=False, + help="write all the blobs into a directory", + ) + args = parser.parse_args() + cfgs = [] + + csv_groups = defaultdict(list) + for arch in archs: + # print(f"{this_dir}/{arch}/{args.module}") + for el in glob.glob( + f"{this_dir}/{arch}/{args.module}/**/*.csv", recursive=True + ): + df = pd.read_csv(el) + cfgname = os.path.basename(el).split(".")[0] + csv_groups[cfgname].append({"file_path": el, "arch": arch}) + + ## deal with same name csv + cfgs = [] + have_get_header = False + for cfgname, file_info_list in csv_groups.items(): + dfs = [] + for file_info in file_info_list: + single_file = file_info["file_path"] + arch = file_info["arch"] + df = pd.read_csv(single_file) + # check headers + headers_list = df.columns.tolist() + required_columns = {"knl_name", "co_name"} + if not required_columns.issubset(headers_list): + missing = required_columns - set(headers_list) + print( + f"ERROR: Invalid assembly CSV format -- {single_file}. Missing required columns: {', '.join(missing)}" + ) + sys.exit(1) + df["arch"] = arch # add arch into df + dfs.append(df) + if dfs: + relpath = os.path.relpath( + os.path.dirname(single_file), f"{this_dir}/{arch}" + ) + combine_df = pd.concat(dfs, ignore_index=True).fillna(0) + if not have_get_header: + headers_list = combine_df.columns.tolist() + required_columns = {"knl_name", "co_name", "arch"} + other_columns = [ + col for col in headers_list if col not in required_columns + ] + other_columns_comma = ", ".join(other_columns) + sample_row = combine_df.iloc[0] + other_columns_cpp_def = "\n".join( + [ + f" {'int' if isinstance(sample_row[col], (int, float, np.integer)) else 'std::string'} {col};" + for col in other_columns + ] + ) + content += f""" +#define ADD_CFG({other_columns_comma}, arch, path, knl_name, co_name) \\ + {{ \\ + arch knl_name, {{ knl_name, path co_name, arch, {other_columns_comma} }} \\ + }} + +struct {args.module}Config +{{ + std::string knl_name; + std::string co_name; + std::string arch; +{other_columns_cpp_def} +}}; + +using CFG = std::unordered_map; + +""" + have_get_header = True + cfg = [ + f'ADD_CFG({", ".join(f"\"{getattr(row, col)}\"" if not str(getattr(row, col)).isdigit() else f"{getattr(row, col):>4}" for col in other_columns)}, ' + f'"{row.arch}", "{relpath}/", "{row.knl_name}", "{row.co_name}"),' + for row in combine_df.itertuples(index=False) + ] + cfg_txt = "\n ".join(cfg) + "\n" + + txt = f"""static CFG cfg_{cfgname} = {{ + {cfg_txt}}};""" + cfgs.append(txt) + + content += "\n".join(cfgs) + "\n" + + with open(f"{args.output_dir}/asm_{args.module}_configs.hpp", "w") as f: + f.write(content) diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv new file mode 100755 index 0000000000..69e2f746b1 --- /dev/null +++ b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv @@ -0,0 +1,10 @@ +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle +_ZN5aiter36bf16gemm_fp32bf16_tn_128x64_bshuffleE,bf16gemm_fp32bf16_tn_128x64_bshuffle.co,1,128,64,0,1 +_ZN5aiter36bf16gemm_fp32bf16_tn_160x64_bshuffleE,bf16gemm_fp32bf16_tn_160x64_bshuffle.co,1,160,64,0,1 +_ZN5aiter30bf16gemm_fp32bf16_tn_32x64_pf3E,bf16gemm_fp32bf16_tn_32x64_pf3.co,1,32,64,3,0 +_ZN5aiter35bf16gemm_fp32bf16_tn_48x64_bshuffleE,bf16gemm_fp32bf16_tn_48x64_bshuffle.co,1,48,64,0,1 +_ZN5aiter30bf16gemm_fp32bf16_tn_48x64_pf3E,bf16gemm_fp32bf16_tn_48x64_pf3.co,1,48,64,3,0 +_ZN5aiter35bf16gemm_fp32bf16_tn_64x64_bshuffleE,bf16gemm_fp32bf16_tn_64x64_bshuffle.co,1,64,64,0,1 +_ZN5aiter30bf16gemm_fp32bf16_tn_64x64_pf3E,bf16gemm_fp32bf16_tn_64x64_pf3.co,1,64,64,3,0 +_ZN5aiter35bf16gemm_fp32bf16_tn_96x64_bshuffleE,bf16gemm_fp32bf16_tn_96x64_bshuffle.co,1,96,64,0,1 +_ZN5aiter30bf16gemm_fp32bf16_tn_96x64_pf3E,bf16gemm_fp32bf16_tn_96x64_pf3.co,1,96,64,3,0 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle.co new file mode 100755 index 0000000000..14157a9b16 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle.co new file mode 100755 index 0000000000..548cd2befd Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle.co new file mode 100755 index 0000000000..5923b16e1e Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_pf3.co new file mode 100755 index 0000000000..a9724fca02 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_pf3.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle.co new file mode 100755 index 0000000000..bf63148a89 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_pf3.co new file mode 100755 index 0000000000..5a9acc9974 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_pf3.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle.co new file mode 100755 index 0000000000..fb07f018c0 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_pf3.co new file mode 100755 index 0000000000..0c90056a10 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_pf3.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle.co new file mode 100755 index 0000000000..a2dac5ddc3 Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_pf3.co new file mode 100755 index 0000000000..1b619b5bab Binary files /dev/null and b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_pf3.co differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_outf32.csv b/hsa/gfx942/bf16gemm/bf16gemm_outf32.csv deleted file mode 100755 index 714fdce694..0000000000 --- a/hsa/gfx942/bf16gemm/bf16gemm_outf32.csv +++ /dev/null @@ -1,5 +0,0 @@ -knl_name,knl_file,tn,tileM,tileN,pf -_ZN5aiter28bf16gemm_outf32_tn_32x64_pf3E,bf16gemm_outf32_tn_32x64_pf3.co,1,32,64,3 -_ZN5aiter28bf16gemm_outf32_tn_48x64_pf3E,bf16gemm_outf32_tn_48x64_pf3.co,1,48,64,3 -_ZN5aiter28bf16gemm_outf32_tn_64x64_pf3E,bf16gemm_outf32_tn_64x64_pf3.co,1,64,64,3 -_ZN5aiter28bf16gemm_outf32_tn_96x64_pf3E,bf16gemm_outf32_tn_96x64_pf3.co,1,96,64,3 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_32x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_32x64_pf3.co deleted file mode 100755 index 273e574e41..0000000000 Binary files a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_32x64_pf3.co and /dev/null differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_48x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_48x64_pf3.co deleted file mode 100755 index cedc1e2455..0000000000 Binary files a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_48x64_pf3.co and /dev/null differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_64x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_64x64_pf3.co deleted file mode 100755 index ee41f2e493..0000000000 Binary files a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_64x64_pf3.co and /dev/null differ diff --git a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_96x64_pf3.co b/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_96x64_pf3.co deleted file mode 100755 index 539490ca4d..0000000000 Binary files a/hsa/gfx942/bf16gemm/bf16gemm_outf32_tn_96x64_pf3.co and /dev/null differ diff --git a/hsa/gfx942/bf16gemm/codegen.py b/hsa/gfx942/bf16gemm/codegen.py deleted file mode 100755 index b915bac6b3..0000000000 --- a/hsa/gfx942/bf16gemm/codegen.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) -base_dir = os.path.basename(this_dir) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(tn, tileM, tileN, pf, path, name, co) \\ - { \\ - name, { name, path co, tn, tileM, tileN, pf } \\ - } - -struct BF16GemmConfig -{ - std::string name; - std::string co_name; - int tn; - int tileM; - int tileN; - int pf; -}; - -using CFG = std::unordered_map; - -""" - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for asm Bf16_gemm kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - cfgs = [] - for el in glob.glob(f"{this_dir}/**/*.csv", recursive=True): - - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tn:>4}, {tileM:>4},{tileN:>4}, {pf:>4}, "{base_dir}/{os.path.dirname(os.path.relpath(el, this_dir))}/", "{Name}", "{Co}"),' - for Name, Co, tn, tileM, tileN, pf in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_bf16gemm_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx942/f4gemm/codegen.py b/hsa/gfx942/f4gemm/codegen.py deleted file mode 100644 index 6421c4451c..0000000000 --- a/hsa/gfx942/f4gemm/codegen.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(tile_m, tile_n, splitK, bpreshuffle, path, name, co) \\ - { \\ - name, { name, path co, tile_m, tile_n, splitK, bpreshuffle} \\ - } - -struct F4GemmConfig -{ - std::string name; - std::string co_name; - int tile_M; - int tile_N; - int splitK; - int bpreshuffle; -}; - -using CFG = std::unordered_map; - -""" -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for CK fmha kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tileM:>4}, {tileN:>4}, {splitK:>4}, {bpreshuffle:>4}, "f4gemm/", "{name}", "{co}"),' - for tileM, tileN, splitK, bpreshuffle, name, co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_f4gemm_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx942/f4gemm/f4gemm_bf16_per1x32Fp4.csv b/hsa/gfx942/f4gemm/f4gemm_bf16_per1x32Fp4.csv index 420eb022a1..d77ba460d9 100644 --- a/hsa/gfx942/f4gemm/f4gemm_bf16_per1x32Fp4.csv +++ b/hsa/gfx942/f4gemm/f4gemm_bf16_per1x32Fp4.csv @@ -1 +1 @@ -tile_m,tile_n,splitK,bpreshuffle,knl_name,co_name \ No newline at end of file +tile_M,tile_N,splitK,bpreshuffle,knl_name,co_name \ No newline at end of file diff --git a/hsa/gfx942/fmoe/codegen.py b/hsa/gfx942/fmoe/codegen.py deleted file mode 100644 index c0732f97e8..0000000000 --- a/hsa/gfx942/fmoe/codegen.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) -base_dir = os.path.basename(this_dir) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(atm, vskip, smf, tg_num_perCU, ps, subGU_m, subGU_n, path, name, co) \\ - { \\ - name, { name, path co, atm, vskip, smf, tg_num_perCU, ps, subGU_m, subGU_n } \\ - } - -struct AsmFmoeConfig -{ - std::string name; - std::string co_name; - int atm; - int vskip; - int smf; - int tg_num_perCU; - int ps; - int subGU_m; - int subGU_n; -}; - -using CFG = std::unordered_map; - -""" - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for asm Fused_moe kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/**/*.csv", recursive=True): - - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({atm}, {vskip},{smf:>4}, {tg_num_perCU:>4}, {ps:>4},{subGU_m:>4}, {subGU_n:>4}, "{base_dir}/{os.path.dirname(os.path.relpath(el, this_dir))}/", "{Name}", "{Co}"),' - for Name, Co, atm, vskip, smf, tg_num_perCU, ps, subGU_m, subGU_n in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_fmoe_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv index acc2fbd586..f25b1fa86f 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co,0,0,0,1,0,32,256 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv index 346185b83f..1cb9f73b18 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter54fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co,2,0,0,1,0,32,512 _ZN5aiter57fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co,2,0,0,1,1,32,512 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv index 0050418f71..7a8a58ec25 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter55fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co,0,1,2,1,1,32,448 _ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128E,fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384.co,0,1,0,1,0,32,384 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv index 31d3fd0642..fec8c165ca 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x128E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x256E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256.co,0,1,0,1,0,32,256 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv index ce1e704453..d9c73f3015 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448.co,0,1,0,1,1,32,448 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512E,fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512.co,0,1,0,1,1,32,512 _ZN5aiter46fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x320E,fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x320.co,0,1,0,1,0,32,320 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv index 8cf66d2c51..abd2a9f4fa 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter46fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x256E,fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x192E,fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x192.co,0,1,0,1,1,32,192 _ZN5aiter53fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x448E,fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x448.co,0,1,2,1,0,32,448 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv index 2ed85f504a..de93d2c889 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448.co,0,0,0,1,0,32,448 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128.co,0,0,0,1,0,32,128 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co,0,0,0,1,0,32,384 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv index 9f12215838..fdbd422869 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter47fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter52fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co,0,0,0,1,1,32,256 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv index 49ad79b97e..7f83e9c340 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter57fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512E,fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co,2,0,0,1,1,32,512 _ZN5aiter54fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_gelu_1tg_32x512E,fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co,2,0,0,1,0,32,512 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv index c18c77c61f..177a828e4b 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter48fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x384E,fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x384.co,0,1,0,1,1,32,384 _ZN5aiter55fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x192E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x192.co,0,1,2,1,1,32,192 _ZN5aiter52fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x384E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x384.co,0,1,2,1,0,32,384 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv index 6bd4342697..36ef0cf602 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320.co,0,1,0,1,1,32,320 _ZN5aiter50fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x512E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x512.co,0,1,0,1,1,32,512 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv index 433f34af1a..5122d5d7f7 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter46fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x384E,fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x384.co,0,1,0,1,0,32,384 _ZN5aiter49fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512E,fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x512.co,0,1,0,1,1,32,512 _ZN5aiter50fmoe_fp16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512E,fmoe_fp16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512.co,0,1,1,1,0,32,512 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv index 980b34732c..be126c8f3b 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter53fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x128E,fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x128.co,0,1,2,1,0,32,128 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x512E,fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x512.co,0,1,0,1,0,32,512 _ZN5aiter56fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x192E,fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x192.co,0,1,2,1,1,32,192 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv index 4699b0b80c..fcc4c46c70 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter51fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x128E,fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x128.co,0,0,0,1,1,32,128 _ZN5aiter51fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192E,fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192.co,0,0,0,1,1,32,192 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448E,fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x448.co,0,0,0,1,0,32,448 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv index c90cc46875..401f30905c 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co,0,0,0,1,1,32,256 _ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv index bb41af16f0..acfbbeec58 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter57fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co,2,0,0,1,1,32,512 _ZN5aiter54fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_32x512.co,2,0,0,1,0,32,512 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv index 6781f19371..f5ce8322be 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter52fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x384.co,0,1,2,1,0,32,384 _ZN5aiter55fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x384.co,0,1,2,1,1,32,384 _ZN5aiter55fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448.co,0,1,2,1,1,32,448 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv index 45f901746c..58e2579456 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512.co,0,1,0,1,0,32,512 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192.co,0,1,0,1,1,32,192 _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co,0,1,0,1,0,32,192 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv index cf713ec2fa..2c9beaad3c 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter46fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x256E,fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter50fmoe_bf16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512E,fmoe_bf16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512.co,0,1,1,1,0,32,512 _ZN5aiter54fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_32x512E,fmoe_bf16_pertokenInt8_g1u0_vs_atm_opt_silu_1tg_32x512.co,1,1,0,1,0,32,512 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv index e95150b5c4..539a4de39e 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x384E,fmoe_bf16_pertokenInt8_g1u1_vs_smf_silu_1tg_32x384.co,0,1,1,1,0,32,384 _ZN5aiter53fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x384E,fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x384.co,0,1,2,1,0,32,384 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x512E,fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x512.co,0,1,0,1,1,32,512 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv index a638e9010a..9b01c5e4ee 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co,0,0,0,1,0,32,448 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x320E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x320.co,0,0,0,1,0,32,320 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128.co,0,0,0,1,0,32,128 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv index 6f3751de71..25cac50a65 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co,0,0,0,1,0,32,256 _ZN5aiter50fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter52fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co,0,0,0,1,1,32,256 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv index db122e5031..fb3a0b2406 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter54fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_silu_1tg_32x512E,fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_silu_1tg_32x512.co,2,0,0,1,0,32,512 _ZN5aiter57fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512E,fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co,2,0,0,1,1,32,512 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv index b391ea19a1..d77575fff4 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter52fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x512E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x512.co,0,1,2,1,0,32,512 _ZN5aiter55fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x448.co,0,1,2,1,1,32,448 _ZN5aiter52fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x320E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_32x320.co,0,1,2,1,0,32,320 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv index be05bcb5f8..639f36cbae 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x320E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x320.co,0,1,0,1,0,32,320 _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co,0,1,0,1,1,32,384 _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x320E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x320.co,0,1,0,1,1,32,320 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv index 65e60d80a0..93b68397e6 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512E,fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co,0,1,0,1,1,32,512 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x128E,fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter50fmoe_fp16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512E,fmoe_fp16_pertokenInt8_g1u0_vs_smf_silu_1tg_32x512.co,0,1,1,1,0,32,512 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv index 1f1737dea8..4274afdc37 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter56fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x192E,fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x192.co,0,1,2,1,1,32,192 _ZN5aiter49fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x256E,fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter53fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x320E,fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x320.co,0,1,2,1,0,32,320 diff --git a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv index cca539ee62..ddc4561bce 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter51fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512E,fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x512.co,0,0,0,1,1,32,512 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192E,fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x192.co,0,0,0,1,0,32,192 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448E,fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co,0,0,0,1,0,32,448 diff --git a/hsa/gfx942/fmoe_2stages/codegen.py b/hsa/gfx942/fmoe_2stages/codegen.py deleted file mode 100644 index 0693c455bf..0000000000 --- a/hsa/gfx942/fmoe_2stages/codegen.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(M, N, path, name, co) \\ - { \\ - name, { name, path co, M, N } \\ - } - -struct FMoe2StageConfig -{ - std::string name; - std::string co_name; - int tile_M; - int tile_N; -}; - -using CFG = std::unordered_map; - -""" -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for CK fmha kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tileM:>4}, {tileN:>4}, "fmoe_2stages/", "{name}", "{co}"),' - for tileM, tileN, name, co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_moe_2stage_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx942/i8gemm/codegen.py b/hsa/gfx942/i8gemm/codegen.py deleted file mode 100755 index 4edb356d77..0000000000 --- a/hsa/gfx942/i8gemm/codegen.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(tile_m, tile_n, splitK, bpreshuffle, path, name, co) \\ - { \\ - name, { name, path co, tile_m, tile_n, splitK, bpreshuffle} \\ - } - -struct I8GemmConfig -{ - std::string name; - std::string co_name; - int tile_M; - int tile_N; - int splitK; - int bpreshuffle; -}; - -using CFG = std::unordered_map; - -""" -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="generate configuration API for i8gemm asm kernels", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tileM:>4}, {tileN:>4}, {splitK:>4}, {bpreshuffle:>4}, "i8gemm/", "{name}", "{co}"),' - for tileM, tileN, splitK, bpreshuffle, name, co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_i8gemm_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx942/pa/codegen.py b/hsa/gfx942/pa/codegen.py deleted file mode 100644 index 7c85dcd0a3..0000000000 --- a/hsa/gfx942/pa/codegen.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(q_type, kv_type, gqa, mtp, msk, hp, block_size, path, name, co) \\ - { \\ - name, { name, path co, q_type, kv_type, gqa, mtp, msk, hp, block_size } \\ - } - -struct AsmPaConfig -{ - std::string name; - std::string co_name; - std::string q_type; - std::string kv_type; - int gqa; - int mtp; - int msk; - int hp; - int block_size; -}; - -using CFG = std::unordered_map; - -""" - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for asm PA kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG("{qType}", "{kvType}",{Gqa:>4}, {Mtp:>2}, {Msk:>2}, {Hp:>2}, {blkSz:>2}, "pa/", "{Name}", "{Co}"),' - for qType, kvType, Gqa, Mtp, Msk, Hp, blkSz, Name, Co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_pa_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx942/pa/pa_asm.csv b/hsa/gfx942/pa/pa_asm.csv index 604e5bd9f0..a135e41628 100644 --- a/hsa/gfx942/pa/pa_asm.csv +++ b/hsa/gfx942/pa/pa_asm.csv @@ -1,4 +1,4 @@ -qType,kvType,Gqa,Mtp,Msk,Hp,blkSz,knl_name,knl_file +qType,kvType,Gqa,Mtp,Msk,Hp,blkSz,knl_name,co_name bf16,fp8,16,0,0,0,16,_ZN5aiter32pa_bf16_pertokenFp8_gqa16_2tg_4wE,pa_bf16_pertokenFp8_gqa16_2tg_4w.co bf16,int8,16,0,0,0,16,_ZN5aiter33pa_bf16_pertokenInt8_gqa16_2tg_4wE,pa_bf16_pertokenInt8_gqa16_2tg_4w.co bf16,bf16,16,0,0,0,16,_ZN5aiter28pa_bf16_noquant_gqa16_1tg_4wE,pa_bf16_noquant_gqa16_1tg_4w.co diff --git a/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256.co b/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256.co new file mode 100755 index 0000000000..3d03288b40 Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256_bpreshuffle.co b/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256_bpreshuffle.co new file mode 100755 index 0000000000..5d9330bdfd Binary files /dev/null and b/hsa/gfx950/bf16gemm/bf16gemm_bf16_tn_256x256_bpreshuffle.co differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv new file mode 100644 index 0000000000..03cd2f9787 --- /dev/null +++ b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv @@ -0,0 +1,3 @@ +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle +_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1 +_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_outf32.csv b/hsa/gfx950/bf16gemm/bf16gemm_outf32.csv deleted file mode 100755 index 714fdce694..0000000000 --- a/hsa/gfx950/bf16gemm/bf16gemm_outf32.csv +++ /dev/null @@ -1,5 +0,0 @@ -knl_name,knl_file,tn,tileM,tileN,pf -_ZN5aiter28bf16gemm_outf32_tn_32x64_pf3E,bf16gemm_outf32_tn_32x64_pf3.co,1,32,64,3 -_ZN5aiter28bf16gemm_outf32_tn_48x64_pf3E,bf16gemm_outf32_tn_48x64_pf3.co,1,48,64,3 -_ZN5aiter28bf16gemm_outf32_tn_64x64_pf3E,bf16gemm_outf32_tn_64x64_pf3.co,1,64,64,3 -_ZN5aiter28bf16gemm_outf32_tn_96x64_pf3E,bf16gemm_outf32_tn_96x64_pf3.co,1,96,64,3 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_32x64_pf3.co b/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_32x64_pf3.co deleted file mode 100755 index e8b66a682a..0000000000 Binary files a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_32x64_pf3.co and /dev/null differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_48x64_pf3.co b/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_48x64_pf3.co deleted file mode 100755 index e80adfdd5f..0000000000 Binary files a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_48x64_pf3.co and /dev/null differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_64x64_pf3.co b/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_64x64_pf3.co deleted file mode 100755 index 8920454b67..0000000000 Binary files a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_64x64_pf3.co and /dev/null differ diff --git a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_96x64_pf3.co b/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_96x64_pf3.co deleted file mode 100755 index 2d1be397f6..0000000000 Binary files a/hsa/gfx950/bf16gemm/bf16gemm_outf32_tn_96x64_pf3.co and /dev/null differ diff --git a/hsa/gfx950/bf16gemm/codegen.py b/hsa/gfx950/bf16gemm/codegen.py deleted file mode 100755 index b915bac6b3..0000000000 --- a/hsa/gfx950/bf16gemm/codegen.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) -base_dir = os.path.basename(this_dir) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(tn, tileM, tileN, pf, path, name, co) \\ - { \\ - name, { name, path co, tn, tileM, tileN, pf } \\ - } - -struct BF16GemmConfig -{ - std::string name; - std::string co_name; - int tn; - int tileM; - int tileN; - int pf; -}; - -using CFG = std::unordered_map; - -""" - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for asm Bf16_gemm kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - cfgs = [] - for el in glob.glob(f"{this_dir}/**/*.csv", recursive=True): - - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tn:>4}, {tileM:>4},{tileN:>4}, {pf:>4}, "{base_dir}/{os.path.dirname(os.path.relpath(el, this_dir))}/", "{Name}", "{Co}"),' - for Name, Co, tn, tileM, tileN, pf in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_bf16gemm_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx950/f4gemm/codegen.py b/hsa/gfx950/f4gemm/codegen.py deleted file mode 100644 index 6421c4451c..0000000000 --- a/hsa/gfx950/f4gemm/codegen.py +++ /dev/null @@ -1,64 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(tile_m, tile_n, splitK, bpreshuffle, path, name, co) \\ - { \\ - name, { name, path co, tile_m, tile_n, splitK, bpreshuffle} \\ - } - -struct F4GemmConfig -{ - std::string name; - std::string co_name; - int tile_M; - int tile_N; - int splitK; - int bpreshuffle; -}; - -using CFG = std::unordered_map; - -""" -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for CK fmha kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tileM:>4}, {tileN:>4}, {splitK:>4}, {bpreshuffle:>4}, "f4gemm/", "{name}", "{co}"),' - for tileM, tileN, splitK, bpreshuffle, name, co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_f4gemm_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx950/f4gemm/f4gemm_bf16_per1x32Fp4.csv b/hsa/gfx950/f4gemm/f4gemm_bf16_per1x32Fp4.csv index 2c2e30d646..b721cb7dcf 100644 --- a/hsa/gfx950/f4gemm/f4gemm_bf16_per1x32Fp4.csv +++ b/hsa/gfx950/f4gemm/f4gemm_bf16_per1x32Fp4.csv @@ -1,4 +1,4 @@ -tile_m,tile_n,splitK,bpreshuffle,knl_name,co_name +tile_M,tile_N,splitK,bpreshuffle,knl_name,co_name 256,256,0,0,_ZN5aiter44f4gemm_bf16_per1x32Fp4_noBpreShuffle_256x256E,f4gemm_bf16_per1x32Fp4_noBpreShuffle_256x256.co 256,256,1,1,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256E,f4gemm_bf16_per1x32Fp4_BpreShuffle_256x256.co 128,512,1,1,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E,f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512.co diff --git a/hsa/gfx950/fmoe/codegen.py b/hsa/gfx950/fmoe/codegen.py deleted file mode 100644 index c0732f97e8..0000000000 --- a/hsa/gfx950/fmoe/codegen.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) -base_dir = os.path.basename(this_dir) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(atm, vskip, smf, tg_num_perCU, ps, subGU_m, subGU_n, path, name, co) \\ - { \\ - name, { name, path co, atm, vskip, smf, tg_num_perCU, ps, subGU_m, subGU_n } \\ - } - -struct AsmFmoeConfig -{ - std::string name; - std::string co_name; - int atm; - int vskip; - int smf; - int tg_num_perCU; - int ps; - int subGU_m; - int subGU_n; -}; - -using CFG = std::unordered_map; - -""" - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for asm Fused_moe kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/**/*.csv", recursive=True): - - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({atm}, {vskip},{smf:>4}, {tg_num_perCU:>4}, {ps:>4},{subGU_m:>4}, {subGU_n:>4}, "{base_dir}/{os.path.dirname(os.path.relpath(el, this_dir))}/", "{Name}", "{Co}"),' - for Name, Co, atm, vskip, smf, tg_num_perCU, ps, subGU_m, subGU_n in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_fmoe_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv index 2e40ae0786..9e39eb9ae3 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co,0,0,0,1,0,32,256 _ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv index 7842820378..d068d3c4ed 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_noquant_g1u0_gelu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter57fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co,2,0,0,1,1,32,512 _ZN5aiter54fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co,2,0,0,1,0,32,512 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv index a09ac7ba2d..2502bcd1e7 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384.co,0,1,0,1,0,32,384 _ZN5aiter55fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co,0,1,2,1,1,32,448 _ZN5aiter52fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x320E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_gelu_1tg_32x320.co,0,1,2,1,0,32,320 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv index 1e62666e2c..5537a862c9 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x320E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x320.co,0,1,0,1,0,32,320 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x320.co,0,1,0,1,1,32,320 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384.co,0,1,0,1,1,32,384 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv index 7ed04ce9e6..c4ecd7c4c8 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u0_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter46fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x448E,fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_32x448.co,0,1,0,1,0,32,448 _ZN5aiter50fmoe_bf16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512E,fmoe_bf16_pertokenInt8_g1u0_vs_smf_gelu_1tg_32x512.co,0,1,1,1,0,32,512 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x448.co,0,1,0,1,1,32,448 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv index edebee35d1..b7ada621eb 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x448.co,0,1,0,1,1,32,448 _ZN5aiter53fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x512E,fmoe_bf16_pertokenInt8_g1u1_vs_multix_gelu_1tg_32x512.co,0,1,2,1,0,32,512 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x384E,fmoe_bf16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x384.co,0,1,0,1,1,32,384 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv index 9aebb8aec4..b1253732c8 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenInt8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter51fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x192.co,0,0,0,1,1,32,192 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co,0,0,0,1,0,32,384 _ZN5aiter51fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x448E,fmoe_bf16_pertokenInt8_g1u1_tkw1_gelu_1tg_ps_32x448.co,0,0,0,1,1,32,448 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenMXfp4_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenMXfp4_g1u1_gelu.csv index 2f79a8caff..51f6f9a0ff 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenMXfp4_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_bf16_pertokenMXfp4_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256E,fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256.co,0,0,0,2,0,32,256 _ZN5aiter47fmoe_bf16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256E,fmoe_bf16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256.co,0,1,0,2,0,32,256 _ZN5aiter49fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512E,fmoe_bf16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512.co,0,0,0,1,0,32,512 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv index 7c676e0554..2c47cf39f4 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_blockscaleFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter49fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256E,fmoe_fp16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co,0,0,0,1,0,32,256 _ZN5aiter47fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv index 3b1fa5fa98..8f6ce6cf27 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_noquant_g1u0_gelu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter54fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_gelu_1tg_32x512E,fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_gelu_1tg_32x512.co,2,0,0,1,0,32,512 _ZN5aiter57fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512E,fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_gelu_1tg_ps_32x512.co,2,0,0,1,1,32,512 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv index 66f723ba1c..ef476f0f6c 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter55fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_gelu_1tg_ps_32x448.co,0,1,2,1,1,32,448 _ZN5aiter49fmoe_fp16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x512E,fmoe_fp16_pertokenFp8_g1u1_vs_smf_gelu_1tg_32x512.co,0,1,1,1,0,32,512 _ZN5aiter48fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x320E,fmoe_fp16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x320.co,0,1,0,1,1,32,320 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv index ad4c997d9b..3b4b4cbf55 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenFp8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x192E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x192.co,0,1,0,1,1,32,192 _ZN5aiter50fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x192E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_32x192.co,0,1,0,1,0,32,192 _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_gelu_1tg_ps_32x384.co,0,1,0,1,1,32,384 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv index eb625692fa..88111615cd 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u0_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter46fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x448E,fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x448.co,0,1,0,1,0,32,448 _ZN5aiter49fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x128E,fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_ps_32x128.co,0,1,0,1,1,32,128 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x256E,fmoe_fp16_pertokenInt8_g1u0_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv index 2dc7dba3e8..3f487919e6 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter56fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x320E,fmoe_fp16_pertokenInt8_g1u1_vs_multix_gelu_1tg_ps_32x320.co,0,1,2,1,1,32,320 _ZN5aiter49fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x256E,fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x256E,fmoe_fp16_pertokenInt8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv index f6663cb7be..c1a31053ec 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenInt8_g1u1_gelu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384E,fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x384.co,0,0,0,1,0,32,384 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128E,fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x128.co,0,0,0,1,0,32,128 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512E,fmoe_fp16_pertokenInt8_g1u1_tkw1_gelu_1tg_32x512.co,0,0,0,1,0,32,512 diff --git a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenMXfp4_g1u1_gelu.csv b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenMXfp4_g1u1_gelu.csv index 3b6b18c57e..e110676c8a 100644 --- a/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenMXfp4_g1u1_gelu.csv +++ b/hsa/gfx950/fmoe/gelu/fmoe_fp16_pertokenMXfp4_g1u1_gelu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512E,fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_1tg_32x512.co,0,0,0,1,0,32,512 _ZN5aiter49fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256E,fmoe_fp16_pertokenMXfp4_g1u1_novs_gelu_2tg_32x256.co,0,0,0,2,0,32,256 _ZN5aiter47fmoe_fp16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256E,fmoe_fp16_pertokenMXfp4_g1u1_vs_gelu_2tg_32x256.co,0,1,0,2,0,32,256 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv index 243e0d75d3..5ffc611c85 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co,0,0,0,1,1,32,256 _ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co,0,0,0,1,0,32,256 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv index bb41af16f0..acfbbeec58 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_noquant_g1u0_silu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter57fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co,2,0,0,1,1,32,512 _ZN5aiter54fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_32x512E,fmoe_bf16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_32x512.co,2,0,0,1,0,32,512 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv index 6543326267..a13849fa45 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192.co,0,1,0,1,0,32,192 _ZN5aiter55fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320E,fmoe_bf16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320.co,0,1,2,1,1,32,320 _ZN5aiter49fmoe_bf16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x512E,fmoe_bf16_pertokenFp8_g1u1_vs_smf_silu_1tg_32x512.co,0,1,1,1,0,32,512 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv index 6059a431bf..cf40973078 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x256E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter53fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co,0,1,0,1,1,32,384 _ZN5aiter50fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512E,fmoe_bf16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x512.co,0,1,0,1,0,32,512 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv index b7809dfd8d..feabfd8834 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u0_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x320E,fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x320.co,0,1,0,1,1,32,320 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512E,fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co,0,1,0,1,1,32,512 _ZN5aiter46fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x448E,fmoe_bf16_pertokenInt8_g1u0_vs_silu_1tg_32x448.co,0,1,0,1,0,32,448 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv index 7ddedeccf3..716484a4ca 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x384E,fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x384.co,0,1,0,1,1,32,384 _ZN5aiter53fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x256E,fmoe_bf16_pertokenInt8_g1u1_vs_multix_silu_1tg_32x256.co,0,1,2,1,0,32,256 _ZN5aiter49fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x320E,fmoe_bf16_pertokenInt8_g1u1_vs_silu_1tg_ps_32x320.co,0,1,0,1,1,32,320 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv index 8e1f534655..f9033ee2ca 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenInt8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x128.co,0,0,0,1,0,32,128 _ZN5aiter51fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x448E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x448.co,0,0,0,1,1,32,448 _ZN5aiter48fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448E,fmoe_bf16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co,0,0,0,1,0,32,448 diff --git a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenMXfp4_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenMXfp4_g1u1_silu.csv index 1a3bc0a25f..7d973478dd 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenMXfp4_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_bf16_pertokenMXfp4_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter47fmoe_bf16_pertokenMXfp4_g1u1_vs_silu_2tg_32x256E,fmoe_bf16_pertokenMXfp4_g1u1_vs_silu_2tg_32x256.co,0,1,0,2,0,32,256 _ZN5aiter49fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_2tg_32x256E,fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_2tg_32x256.co,0,0,0,2,0,32,256 _ZN5aiter49fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512E,fmoe_bf16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512.co,0,0,0,1,0,32,512 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv index 7dcb29ff40..9d856e07be 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_blockscaleFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 _ZN5aiter47fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,fmoe_fp16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co,0,1,0,1,0,32,256 _ZN5aiter52fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,fmoe_fp16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co,0,0,0,1,1,32,256 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv index db122e5031..fb3a0b2406 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_noquant_g1u0_silu.csv @@ -1,3 +1,3 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter54fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_silu_1tg_32x512E,fmoe_fp16_noquantFp16_g1u0_vs_atm_inlv_silu_1tg_32x512.co,2,0,0,1,0,32,512 _ZN5aiter57fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512E,fmoe_fp16_noquantBf16_g1u0_vs_atm_inlv_silu_1tg_ps_32x512.co,2,0,0,1,1,32,512 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv index 43b7df6283..0edd8eb4bf 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter48fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x448E,fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x448.co,0,1,0,1,1,32,448 _ZN5aiter55fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320E,fmoe_fp16_pertokenFp8_g1u1_vs_multix_silu_1tg_ps_32x320.co,0,1,2,1,1,32,320 _ZN5aiter45fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x448E,fmoe_fp16_pertokenFp8_g1u1_vs_silu_1tg_32x448.co,0,1,0,1,0,32,448 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv index 74e20693c3..f24465d16d 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenFp8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter50fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_32x192.co,0,1,0,1,0,32,192 _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x384.co,0,1,0,1,1,32,384 _ZN5aiter53fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192E,fmoe_fp16_pertokenFp8_g1u1_vs_tkw1_silu_1tg_ps_32x192.co,0,1,0,1,1,32,192 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv index d5b12b2b31..53b28009b7 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u0_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter49fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512E,fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x512.co,0,1,0,1,1,32,512 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x128E,fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter49fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x256E,fmoe_fp16_pertokenInt8_g1u0_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv index e444c1fb90..dccfcbb4c6 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter56fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x128E,fmoe_fp16_pertokenInt8_g1u1_vs_multix_silu_1tg_ps_32x128.co,0,1,2,1,1,32,128 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x512E,fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x512.co,0,1,0,1,0,32,512 _ZN5aiter46fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x448E,fmoe_fp16_pertokenInt8_g1u1_vs_silu_1tg_32x448.co,0,1,0,1,0,32,448 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv index c29be06cde..f987857fe5 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenInt8_g1u1_silu_tkw1.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter51fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x192E,fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_ps_32x192.co,0,0,0,1,1,32,192 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448E,fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x448.co,0,0,0,1,0,32,448 _ZN5aiter48fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384E,fmoe_fp16_pertokenInt8_g1u1_tkw1_silu_1tg_32x384.co,0,0,0,1,0,32,384 diff --git a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenMXfp4_g1u1_silu.csv b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenMXfp4_g1u1_silu.csv index 901e9d6526..d93dd5ccdc 100644 --- a/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenMXfp4_g1u1_silu.csv +++ b/hsa/gfx950/fmoe/silu/fmoe_fp16_pertokenMXfp4_g1u1_silu.csv @@ -1,4 +1,4 @@ -knl_name,knl_file,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n _ZN5aiter52fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_1tg_ps_32x512E,fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_1tg_ps_32x512.co,0,0,0,1,1,32,512 _ZN5aiter49fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512E,fmoe_fp16_pertokenMXfp4_g1u1_novs_silu_1tg_32x512.co,0,0,0,1,0,32,512 _ZN5aiter50fmoe_fp16_pertokenMXfp4_g1u1_vs_silu_2tg_ps_32x256E,fmoe_fp16_pertokenMXfp4_g1u1_vs_silu_2tg_ps_32x256.co,0,1,0,2,1,32,256 diff --git a/hsa/gfx950/fmoe_2stages/codegen.py b/hsa/gfx950/fmoe_2stages/codegen.py deleted file mode 100644 index 0693c455bf..0000000000 --- a/hsa/gfx950/fmoe_2stages/codegen.py +++ /dev/null @@ -1,62 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(M, N, path, name, co) \\ - { \\ - name, { name, path co, M, N } \\ - } - -struct FMoe2StageConfig -{ - std::string name; - std::string co_name; - int tile_M; - int tile_N; -}; - -using CFG = std::unordered_map; - -""" -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for CK fmha kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tileM:>4}, {tileN:>4}, "fmoe_2stages/", "{name}", "{co}"),' - for tileM, tileN, name, co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_moe_2stage_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx950/i8gemm/codegen.py b/hsa/gfx950/i8gemm/codegen.py deleted file mode 100644 index 215ae2ef74..0000000000 --- a/hsa/gfx950/i8gemm/codegen.py +++ /dev/null @@ -1,76 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) -hsa_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(tile_m, tile_n, splitK, bpreshuffle, path, name, co) \\ - { \\ - name, { name, path co, tile_m, tile_n, splitK, bpreshuffle} \\ - } - -struct I8GemmConfig -{ - std::string name; - std::string co_name; - int tile_M; - int tile_N; - int splitK; - int bpreshuffle; -}; - -using CFG = std::unordered_map; - -""" -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="generate configuration API for i8gemm asm kernels", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG({tileM:>4}, {tileN:>4}, {splitK:>4}, {bpreshuffle:>4}, "i8gemm/", "{name}", "{co}"),' - for tileM, tileN, splitK, bpreshuffle, name, co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - ## remove this when adding a kernel on gfx950 - if not cfgs: - for el in glob.glob(f"{hsa_dir}/gfx942/{os.path.basename(this_dir)}/*.csv"): - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n" - cfgname = "i8gemm_bf16_perTokenI8" - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_i8gemm_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx950/pa/codegen.py b/hsa/gfx950/pa/codegen.py deleted file mode 100755 index 7c85dcd0a3..0000000000 --- a/hsa/gfx950/pa/codegen.py +++ /dev/null @@ -1,68 +0,0 @@ -# SPDX-License-Identifier: MIT -# Copyright (C) 2018-2025, Advanced Micro Devices, Inc. All rights reserved. - -import os -import argparse -import glob -import pandas as pd - -this_dir = os.path.dirname(os.path.abspath(__file__)) - -template = """// SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. -#pragma once -#include - -#define ADD_CFG(q_type, kv_type, gqa, mtp, msk, hp, block_size, path, name, co) \\ - { \\ - name, { name, path co, q_type, kv_type, gqa, mtp, msk, hp, block_size } \\ - } - -struct AsmPaConfig -{ - std::string name; - std::string co_name; - std::string q_type; - std::string kv_type; - int gqa; - int mtp; - int msk; - int hp; - int block_size; -}; - -using CFG = std::unordered_map; - -""" - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - prog="generate", - description="gen API for asm PA kernel", - ) - parser.add_argument( - "-o", - "--output_dir", - default="aiter/jit/build", - required=False, - help="write all the blobs into a directory", - ) - args = parser.parse_args() - - cfgs = [] - for el in glob.glob(f"{this_dir}/*.csv"): - df = pd.read_csv(el) - cfg = [ - f'ADD_CFG("{qType}", "{kvType}",{Gqa:>4}, {Mtp:>2}, {Msk:>2}, {Hp:>2}, {blkSz:>2}, "pa/", "{Name}", "{Co}"),' - for qType, kvType, Gqa, Mtp, Msk, Hp, blkSz, Name, Co in df.values - ] - filename = os.path.basename(el) - cfgname = filename.split(".")[0] - cfg_txt = "\n ".join(cfg) + "\n" - - txt = f"""static CFG cfg_{cfgname} = {{ - {cfg_txt}}};""" - cfgs.append(txt) - txt_all = template + "\n".join(cfgs) - with open(f"{args.output_dir}/asm_pa_configs.hpp", "w") as f: - f.write(txt_all) diff --git a/hsa/gfx950/pa/pa_asm.csv b/hsa/gfx950/pa/pa_asm.csv index 005a5ca1ed..14a5526deb 100644 --- a/hsa/gfx950/pa/pa_asm.csv +++ b/hsa/gfx950/pa/pa_asm.csv @@ -1,4 +1,4 @@ -qType,kvType,Gqa,Mtp,Msk,Hp,blkSz,knl_name,knl_file +qType,kvType,Gqa,Mtp,Msk,Hp,blkSz,knl_name,co_name bf16,int8,16,0,0,0,16,_ZN5aiter33pa_bf16_pertokenInt8_gqa16_2tg_4wE,pa_bf16_pertokenInt8_gqa16_2tg_4w.co bf16,fp8,16,1,1,0,16,_ZN5aiter41pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1E,pa_bf16_pertokenFp8_gqa16_1tg_4w_mtp_msk1.co fp16,fp16,16,0,0,0,16,_ZN5aiter28pa_fp16_noquant_gqa16_1tg_4wE,pa_fp16_noquant_gqa16_1tg_4w.co diff --git a/hsa/readme.md b/hsa/readme.md new file mode 100644 index 0000000000..a9c05744b9 --- /dev/null +++ b/hsa/readme.md @@ -0,0 +1,6 @@ +# ASM Kernel Config Code Generator + +This script (codegen.py) merges per-architecture CSV kernel metadata into a single generated C++ header containing: + +# HOW +Your CSV file must include the columns "knl_name" and "col_name". diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index ed0d44a341..bf9cfdc41b 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -60,9 +60,11 @@ def run_gemm_b(x, weight, bias=None, otype=None, scaleA=None, scaleB=None): @perftest(num_iters=TEST_NUM_ITERS) def run_bf16gemm_asm( - x, weight, out_asm, otype=dtypes.fp32, bias=None, splitK=None, kernelName=None + x, weight, out_asm, bias=None, splitK=1, kernelName=None, bpreshuffle=0 ): - return aiter.gemm_a16w16_asm(x, weight, out_asm, bias, splitK, kernelName) + return aiter.gemm_a16w16_asm( + x, weight, out_asm, bias, splitK, kernelName, bpreshuffle + ) @perftest(num_iters=TEST_NUM_ITERS) @@ -141,18 +143,19 @@ def test_gemm(dtype, m, n, k, bias=False, otype=None, scaleA=None, scaleB=None): ### run bf16gemm_f32 asm if ( dtype == dtypes.bf16 - and otype == dtypes.fp32 + and (otype == dtypes.fp32 or otype == dtypes.bf16) and (k % 64 == 0) and (n % 64 == 0) # N % tileN == 0 # and (m in [64, 80, 128, 150, 192, 220, 256, 384, 448, 512]) # and (n == 256) # and (k == 5120 or k == 7168) and bias is None + and False ): - # wshuffle = shuffle_weight(weight, layout=(16, 16)) # out_asm = torch.empty((m + 191) // 192 * 192, n, dtype=otype) out_asm = torch.empty(m, n, dtype=otype, device=x.device) - (d, *_), avg_d = run_bf16gemm_asm(x, weight, out_asm, otype=dtypes.fp32) + wshuffle = shuffle_weight(weight, layout=(16, 16)) + (d, *_), avg_d = run_bf16gemm_asm(x, wshuffle, out_asm, bpreshuffle=True) msg = f"[perf] dim: {str(dim):<20} dtype: {dtype}, B avg: {avg_b:<8.2f} us, asm avg: {avg_d:<8.2f} us, uplift: {avg_b/avg_d-1:<5.1%}" err_asm = checkAllclose(b, d, msg=msg)